add ci for mae and convmae (PaddlePaddle#197)
GuoxiaWang authored Feb 8, 2023
1 parent 24f5c3b commit 2311442
Showing 12 changed files with 341 additions and 38 deletions.
12 changes: 8 additions & 4 deletions task/ssl/mae/engine_finetune.py
@@ -46,7 +46,6 @@ def train_one_epoch(model,
         'loss', misc.SmoothedValue(
             window_size=1, fmt='{value:.4f}'))
     header = 'Epoch: [{}]'.format(epoch)
-    print_freq = 20

     accum_iter = args.accum_iter

@@ -55,9 +54,14 @@
     if log_writer is not None:
         print('log_dir: {}'.format(log_writer.kwargs['log_dir']))

-    for data_iter_step, (
-            samples, targets
-    ) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+    for data_iter_step, (samples, targets) in enumerate(
+            metric_logger.log_every(data_loader, args.print_freq, header)):
+
+        if args.max_train_step is not None and data_iter_step >= args.max_train_step:
+            print(
+                f'step({data_iter_step}) >= max_train_step({args.max_train_step}), training stops early. This function is only used for debugging.'
+            )
+            exit(0)

         # we use a per iteration (instead of per epoch) lr scheduler
         if data_iter_step % accum_iter == 0:
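The added branch is the whole CI hook: once data_iter_step reaches args.max_train_step, the run logs a message and exits, so a smoke test finishes in minutes instead of epochs. A minimal standalone sketch of the pattern (toy loop; a SimpleNamespace stands in for the parsed args, not the PLSC sources):

from types import SimpleNamespace

args = SimpleNamespace(max_train_step=3)

for data_iter_step in range(10):  # stands in for enumerate(data_loader)
    if args.max_train_step is not None and data_iter_step >= args.max_train_step:
        print(f'step({data_iter_step}) >= max_train_step({args.max_train_step}), '
              'training stops early.')
        break  # the real loop calls exit(0) to end the whole process
    # forward/backward/optimizer step would run here
    print(f'training step {data_iter_step}')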
12 changes: 8 additions & 4 deletions task/ssl/mae/engine_pretrain.py
@@ -43,7 +43,6 @@ def train_one_epoch(model,
         'loss', misc.SmoothedValue(
             window_size=1, fmt='{value:.4f}'))
     header = 'Epoch: [{}]'.format(epoch)
-    print_freq = 20

     accum_iter = args.accum_iter

@@ -52,9 +51,14 @@
     if log_writer is not None:
         print('log_dir: {}'.format(log_writer.kwargs['log_dir']))

-    for data_iter_step, (
-            samples, _
-    ) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+    for data_iter_step, (samples, _) in enumerate(
+            metric_logger.log_every(data_loader, args.print_freq, header)):
+
+        if args.max_train_step is not None and data_iter_step >= args.max_train_step:
+            print(
+                f'step({data_iter_step}) >= max_train_step({args.max_train_step}), training stops early. This function is only used for debugging.'
+            )
+            exit(0)

         # we use a per iteration (instead of per epoch) lr scheduler
         if data_iter_step % accum_iter == 0:
14 changes: 4 additions & 10 deletions task/ssl/mae/main_finetune.py
@@ -260,18 +260,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser
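The same pair of flags replaces the unused distributed-training arguments in all three entry points (main_linprobe.py and main_pretrain.py below receive the identical change). Because --print_freq defaults to 20, the old hard-coded logging cadence is preserved unless a caller overrides it. A standalone sketch of just these two arguments:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--print_freq', default=20, type=int)
parser.add_argument(
    '--max_train_step',
    default=None,
    type=int,
    help='only used for debugging')

# regular training keeps the previous behavior via the defaults
print(parser.parse_args([]))
# CI smoke tests log every step and stop after a fixed step budget
print(parser.parse_args('--print_freq 1 --max_train_step 600'.split()))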
14 changes: 4 additions & 10 deletions task/ssl/mae/main_linprobe.py
@@ -157,18 +157,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser
14 changes: 4 additions & 10 deletions task/ssl/mae/main_pretrain.py
@@ -145,18 +145,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser
67 changes: 67 additions & 0 deletions tests/CI/case.sh
@@ -31,6 +31,12 @@ plsc_gpu_model_list=( \
     DeiT_base_patch16_224_in1k_1n8c_dp_fp16o2 \
     cait_s24_224_in1k_1n8c_dp_fp16o2 \
     swin_base_patch4_window7_224_fp16o2 \
+    mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1 \
+    mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1 \
+    mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1 \
 )

 ###### Face ######
@@ -169,6 +175,67 @@ function swin_base_patch4_window7_224_fp16o2() {
 }


+###### MAE ######
+function mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/1251" | cut -d " " -f15 `
+    check_diff 1.0064 ${loss} ${FUNCNAME}_loss
+}
+
+
+function mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "599/5004" | cut -d " " -f15 `
+    check_diff 6.7559 ${loss} ${FUNCNAME}_loss
+}
+
+
+function mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/312" | cut -d " " -f14 `
+    check_diff 6.6991 ${loss} ${FUNCNAME}_loss
+}
+
+
+###### ConvMAE ######
+function convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "99/1251" | cut -d " " -f16 `
+    check_diff 1.2954 ${loss} ${FUNCNAME}_loss
+}
+
+
+function convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "599/5004" | cut -d " " -f15 `
+    check_diff 6.7890 ${loss} ${FUNCNAME}_loss
+}
+
+
+function convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/1251" | cut -d " " -f15 `
+    check_diff 6.9417 ${loss} ${FUNCNAME}_loss
+}
+
 function check_result() {
     if [ $? -ne 0 ];then
         echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log
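Each CI case above runs one script, then pulls a pinned loss out of the trainer log: tail keeps the last lines of log/workerlog.0, grep finds the line for a fixed step (e.g. "199/1251"), and cut takes the whitespace-delimited field where the loss sits; check_diff (body not shown in this diff) then compares it against the frozen reference value. A rough Python equivalent of the extraction step, using a hypothetical log line (the real format comes from metric_logger.log_every):

def extract_loss(log_lines, step_marker, field_index):
    # mimic `grep <marker> | cut -d " " -f<N>`: cut fields are 1-indexed, and
    # consecutive spaces produce empty fields, which is why the cases above
    # use indices like 14-16 depending on each log's layout
    for line in log_lines:
        if step_marker in line:
            return line.split(' ')[field_index - 1]
    return None

line = 'Epoch: [0]  [ 199/1251]  loss: 1.0064 (1.0064)  lr: 0.000120'
print(extract_loss([line], '199/1251', 8))  # -> '1.0064'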
39 changes: 39 additions & 0 deletions tests/CI/ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,39 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=4
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=ConvMAE

# 4-node finetune setting
ACCUM_ITER=1
PRETRAIN_CHKPT='pretrained/convmae/convmae_convvit_base_pretrained_1599ep.pd'
IMAGENET_DIR=./dataset/ILSVRC2012/
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_finetune.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 600 \
--batch_size 32 \
--model convvit_base_patch16 \
--finetune ${PRETRAIN_CHKPT} \
--epochs 100 \
--blr 5e-4 --layer_decay 0.65 \
--weight_decay 0.05 --drop_path 0.1 --reprob 0.25 --mixup 0.8 --cutmix 1.0 \
--dist_eval --data_path ${IMAGENET_DIR}
41 changes: 41 additions & 0 deletions tests/CI/ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,41 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=4
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=ConvMAE

IMAGENET_DIR=./dataset/ILSVRC2012/

# ACCUM_ITER: 1 for a four-node run, 4 for a single node
ACCUM_ITER=1
PRETRAIN_CHKPT='pretrained/convmae/convmae_convvit_base_pretrained_1599ep.pd'
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_linprobe.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 200 \
--batch_size 128 \
--model convvit_base_patch16 \
--global_pool \
--finetune ${PRETRAIN_CHKPT} \
--epochs 90 \
--blr 0.1 \
--weight_decay 0.0 \
--dist_eval --data_path ${IMAGENET_DIR}
39 changes: 39 additions & 0 deletions tests/CI/ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,39 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=3
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=ConvMAE

# 3-node pretrain setting
ACCUM_ITER=1
IMAGENET_DIR=./dataset/ILSVRC2012/
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_pretrain.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 100 \
--batch_size 128 \
--model convmae_convvit_base_patch16 \
--norm_pix_loss \
--mask_ratio 0.75 \
--epochs 1600 \
--warmup_epochs 40 \
--blr 1.5e-4 --weight_decay 0.05 \
--data_path ${IMAGENET_DIR}
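A note on --blr in the pretrain script above: in the upstream MAE recipe the absolute learning rate is the base rate scaled by effective batch size over 256, so one script serves different node counts. A sketch of that rule (formula taken from the reference MAE code and assumed to carry over to this port):

def absolute_lr(blr, batch_size, accum_iter, num_gpus):
    # effective batch = per-GPU batch * accumulation steps * total GPUs
    eff_batch_size = batch_size * accum_iter * num_gpus
    return blr * eff_batch_size / 256

# the 3-node pretrain above: 24 GPUs, batch 128 -> eff batch 3072, lr 1.8e-3
print(absolute_lr(1.5e-4, 128, 1, 24))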
43 changes: 43 additions & 0 deletions tests/CI/ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,43 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=4
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=MAE

# single-node finetune settings:
# batch_size 32, ACCUM_ITER=4, effective batch size: 1024
# batch_size 128, ACCUM_ITER=1, effective batch size: 1024

# 4-node finetune setting
ACCUM_ITER=1
PRETRAIN_CHKPT='pretrained/mae/mae_pretrain_vit_base_1599ep.pd'
IMAGENET_DIR=./dataset/ILSVRC2012/
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_finetune.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 600 \
--batch_size 32 \
--model vit_base_patch16 \
--finetune ${PRETRAIN_CHKPT} \
--epochs 100 \
--blr 5e-4 --layer_decay 0.65 \
--weight_decay 0.05 --drop_path 0.1 --reprob 0.25 --mixup 0.8 --cutmix 1.0 \
--dist_eval --data_path ${IMAGENET_DIR}
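The effective-batch-size figures in the comment above are just the product of per-GPU batch size, gradient-accumulation steps, and GPU count; a quick check of the listed settings (assuming 8 GPUs per node):

# batch 32 with 4 accumulation steps, or batch 128 with 1 step,
# both reach the 1024 effective batch the recipe targets on one 8-GPU node
assert 32 * 4 * 8 == 1024
assert 128 * 1 * 8 == 1024
# the 4-node CI setting: batch 32, accum 1, 32 GPUs -> the same 1024
assert 32 * 1 * (8 * 4) == 1024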