diff --git a/task/ssl/mae/engine_finetune.py b/task/ssl/mae/engine_finetune.py
index e631a571fc8f5..fcdec8624923b 100644
--- a/task/ssl/mae/engine_finetune.py
+++ b/task/ssl/mae/engine_finetune.py
@@ -46,7 +46,6 @@ def train_one_epoch(model,
         'loss', misc.SmoothedValue(
             window_size=1, fmt='{value:.4f}'))
     header = 'Epoch: [{}]'.format(epoch)
-    print_freq = 20

     accum_iter = args.accum_iter

@@ -55,9 +54,14 @@ def train_one_epoch(model,
     if log_writer is not None:
         print('log_dir: {}'.format(log_writer.kwargs['log_dir']))

-    for data_iter_step, (
-            samples, targets
-    ) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+    for data_iter_step, (samples, targets) in enumerate(
+            metric_logger.log_every(data_loader, args.print_freq, header)):
+
+        if args.max_train_step is not None and data_iter_step >= args.max_train_step:
+            print(
+                f'step({data_iter_step}) >= max_train_step({args.max_train_step}), training stops early. This function is only used for debugging.'
+            )
+            exit(0)

         # we use a per iteration (instead of per epoch) lr scheduler
         if data_iter_step % accum_iter == 0:
diff --git a/task/ssl/mae/engine_pretrain.py b/task/ssl/mae/engine_pretrain.py
index 6613c7464f8ce..d6e62f91c9826 100644
--- a/task/ssl/mae/engine_pretrain.py
+++ b/task/ssl/mae/engine_pretrain.py
@@ -43,7 +43,6 @@ def train_one_epoch(model,
         'loss', misc.SmoothedValue(
             window_size=1, fmt='{value:.4f}'))
     header = 'Epoch: [{}]'.format(epoch)
-    print_freq = 20

     accum_iter = args.accum_iter

@@ -52,9 +51,14 @@ def train_one_epoch(model,
     if log_writer is not None:
         print('log_dir: {}'.format(log_writer.kwargs['log_dir']))

-    for data_iter_step, (
-            samples, _
-    ) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+    for data_iter_step, (samples, _) in enumerate(
+            metric_logger.log_every(data_loader, args.print_freq, header)):
+
+        if args.max_train_step is not None and data_iter_step >= args.max_train_step:
+            print(
+                f'step({data_iter_step}) >= max_train_step({args.max_train_step}), training stops early. This function is only used for debugging.'
+            )
+            exit(0)

         # we use a per iteration (instead of per epoch) lr scheduler
         if data_iter_step % accum_iter == 0:
diff --git a/task/ssl/mae/main_finetune.py b/task/ssl/mae/main_finetune.py
index c2b9f9f57e517..fa4a6d1b336e4 100644
--- a/task/ssl/mae/main_finetune.py
+++ b/task/ssl/mae/main_finetune.py
@@ -260,18 +260,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser

diff --git a/task/ssl/mae/main_linprobe.py b/task/ssl/mae/main_linprobe.py
index fed138cd69657..ef15245d8f4db 100644
--- a/task/ssl/mae/main_linprobe.py
+++ b/task/ssl/mae/main_linprobe.py
@@ -157,18 +157,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser

diff --git a/task/ssl/mae/main_pretrain.py b/task/ssl/mae/main_pretrain.py
index cf55b6136ed10..81eb1c93a1b3b 100644
--- a/task/ssl/mae/main_pretrain.py
+++ b/task/ssl/mae/main_pretrain.py
@@ -145,18 +145,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser

diff --git a/tests/CI/case.sh b/tests/CI/case.sh
index ff69c50ea53ad..375e901a714d2 100644
--- a/tests/CI/case.sh
+++ b/tests/CI/case.sh
@@ -31,6 +31,12 @@ plsc_gpu_model_list=( \
     DeiT_base_patch16_224_in1k_1n8c_dp_fp16o2 \
     cait_s24_224_in1k_1n8c_dp_fp16o2 \
     swin_base_patch4_window7_224_fp16o2 \
+    mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1 \
+    mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1 \
+    mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1 \
 )

 ###### Face ######
@@ -169,6 +175,67 @@ function swin_base_patch4_window7_224_fp16o2() {
 }


+###### MAE ######
+function mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/1251" | cut -d " " -f15 `
+    check_diff 1.0064 ${loss} ${FUNCNAME}_loss
+}
+
+
+function mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "599/5004" | cut -d " " -f15 `
+    check_diff 6.7559 ${loss} ${FUNCNAME}_loss
+}
+
+
+function mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/312" | cut -d " " -f14 `
+    check_diff 6.6991 ${loss} ${FUNCNAME}_loss
+}
+
+
+###### ConvMAE ######
+function convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "99/1251" | cut -d " " -f16 `
+    check_diff 1.2954 ${loss} ${FUNCNAME}_loss
+}
+
+
+function convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "599/5004" | cut -d " " -f15 `
+    check_diff 6.7890 ${loss} ${FUNCNAME}_loss
+}
+
+
+function convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/1251" | cut -d " " -f15 `
+    check_diff 6.9417 ${loss} ${FUNCNAME}_loss
+}
+
 function check_result() {
     if [ $? -ne 0 ];then
         echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log
diff --git a/tests/CI/ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 0000000000000..57b529813dd95
--- /dev/null
+++ b/tests/CI/ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,39 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#unset PADDLE_TRAINER_ENDPOINTS
+#export PADDLE_NNODES=4
+#export PADDLE_MASTER="10.67.228.16:12538"
+#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+#export PADDLE_JOB_ID=ConvMAE
+
+# 4 nodes finetune setting
+ACCUM_ITER=1
+PRETRAIN_CHKPT='pretrained/convmae/convmae_convvit_base_pretrained_1599ep.pd'
+IMAGENET_DIR=./dataset/ILSVRC2012/
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    ../../task/ssl/mae/main_finetune.py \
+    --accum_iter $ACCUM_ITER \
+    --print_freq 1 \
+    --max_train_step 600 \
+    --batch_size 32 \
+    --model convvit_base_patch16 \
+    --finetune ${PRETRAIN_CHKPT} \
+    --epochs 100 \
+    --blr 5e-4 --layer_decay 0.65 \
+    --weight_decay 0.05 --drop_path 0.1 --reprob 0.25 --mixup 0.8 --cutmix 1.0 \
+    --dist_eval --data_path ${IMAGENET_DIR}
diff --git a/tests/CI/ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 0000000000000..7348bc6396628
--- /dev/null
+++ b/tests/CI/ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,41 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#unset PADDLE_TRAINER_ENDPOINTS
+#export PADDLE_NNODES=4
+#export PADDLE_MASTER="10.67.228.16:12538"
+#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+#export PADDLE_JOB_ID=ConvMAE
+
+IMAGENET_DIR=./dataset/ILSVRC2012/
+
+# 1 for four node, 4 for single node
+ACCUM_ITER=1
+PRETRAIN_CHKPT='pretrained/convmae/convmae_convvit_base_pretrained_1599ep.pd'
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    ../../task/ssl/mae/main_linprobe.py \
+    --accum_iter $ACCUM_ITER \
+    --print_freq 1 \
+    --max_train_step 200 \
+    --batch_size 128 \
+    --model convvit_base_patch16 \
+    --global_pool \
+    --finetune ${PRETRAIN_CHKPT} \
+    --epochs 90 \
+    --blr 0.1 \
+    --weight_decay 0.0 \
+    --dist_eval --data_path ${IMAGENET_DIR}
diff --git a/tests/CI/ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 0000000000000..5babc2babc925
--- /dev/null
+++ b/tests/CI/ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,39 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#unset PADDLE_TRAINER_ENDPOINTS
+#export PADDLE_NNODES=3
+#export PADDLE_MASTER="10.67.228.16:12538"
+#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+#export PADDLE_JOB_ID=ConvMAE
+
+# 3 nodes for pretrain
+ACCUM_ITER=1
+IMAGENET_DIR=./dataset/ILSVRC2012/
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    ../../task/ssl/mae/main_pretrain.py \
+    --accum_iter $ACCUM_ITER \
+    --print_freq 1 \
+    --max_train_step 100 \
+    --batch_size 128 \
+    --model convmae_convvit_base_patch16 \
+    --norm_pix_loss \
+    --mask_ratio 0.75 \
+    --epochs 1600 \
+    --warmup_epochs 40 \
+    --blr 1.5e-4 --weight_decay 0.05 \
+    --data_path ${IMAGENET_DIR}
diff --git a/tests/CI/ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 0000000000000..44323506366dc
--- /dev/null
+++ b/tests/CI/ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,43 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#unset PADDLE_TRAINER_ENDPOINTS
+#export PADDLE_NNODES=4
+#export PADDLE_MASTER="10.67.228.16:12538"
+#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+#export PADDLE_JOB_ID=MAE
+
+# for single node finetune
+# batch_size 32, ACCUM_ITER=4, effective batch size: 1024
+# batch_size 128, ACCUM_ITER=1, effective batch size: 1024
+
+# 4 nodes finetune setting
+ACCUM_ITER=1
+PRETRAIN_CHKPT='pretrained/mae/mae_pretrain_vit_base_1599ep.pd'
+IMAGENET_DIR=./dataset/ILSVRC2012/
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    ../../task/ssl/mae/main_finetune.py \
+    --accum_iter $ACCUM_ITER \
+    --print_freq 1 \
+    --max_train_step 600 \
+    --batch_size 32 \
+    --model vit_base_patch16 \
+    --finetune ${PRETRAIN_CHKPT} \
+    --epochs 100 \
+    --blr 5e-4 --layer_decay 0.65 \
+    --weight_decay 0.05 --drop_path 0.1 --reprob 0.25 --mixup 0.8 --cutmix 1.0 \
+    --dist_eval --data_path ${IMAGENET_DIR}
diff --git a/tests/CI/ssl/mae/mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/mae/mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 0000000000000..0d74b8baf267e
--- /dev/null
+++ b/tests/CI/ssl/mae/mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,41 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#unset PADDLE_TRAINER_ENDPOINTS
+#export PADDLE_NNODES=1
+#export PADDLE_MASTER="10.67.228.16:12538"
+#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+#export PADDLE_JOB_ID=MAE
+
+IMAGENET_DIR=./dataset/ILSVRC2012/
+
+# 1 for four node, 4 for single node
+ACCUM_ITER=1
+PRETRAIN_CHKPT='pretrained/mae/mae_pretrain_vit_base_1599ep.pd'
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    ../../task/ssl/mae/main_linprobe.py \
+    --accum_iter $ACCUM_ITER \
+    --print_freq 1 \
+    --max_train_step 200 \
+    --batch_size 512 \
+    --model vit_base_patch16 \
+    --cls_token \
+    --finetune ${PRETRAIN_CHKPT} \
+    --epochs 90 \
+    --blr 0.1 \
+    --weight_decay 0.0 \
+    --dist_eval --data_path ${IMAGENET_DIR}
diff --git a/tests/CI/ssl/mae/mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/mae/mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 0000000000000..e5c0751be16d5
--- /dev/null
+++ b/tests/CI/ssl/mae/mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,43 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#unset PADDLE_TRAINER_ENDPOINTS
+#export PADDLE_NNODES=4
+#export PADDLE_MASTER="10.67.228.16:12538"
+#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+#export PADDLE_JOB_ID=MAE
+
+# If you use single node
+# batch_size 64, ACCUM_ITER=8, effective batch size: 4096
+# batch_size 256, ACCUM_ITER=2, effective batch size: 4096
+
+# 4 nodes for pretrain
+ACCUM_ITER=1
+IMAGENET_DIR=./dataset/ILSVRC2012/
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    ../../task/ssl/mae/main_pretrain.py \
+    --accum_iter $ACCUM_ITER \
+    --print_freq 1 \
+    --max_train_step 200 \
+    --batch_size 128 \
+    --model mae_vit_base_patch16 \
+    --norm_pix_loss \
+    --mask_ratio 0.75 \
+    --epochs 1600 \
+    --warmup_epochs 40 \
+    --blr 1.5e-4 --weight_decay 0.05 \
+    --data_path ${IMAGENET_DIR}
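Note for reviewers: the "effective batch size" figures in the script comments above follow from batch_size per GPU x GPUs per node x nodes x ACCUM_ITER, with 8 GPUs per node (CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7). For example, MAE finetune on a single node gives 32 x 8 x 1 x 4 = 1024 (or 128 x 8 x 1 x 1 = 1024), and the 4-node setting gives 32 x 8 x 4 x 1 = 1024; MAE pretrain on a single node gives 64 x 8 x 1 x 8 = 4096 (or 256 x 8 x 1 x 2 = 4096), matching the script's 4-node setting of 128 x 8 x 4 x 1 = 4096.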