add ci for mae and convmae (PaddlePaddle#197)
GuoxiaWang authored Feb 8, 2023
1 parent 24f5c3b commit 2311442
Showing 12 changed files with 341 additions and 38 deletions.
12 changes: 8 additions & 4 deletions task/ssl/mae/engine_finetune.py
@@ -46,7 +46,6 @@ def train_one_epoch(model,
         'loss', misc.SmoothedValue(
             window_size=1, fmt='{value:.4f}'))
     header = 'Epoch: [{}]'.format(epoch)
-    print_freq = 20

     accum_iter = args.accum_iter

@@ -55,9 +54,14 @@
     if log_writer is not None:
         print('log_dir: {}'.format(log_writer.kwargs['log_dir']))

-    for data_iter_step, (
-            samples, targets
-    ) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+    for data_iter_step, (samples, targets) in enumerate(
+            metric_logger.log_every(data_loader, args.print_freq, header)):
+
+        if args.max_train_step is not None and data_iter_step >= args.max_train_step:
+            print(
+                f'step({data_iter_step}) >= max_train_step({args.max_train_step}), training stops early. This function is only used for debugging.'
+            )
+            exit(0)

         # we use a per iteration (instead of per epoch) lr scheduler
         if data_iter_step % accum_iter == 0:
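The added branch is the whole CI hook: once data_iter_step reaches args.max_train_step, the run logs a message and exits, so a smoke test finishes in minutes instead of epochs. A minimal standalone sketch of the pattern (toy loop; a SimpleNamespace stands in for the parsed args, not the PLSC sources):

from types import SimpleNamespace

args = SimpleNamespace(max_train_step=3)

for data_iter_step in range(10):  # stands in for enumerate(data_loader)
    if args.max_train_step is not None and data_iter_step >= args.max_train_step:
        print(f'step({data_iter_step}) >= max_train_step({args.max_train_step}), '
              'training stops early.')
        break  # the real loop calls exit(0) to end the whole process
    # forward/backward/optimizer step would run here
    print(f'training step {data_iter_step}')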
12 changes: 8 additions & 4 deletions task/ssl/mae/engine_pretrain.py
@@ -43,7 +43,6 @@ def train_one_epoch(model,
         'loss', misc.SmoothedValue(
             window_size=1, fmt='{value:.4f}'))
     header = 'Epoch: [{}]'.format(epoch)
-    print_freq = 20

     accum_iter = args.accum_iter

@@ -52,9 +51,14 @@
     if log_writer is not None:
         print('log_dir: {}'.format(log_writer.kwargs['log_dir']))

-    for data_iter_step, (
-            samples, _
-    ) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+    for data_iter_step, (samples, _) in enumerate(
+            metric_logger.log_every(data_loader, args.print_freq, header)):
+
+        if args.max_train_step is not None and data_iter_step >= args.max_train_step:
+            print(
+                f'step({data_iter_step}) >= max_train_step({args.max_train_step}), training stops early. This function is only used for debugging.'
+            )
+            exit(0)

         # we use a per iteration (instead of per epoch) lr scheduler
         if data_iter_step % accum_iter == 0:
14 changes: 4 additions & 10 deletions task/ssl/mae/main_finetune.py
@@ -260,18 +260,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser
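The same pair of flags replaces the unused distributed-training arguments in all three entry points (main_linprobe.py and main_pretrain.py below receive the identical change). Because --print_freq defaults to 20, the old hard-coded logging cadence is preserved unless a caller overrides it. A standalone sketch of just these two arguments:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--print_freq', default=20, type=int)
parser.add_argument(
    '--max_train_step',
    default=None,
    type=int,
    help='only used for debugging')

# regular training keeps the previous behavior via the defaults
print(parser.parse_args([]))
# CI smoke tests log every step and stop after a fixed step budget
print(parser.parse_args('--print_freq 1 --max_train_step 600'.split()))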
14 changes: 4 additions & 10 deletions task/ssl/mae/main_linprobe.py
@@ -157,18 +157,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser
14 changes: 4 additions & 10 deletions task/ssl/mae/main_pretrain.py
@@ -145,18 +145,12 @@ def get_args_parser():
     parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
     parser.set_defaults(pin_mem=True)

-    # distributed training parameters
+    parser.add_argument('--print_freq', default=20, type=int)
     parser.add_argument(
-        '--world_size',
-        default=1,
+        '--max_train_step',
+        default=None,
         type=int,
-        help='number of distributed processes')
-    parser.add_argument('--local_rank', default=-1, type=int)
-    parser.add_argument('--dist_on_itp', action='store_true')
-    parser.add_argument(
-        '--dist_url',
-        default='env://',
-        help='url used to set up distributed training')
+        help='only used for debugging')

     return parser
67 changes: 67 additions & 0 deletions tests/CI/case.sh
@@ -31,6 +31,12 @@ plsc_gpu_model_list=( \
     DeiT_base_patch16_224_in1k_1n8c_dp_fp16o2 \
     cait_s24_224_in1k_1n8c_dp_fp16o2 \
     swin_base_patch4_window7_224_fp16o2 \
+    mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1 \
+    mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1 \
+    mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1 \
+    convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1 \
 )

 ###### Face ######
@@ -169,6 +175,67 @@ function swin_base_patch4_window7_224_fp16o2() {
 }


+###### MAE ######
+function mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/1251" | cut -d " " -f15 `
+    check_diff 1.0064 ${loss} ${FUNCNAME}_loss
+}
+
+
+function mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "599/5004" | cut -d " " -f15 `
+    check_diff 6.7559 ${loss} ${FUNCNAME}_loss
+}
+
+
+function mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/mae/mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/312" | cut -d " " -f14 `
+    check_diff 6.6991 ${loss} ${FUNCNAME}_loss
+}
+
+
+###### ConvMAE ######
+function convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "99/1251" | cut -d " " -f16 `
+    check_diff 1.2954 ${loss} ${FUNCNAME}_loss
+}
+
+
+function convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "599/5004" | cut -d " " -f15 `
+    check_diff 6.7890 ${loss} ${FUNCNAME}_loss
+}
+
+
+function convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1() {
+    cd ${plsc_path}
+    rm -rf log
+    bash ./ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
+    check_result $FUNCNAME
+    loss=`tail log/workerlog.0 | grep "199/1251" | cut -d " " -f15 `
+    check_diff 6.9417 ${loss} ${FUNCNAME}_loss
+}
+
 function check_result() {
     if [ $? -ne 0 ];then
         echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log
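Each CI case above runs one script, then pulls a pinned loss out of the trainer log: tail keeps the last lines of log/workerlog.0, grep finds the line for a fixed step (e.g. "199/1251"), and cut takes the whitespace-delimited field where the loss sits; check_diff (body not shown in this diff) then compares it against the frozen reference value. A rough Python equivalent of the extraction step, using a hypothetical log line (the real format comes from metric_logger.log_every):

def extract_loss(log_lines, step_marker, field_index):
    # mimic `grep <marker> | cut -d " " -f<N>`: cut fields are 1-indexed, and
    # consecutive spaces produce empty fields, which is why the cases above
    # use indices like 14-16 depending on each log's layout
    for line in log_lines:
        if step_marker in line:
            return line.split(' ')[field_index - 1]
    return None

line = 'Epoch: [0]  [ 199/1251]  loss: 1.0064 (1.0064)  lr: 0.000120'
print(extract_loss([line], '199/1251', 8))  # -> '1.0064'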
39 changes: 39 additions & 0 deletions tests/CI/ssl/convmae/convmae_convvit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,39 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=4
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=ConvMAE

# 4-node finetune setting
ACCUM_ITER=1
PRETRAIN_CHKPT='pretrained/convmae/convmae_convvit_base_pretrained_1599ep.pd'
IMAGENET_DIR=./dataset/ILSVRC2012/
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_finetune.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 600 \
--batch_size 32 \
--model convvit_base_patch16 \
--finetune ${PRETRAIN_CHKPT} \
--epochs 100 \
--blr 5e-4 --layer_decay 0.65 \
--weight_decay 0.05 --drop_path 0.1 --reprob 0.25 --mixup 0.8 --cutmix 1.0 \
--dist_eval --data_path ${IMAGENET_DIR}
41 changes: 41 additions & 0 deletions tests/CI/ssl/convmae/convmae_convvit_base_patch16_lp_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,41 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=4
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=ConvMAE

IMAGENET_DIR=./dataset/ILSVRC2012/

# ACCUM_ITER: 1 for a four-node run, 4 for a single node
ACCUM_ITER=1
PRETRAIN_CHKPT='pretrained/convmae/convmae_convvit_base_pretrained_1599ep.pd'
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_linprobe.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 200 \
--batch_size 128 \
--model convvit_base_patch16 \
--global_pool \
--finetune ${PRETRAIN_CHKPT} \
--epochs 90 \
--blr 0.1 \
--weight_decay 0.0 \
--dist_eval --data_path ${IMAGENET_DIR}
39 changes: 39 additions & 0 deletions tests/CI/ssl/convmae/convmae_convvit_base_patch16_pt_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,39 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=3
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=ConvMAE

# 3-node pretrain setting
ACCUM_ITER=1
IMAGENET_DIR=./dataset/ILSVRC2012/
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_pretrain.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 100 \
--batch_size 128 \
--model convmae_convvit_base_patch16 \
--norm_pix_loss \
--mask_ratio 0.75 \
--epochs 1600 \
--warmup_epochs 40 \
--blr 1.5e-4 --weight_decay 0.05 \
--data_path ${IMAGENET_DIR}
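A note on --blr in the pretrain script above: in the upstream MAE recipe the absolute learning rate is the base rate scaled by effective batch size over 256, so one script serves different node counts. A sketch of that rule (formula taken from the reference MAE code and assumed to carry over to this port):

def absolute_lr(blr, batch_size, accum_iter, num_gpus):
    # effective batch = per-GPU batch * accumulation steps * total GPUs
    eff_batch_size = batch_size * accum_iter * num_gpus
    return blr * eff_batch_size / 256

# the 3-node pretrain above: 24 GPUs, batch 128 -> eff batch 3072, lr 1.8e-3
print(absolute_lr(1.5e-4, 128, 1, 24))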
43 changes: 43 additions & 0 deletions tests/CI/ssl/mae/mae_vit_base_patch16_ft_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,43 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#unset PADDLE_TRAINER_ENDPOINTS
#export PADDLE_NNODES=4
#export PADDLE_MASTER="10.67.228.16:12538"
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export PADDLE_JOB_ID=MAE

# single-node finetune settings:
# batch_size 32, ACCUM_ITER=4, effective batch size: 1024
# batch_size 128, ACCUM_ITER=1, effective batch size: 1024

# 4-node finetune setting
ACCUM_ITER=1
PRETRAIN_CHKPT='pretrained/mae/mae_pretrain_vit_base_1599ep.pd'
IMAGENET_DIR=./dataset/ILSVRC2012/
python -m paddle.distributed.launch \
--nnodes=$PADDLE_NNODES \
--master=$PADDLE_MASTER \
--devices=$CUDA_VISIBLE_DEVICES \
../../task/ssl/mae/main_finetune.py \
--accum_iter $ACCUM_ITER \
--print_freq 1 \
--max_train_step 600 \
--batch_size 32 \
--model vit_base_patch16 \
--finetune ${PRETRAIN_CHKPT} \
--epochs 100 \
--blr 5e-4 --layer_decay 0.65 \
--weight_decay 0.05 --drop_path 0.1 --reprob 0.25 --mixup 0.8 --cutmix 1.0 \
--dist_eval --data_path ${IMAGENET_DIR}
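The effective-batch-size figures in the comment above are just the product of per-GPU batch size, gradient-accumulation steps, and GPU count; a quick check of the listed settings (assuming 8 GPUs per node):

# batch 32 with 4 accumulation steps, or batch 128 with 1 step,
# both reach the 1024 effective batch the recipe targets on one 8-GPU node
assert 32 * 4 * 8 == 1024
assert 128 * 1 * 8 == 1024
# the 4-node CI setting: batch 32, accum 1, 32 GPUs -> the same 1024
assert 32 * 1 * (8 * 4) == 1024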