diff --git a/README.md b/README.md index e4f9d012cd7ea..c064709410e92 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,33 @@ -# PLSC +# PLSC (Paddle Large Scale Classification) ## 1. Introduction -[PLSC](https://github.com/PaddlePaddle/PLSC) is an open source Paddle Large Scale Classification Tools, which supports 60 million classes on single node 8 NVIDIA V100 (32G). +[PLSC](https://github.com/PaddlePaddle/PLSC) is an open source repo for a collection of Paddle Large Scale Classification Tools, which supports 92 million classes on a single node with 8 NVIDIA V100 (32G) GPUs and has high training throughput. It implements [ArcFace](https://arxiv.org/abs/1801.07698), [CosFace](https://arxiv.org/abs/1801.09414), [PartialFC](https://arxiv.org/abs/2010.05222), SparseMomentum, DataParallel + ModelParallel distributed training, FP16 training. -## 2. Environment Preparation +## 2. Top News -### 2.1 Install Paddle from Source Code +**Update (2022-01-11):** Supported the NHWC data format for FP16, improving throughput by 10% and reducing GPU memory usage by 30%. It supported 92 million classes on single node 8 NVIDIA V100 (32G) and has high training throughput. Supported best checkpoint save. And we released 18 pretrained models and PLSC v2.2. + +**Update (2021-12-11):** Released [Zhihu Technical Article](https://zhuanlan.zhihu.com/p/443091282) and [Bilibili Open Class](https://www.bilibili.com/video/BV1VP4y1G73X) + +**Update (2021-10-10):** Added FP16 training, improved throughput and optimized GPU memory. It supported 60 million classes on single node 8 NVIDIA V100 (32G) and has high training throughput. + +**Update (2021-09-10):** This repository supported both ``static`` mode and ``dynamic`` mode to use paddlepaddle v2.2, which supported 48 million classes on single node 8 NVIDIA V100 (32G). 
It added PartialFC, SparseMomentum, and [ArcFace](https://arxiv.org/abs/1801.07698), [CosFace](https://arxiv.org/abs/1801.09414), [PartialFC](https://arxiv.org/abs/2010.05222) (we refer to MarginLoss). Backbone includes IResNet and MobileNet. + + +## 3. Environment Preparation + +### 3.1 Install Paddle from PyPI + +```shell +# python required 3.x or later +# paddlepaddle required 2.2.2 or later +pip install paddlepaddle-gpu==2.2.2 +``` + +### 3.2 Install Paddle from Source Code + +For more installation information, refer to [PaddlePaddle](https://www.paddlepaddle.org.cn/) ```shell @@ -14,25 +35,21 @@ git clone https://github.com/PaddlePaddle/Paddle.git cd /path/to/Paddle/ +# [optional] checkout release/2.2 branch +git checkout -b release/2.2 origin/release/2.2 + mkdir build && cd build -cmake .. -DWITH_TESTING=ON -DWITH_GPU=ON -DWITH_GOLANG=OFF -DWITH_STYLE_CHECK=ON -DCMAKE_INSTALL_PREFIX=$PWD/output -DWITH_DISTRIBUTE=ON -DCMAKE_BUILD_TYPE=Release -DPY_VERSION=3.7 -DCUDA_ARCH_NAME=All -DPADDLE_VERSION=2.2.0 +cmake .. -DWITH_TESTING=ON -DWITH_GPU=ON -DWITH_GOLANG=OFF -DWITH_STYLE_CHECK=ON -DCMAKE_INSTALL_PREFIX=$PWD/output -DWITH_DISTRIBUTE=ON -DCMAKE_BUILD_TYPE=Release -DPY_VERSION=3.7 -DCUDA_ARCH_NAME=All -DPADDLE_VERSION=2.2.2 make -j20 && make install -j20 -pip install output/opt/paddle/share/wheels/paddlepaddle_gpu-2.2.0-cp37-cp37m-linux_x86_64.whl +pip install output/opt/paddle/share/wheels/paddlepaddle_gpu-2.2.2-cp37-cp37m-linux_x86_64.whl ``` -### 2.2 Install Paddle from PyPI - -```shell -# python required 3.x or later -# paddlepaddle required 2.2.0rc0 or later -pip install paddlepaddle-gpu==2.2.0rc0 -``` -### 2.3 Download PLSC +### 3.3 Download PLSC ```shell git clone https://github.com/PaddlePaddle/PLSC.git @@ -41,16 +58,16 @@ cd /path/to/PLSC/ ``` -## 3. Data Preparation +## 4. 
Data Preparation -### 3.1 Download Dataset +### 4.1 Download Dataset Download the dataset from [insightface datasets](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_). * MS1M_v2: MS1M-ArcFace * MS1M_v3: MS1M-RetinaFace -### 3.2 Extract MXNet Dataset to Images +### 4.2 Extract MXNet Dataset to Images ```shell python tools/mx_recordio_2_images.py --root_dir ms1m-retinaface-t1/ --output_dir MS1M_v3/ @@ -82,23 +99,10 @@ images/00000001.jpg 0 If you want to use customed dataset, you can arrange your data according to the above format. -### 3.3 Transform Between Original Image Files and Bin Files - -If you want to convert original image files to `bin` files used directly for training process, you can use the following command to finish the conversion. - -```shell -python tools/convert_image_bin.py --image_path="your/input/image/path" --bin_path="your/output/bin/path" --mode="image2bin" -``` - -If you want to convert `bin` files to original image files, you can use the following command to finish the conversion. +## 5. How to Training -```shell -python tools/convert_image_bin.py --image_path="your/input/bin/path" --bin_path="your/output/image/path" --mode="bin2image" -``` - -## 4. How to Training -### 4.1 Single Node, 8 GPUs: +### 5.1 Single Node, 8 GPUs: #### Static Mode @@ -112,11 +116,26 @@ sh scripts/train_static.sh sh scripts/train_dynamic.sh ``` +### 5.2 Single Node, 1 GPU: -During training, you can view loss changes in real time through `VisualDL`, For more information, please refer to [VisualDL](https://github.com/PaddlePaddle/VisualDL/). +Modify the ``CUDA_VISIBLE_DEVICES`` environment variable. +``` bash +TRAINER_IP_LIST=127.0.0.1 +CUDA_VISIBLE_DEVICES=3 +``` + +### 5.3 Multi Node, Multi GPUs: -## 5. Model Evaluation +Modify the ``TRAINER_IP_LIST`` and ``CUDA_VISIBLE_DEVICES`` environment variable and then run the training shell script on each node. 
+ +``` bash +TRAINER_IP_LIST=10.11.12.1,10.11.12.2 +CUDA_VISIBLE_DEVICES=0,1,2,3 +``` + + +## 6. Model Evaluation The model evaluation process can be started as follows. @@ -132,7 +151,7 @@ sh scripts/validation_static.sh sh scripts/validation_dynamic.sh ``` -## 6. Export Model +## 7. Export Model PaddlePaddle supports inference using prediction engines. Firstly, you should export inference model. #### Static Mode @@ -149,7 +168,7 @@ sh scripts/export_dynamic.sh We also support export to onnx model, you only need to set `--export_type onnx`. -## 7. Model Inference +## 8. Model Inference The model inference process supports paddle save inference model and onnx model. @@ -157,40 +176,64 @@ The model inference process supports paddle save inference model and onnx model. sh scripts/inference.sh ``` -## 8. Model Performance +## 9. Model Performance -### 8.1 Accuracy on Verification Datasets +### 9.1 Accuracy on Verification Datasets **Configuration:** * GPU: 8 NVIDIA Tesla V100 32G - * Precison: FP16 * BatchSize: 128/1024 -| Mode | Datasets | backbone | Ratio | agedb30 | cfp_fp | lfw | log | last checkpoint | -| ------- | :------: | :------- | ----- | :------ | :----- | :--- | :--- | :--- | -| Static | MS1MV3 | r50 | 0.1 | 0.98317 | 0.98943| 0.99850 | [log](experiments/logs/static/ms1mv3_r50_static_128_fp16_0.1/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz) | -| Static | MS1MV3 | r50 | 1.0 | 0.98283 | 0.98843| 0.99850 | [log](experiments/logs/static/ms1mv3_r50_static_128_fp16_1.0/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_1.0_epoch_24.tgz) | -| Dynamic | MS1MV3 | r50 | 0.1 | 0.98367 | 0.98971| 0.99850 | [log](experiments/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_0.1/training.log) | [checkpoint](https://plsc.bj.bcebos.com/pretrained_model/ms1mv3_r50_dynamic_128_fp16_0.1_eopch_24.tgz) | 
-| Dynamic | MS1MV3 | r50 | 1.0 | 0.98333 | 0.99043| 0.99850 | [log](experiments/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_1.0/training.log) | [checkpoint](https://plsc.bj.bcebos.com/pretrained_model/ms1mv3_r50_dynamic_128_fp16_1.0_eopch_24.tgz) | +| Mode | Datasets | Backbone | Precision | DataFormat | Ratio | agedb30 | cfp_fp | lfw | checkpoint&log | +| ------- | :------: | :------------ | --------- | ---------- | ----- | ------- | ------- | :------ | :----------------------------------------------------------- | +| Static | MS1MV3 | Res50 | FP16 | NHWC | 0.1 | 0.98200 | 0.98943 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_0.1_NHWC_FP16_v2.2.tgz) | +| Static | MS1MV3 | Res50 | FP32 | NCHW | 0.1 | 0.98267 | 0.98986 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_0.1_NCHW_FP32_v2.2.tgz) | +| Static | MS1MV3 | Res50 | FP16 | NHWC | 1.0 | 0.98300 | 0.98929 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_1.0_NHWC_FP16_v2.2.tgz) | +| Static | MS1MV3 | Res50 | FP32 | NCHW | 1.0 | 0.98400 | 0.98929 | 0.99833 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_1.0_NCHW_FP32_v2.2.tgz) | +| Static | MS1MV3 | Res100 | FP16 | NHWC | 0.1 | 0.98383 | 0.99200 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_0.1_NHWC_FP16_v2.2.tgz) | +| Static | MS1MV3 | Res100 | FP32 | NCHW | 0.1 | 0.98317 | 0.99157 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_0.1_NCHW_FP32_v2.2.tgz) | +| Static | MS1MV3 | Res100 | FP16 | NHWC | 1.0 | 0.98367 | 0.99086 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_1.0_NHWC_FP16_v2.2.tgz) | +| Static | MS1MV3 | Res100 | FP32 | NCHW | 1.0 | 0.98417 | 0.99129 | 0.99867 | 
[download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_1.0_NCHW_FP32_v2.2.tgz) | +| Dynamic | MS1MV3 | Res50 | FP16 | NHWC | 0.1 | 0.98367 | 0.99029 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2.tgz) | +| Dynamic | MS1MV3 | Res50 | FP32 | NCHW | 0.1 | 0.98400 | 0.98986 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_0.1_NCHW_FP32_v2.2.tgz) | +| Dynamic | MS1MV3 | Res50 | FP16 | NHWC | 1.0 | 0.98317 | 0.98971 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_1.0_NHWC_FP16_v2.2.tgz) | +| Dynamic | MS1MV3 | Res50 | FP32 | NCHW | 1.0 | 0.98350 | 0.99000 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_1.0_NCHW_FP32_v2.2.tgz) | +| Dynamic | MS1MV3 | Res100 | FP16 | NHWC | 0.1 | 0.98500 | 0.99143 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_0.1_NHWC_FP16_v2.2.tgz) | +| Dynamic | MS1MV3 | Res100 | FP32 | NCHW | 0.1 | 0.98383 | 0.99114 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_0.1_NCHW_FP32_v2.2.tgz) | +| Dynamic | MS1MV3 | Res100 | FP16 | NHWC | 1.0 | 0.98500 | 0.99214 | 0.99883 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_1.0_NHWC_FP16_v2.2.tgz) | +| Dynamic | MS1MV3 | Res100 | FP32 | NCHW | 1.0 | 0.98400 | 0.99100 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_1.0_NCHW_FP32_v2.2.tgz) | +| Dynamic | MS1MV3 | MobileFaceNet | FP32 | NCHW | 0.1 | 0.96200 | 0.96571 | 0.99567 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_MobileFaceNet_dynamic_0.1_NCHW_FP32_v2.2.tgz) | +| Dynamic | MS1MV3 | MobileFaceNet | FP32 | NCHW | 1.0 | 0.96167 | 0.96657 | 0.99533 | 
[download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_MobileFaceNet_dynamic_1.0_NCHW_FP32_v2.2.tgz) | -### 8.2 Maximum Number of Identities +### 9.2 Maximum Number of Identities + +#### Static Mode + +```bash +sh scripts/find_maximum_classes_static.sh +``` + +#### Dynamic Mode + +```bash +sh scripts/find_maximum_classes_dynamic.sh +``` **Configuration:** * GPU: 8 NVIDIA Tesla V100 32G (32510MiB) * BatchSize: 64/512 * SampleRatio: 0.1 -| Mode | Precision | Res50 | Res100 | -| ------------------------- | --------- | -------- | -------- | +| Mode | Precision | Res50 | Res100 | +| ------------------------- | --------- | ------------------ | ------------------ | | Framework1 (static) | AMP | 42000000 (31792MiB)| 39000000 (31938MiB)| | Framework2 (dynamic) | AMP | 30000000 (31702MiB)| 29000000 (32286MiB)| -| Paddle (static) | FP16 | 60000000 (32018MiB)| 60000000 (32018MiB)| -| Paddle (dynamic) | FP16 | 67000000 (31970MiB)| 67000000 (31970MiB)| +| Paddle (static) | FP16 | 92000000 (32298MiB)| 88000000 (32298MiB)| +| Paddle (dynamic) | FP16 | 87000000 (31978MiB)| 84000000 (31978MiB)| -**Note:** config environment variable by ``export FLAGS_allocator_strategy=naive_best_fit`` -### 8.3 Throughtput +### 9.3 Throughput **Configuration:** * BatchSize: 128/1024 @@ -199,9 +242,12 @@ sh scripts/inference.sh ![insightface_throughtput](experiments/images/throughtput.png) -## 9. Demo +**Note:** please click the image to view it in high definition. + +## 10. Demo Combined with face detection model, we can complete the face recognition process. +**Note: This is only a demo and cannot be used for commercial applications.** Firstly, use the fllowing commands to download the models. 
@@ -214,36 +260,35 @@ wget https://paddle-model-ecology.bj.bcebos.com/model/insight-face/blazeface_fpn tar -xzf models/blazeface_fpn_ssh_1000e_v1.0_infer.tar -C models/ rm -rf models/blazeface_fpn_ssh_1000e_v1.0_infer.tar -# Download static ResNet50 PartialFC 0.1 model and extract it -wget https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz -P models/ -tar -xf models/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz -C models/ -rm -rf models/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz +# Download dynamic ResNet50 PartialFC 0.1 model and extract it +wget https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2.tgz -P models/ +tar -xzf models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2.tgz -C models/ -# Export static save inference model -python tools/export.py --is_static True --export_type paddle --backbone FresResNet50 --embedding_size 512 --checkpoint_dir models/ms1mv3_r50_static_128_fp16_0.1_epoch_24 --output_dir models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer -rm -rf models/ms1mv3_r50_static_128_fp16_0.1_epoch_24 +# Export dynamic save inference model using cfp_fp best model +python tools/export.py --is_static False --export_type paddle --backbone FresResNet50 --embedding_size 512 --checkpoint_dir models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2/FresResNet50/best_model/cfp_fp/ --output_dir models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer ``` Then, use the following commands to download the gallery, demo image and font file for visualization. And we generate gallery features. 
```bash -# Download gallery, query and font file mkdir -p images/ -git clone https://github.com/littletomatodonkey/insight-face-paddle /tmp/insight-face-paddle -cp -r /tmp/insight-face-paddle/demo/friends/gallery/ images/ -cp -r /tmp/insight-face-paddle/demo/friends/query/ images/ + +# Download gallery, query +wget https://plsc.bj.bcebos.com/Friends.tgz -P images/ +tar -xzf images/Friends.tgz -C images/ + +# Download font file mkdir -p assets -cp /tmp/insight-face-paddle/SourceHanSansCN-Medium.otf assets/ -rm -rf /tmp/insight-face-paddle +wget https://plsc.bj.bcebos.com/SourceHanSansCN-Medium.otf -P assets/ # Build index file python tools/test_recognition.py \ --rec \ - --rec_model_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdmodel \ - --rec_params_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdiparams \ - --build_index=images/gallery/index.bin \ - --img_dir=images/gallery \ - --label=images/gallery/label.txt + --rec_model_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdmodel \ + --rec_params_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdiparams \ + --build_index=images/Friends/gallery/index.bin \ + --img_dir=images/Friends/gallery \ + --label=images/Friends/gallery/label.txt ``` Use the following command to run the whole face recognition demo. 
@@ -255,16 +300,15 @@ python tools/test_recognition.py \ --det_model_file_path models/blazeface_fpn_ssh_1000e_v1.0_infer/inference.pdmodel \ --det_params_file_path models/blazeface_fpn_ssh_1000e_v1.0_infer/inference.pdiparams \ --rec \ - --rec_model_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdmodel \ - --rec_params_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdiparams \ - --index=images/gallery/index.bin \ - --input=images/query/friends2.jpg \ + --rec_model_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdmodel \ + --rec_params_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdiparams \ + --index=images/Friends/gallery/index.bin \ + --input=images/Friends/query/friends2.jpg \ --cdd_num 10 \ - --rec_thresh 0.4 \ --output="./output" ``` -The final result is save in folder `output/`, which is shown as follows. +The final result is saved in the `output/` folder, which is shown as follows. **Note:** the recognition threshold is different according recognition model. Since we do not use landmark detection to align the face, the threshold is lower.
diff --git a/configs/argparser.py b/configs/argparser.py index dd220eb3efee9..5942d50f11c5e 100644 --- a/configs/argparser.py +++ b/configs/argparser.py @@ -71,6 +71,11 @@ def parse_args(): parser.parse_known_args(namespace=user_namespace) cfg = get_config(user_namespace.config_file) + parser.add_argument( + '--seed', + type=int, + default=cfg.seed, + help='global seed, None means do not fix seed, int value means to run reproduction') # Model setting parser.add_argument( '--is_static', @@ -81,7 +86,7 @@ def parse_args(): '--data_format', type=str, default=cfg.data_format, - help='model data layout, "NCHW" or "NHWC"') + help='model data layout, "NCHW" for FP32 or "NHWC" for FP16') parser.add_argument( '--backbone', type=str, default=cfg.backbone, help='backbone network') parser.add_argument( diff --git a/configs/config.py b/configs/config.py index d30c6fcc58877..c417a343f51f1 100644 --- a/configs/config.py +++ b/configs/config.py @@ -15,8 +15,10 @@ from easydict import EasyDict as edict config = edict() +config.seed = None # global seed, None means do not fix seed, int value means to run reproduction + config.is_static = True -config.data_format = 'NCHW' # 'NCHW' or 'NHWC' +config.data_format = 'NHWC' # 'NCHW' for FP32 or 'NHWC' for FP16 config.backbone = 'FresResNet100' config.classifier = 'LargeScaleClassifier' config.embedding_size = 512 diff --git a/configs/ms1mv2_mobileface.py b/configs/ms1mv3_mobileface.py similarity index 81% rename from configs/ms1mv2_mobileface.py rename to configs/ms1mv3_mobileface.py index e29a062a54911..ff7b4738ed20d 100644 --- a/configs/ms1mv2_mobileface.py +++ b/configs/ms1mv3_mobileface.py @@ -16,14 +16,17 @@ config = edict() config.is_static = False +config.data_format = 'NCHW' config.backbone = 'MobileFaceNet_128' config.classifier = 'LargeScaleClassifier' config.embedding_size = 128 config.model_parallel = True -config.sample_ratio = 1.0 +config.sample_ratio = 0.1 config.loss = 'ArcFace' config.dropout = 0.0 +config.fp16 = False + 
config.lr = 0.1 # for global batch size = 512 config.lr_decay = 0.1 config.weight_decay = 5e-4 @@ -34,11 +37,11 @@ config.decay_boundaries = [10, 16, 22] config.use_synthetic_dataset = False -config.dataset = "MS1M_v2" -config.data_dir = "./MS1M_v2" -config.label_file = "./MS1M_v2/label.txt" +config.dataset = "MS1M_v3" +config.data_dir = "./MS1M_v3" +config.label_file = "./MS1M_v3/label.txt" config.is_bin = False -config.num_classes = 85742 # 85742 for MS1M_v2, 93431 for MS1M_v3 +config.num_classes = 93431 # 85742 for MS1M_v2, 93431 for MS1M_v3 config.batch_size = 128 # global batch size 1024 of 8 GPU config.num_workers = 8 @@ -48,7 +51,7 @@ config.logdir = './log' config.log_interval_step = 100 -config.output = './MS1M_v2_arcface_MobileFaceNet_128_0.1' +config.output = './MS1M_v3_arcface_MobileFaceNet_128_0.1' config.resume = False config.checkpoint_dir = None config.max_num_last_checkpoint = 1 diff --git a/datasets/common_dataset.py b/datasets/common_dataset.py index 33ba8df498c25..5764279cebfb7 100644 --- a/datasets/common_dataset.py +++ b/datasets/common_dataset.py @@ -26,7 +26,6 @@ from datasets.kv_helper import read_img_from_bin - def transform(img): # random horizontal flip if random.randint(0, 1) == 0: @@ -40,11 +39,14 @@ def transform(img): class CommonDataset(paddle.io.Dataset): - def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True): + def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True, seed=0): super(CommonDataset, self).__init__() self.root_dir = root_dir self.label_file = label_file self.fp16 = fp16 + self.seed = seed + if self.seed != 0: + random.seed(self.seed) with open(label_file, "r") as fin: self.full_lines = fin.readlines() @@ -78,13 +80,17 @@ def __len__(self): return self.num_samples class SplitDataset(paddle.io.Dataset): - def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True): + def __init__(self, root_dir, label_file, rank=0, 
world_size=1, fp16=False, is_bin=True, seed=0): super(SplitDataset, self).__init__() self.root_dir = root_dir self.label_file = label_file self.rank = rank self.world_size = world_size self.fp16 = fp16 + self.seed = seed + if self.seed != 0: + random.seed(self.seed) + with open(label_file, "r") as fin: self.full_lines = fin.readlines() diff --git a/dynamic/backbones/iresnet.py b/dynamic/backbones/iresnet.py index 3c087fe0a2c6a..37bc145bd772a 100644 --- a/dynamic/backbones/iresnet.py +++ b/dynamic/backbones/iresnet.py @@ -101,7 +101,7 @@ def __init__(self, act=None, name=name + "_branch2a", data_format=data_format) - self.prelu = PReLU(num_parameters=num_filters, name=name + "_branch2a_prelu") + self.prelu = PReLU(num_parameters=num_filters, data_format=data_format, name=name + "_branch2a_prelu") self.conv1 = ConvBNLayer( num_channels=num_filters, num_filters=num_filters, @@ -283,7 +283,7 @@ def __init__(self, act=None, name="conv1", data_format=self.data_format) - self.prelu = PReLU(num_parameters=64, name="prelu1") + self.prelu = PReLU(num_parameters=64, data_format=self.data_format, name="prelu1") self.block_list = paddle.nn.LayerList() for block in range(len(units)): @@ -308,13 +308,15 @@ def __init__(self, feat_w = input_image_width // 16 feat_h = input_image_height // 16 self.fc_channels = num_filters[-1] * feat_w * feat_h + #NOTE(GuoxiaWang): don't use NHWC for last fc, + # thus we can train using NHWC and test using NCHW self.fc = FC(num_filters[-1], self.fc_channels, num_features, fc_type, dropout, name='fc', - data_format=self.data_format) + data_format="NCHW") def forward(self, inputs): if self.data_format == "NHWC": @@ -324,6 +326,8 @@ def forward(self, inputs): y = self.prelu(y) for block in self.block_list: y = block(y) + if self.data_format == "NHWC": + y = paddle.tensor.transpose(y, [0, 3, 1, 2]) y = self.fc(y) return y diff --git a/dynamic/backbones/mobilefacenet.py b/dynamic/backbones/mobilefacenet.py index 5b7bfd1d919a6..56a4490a1497a 100644 --- 
a/dynamic/backbones/mobilefacenet.py +++ b/dynamic/backbones/mobilefacenet.py @@ -42,7 +42,7 @@ def __init__(self, inp, oup, stride, expansion, data_format="NCHW"): nn.Conv2D( inp, inp * expansion, 1, 1, 0, bias_attr=False, data_format=data_format), nn.BatchNorm2D(inp * expansion, data_format=data_format), - nn.PReLU(inp * expansion), + nn.PReLU(inp * expansion, data_format=data_format), # 3*3 depth wise conv nn.Conv2D( @@ -56,7 +56,7 @@ def __init__(self, inp, oup, stride, expansion, data_format="NCHW"): data_format=data_format ), nn.BatchNorm2D(inp * expansion, data_format=data_format), - nn.PReLU(inp * expansion), + nn.PReLU(inp * expansion, data_format=data_format), # 1*1 conv nn.Conv2D( @@ -82,7 +82,7 @@ def __init__(self, inp, oup, k, s, p, dw=False, linear=False, data_format="NCHW" self.bn = nn.BatchNorm2D(oup, data_format=data_format) if not linear: - self.prelu = nn.PReLU(oup) + self.prelu = nn.PReLU(oup, data_format=data_format) def forward(self, x): x = self.conv(x) @@ -155,6 +155,8 @@ def forward(self, x): x = self.conv2(x) x = self.linear7(x) x = self.linear1(x) + if self.data_format == "NHWC": + x = paddle.tensor.transpose(x, [0, 3, 1, 2]) x = x.reshape([x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]]) return x diff --git a/dynamic/export.py b/dynamic/export.py index a41d30aeb1282..1b91262c00ece 100644 --- a/dynamic/export.py +++ b/dynamic/export.py @@ -30,7 +30,7 @@ def export(args): backbone = eval("backbones.{}".format(args.backbone))( num_features=args.embedding_size) - checkpoint.load(backbone, for_train=False, dtype='float32') + checkpoint.load(backbone, for_train=False) print("Load checkpoint from '{}'.".format(args.checkpoint_dir)) backbone.eval() diff --git a/dynamic/train.py b/dynamic/train.py index 047fd92cf863d..87804faff4e56 100644 --- a/dynamic/train.py +++ b/dynamic/train.py @@ -17,6 +17,7 @@ import sys import numpy as np import logging +import random import paddle from visualdl import LogWriter @@ -33,17 +34,8 @@ from . 
import classifiers from . import backbones -RELATED_FLAGS_SETTING = { - 'FLAGS_cudnn_exhaustive_search': 1, - 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, - 'FLAGS_max_inplace_grad_add': 8, - 'FLAGS_fraction_of_gpu_memory_to_use': 0.9999, -} -paddle.fluid.set_flags(RELATED_FLAGS_SETTING) - def train(args): - writer = LogWriter(logdir=args.logdir) rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) @@ -51,6 +43,24 @@ def train(args): gpu_id = int(os.getenv("FLAGS_selected_gpus", 0)) place = paddle.CUDAPlace(gpu_id) + RELATED_FLAGS_SETTING = {} + if args.seed == 0: + RELATED_FLAGS_SETTING['FLAGS_cudnn_deterministic'] = 1 + RELATED_FLAGS_SETTING['FLAGS_benchmark'] = 1 + args.num_workers = 0 + else: + # args.seed == None or args.seed != 0 + RELATED_FLAGS_SETTING['FLAGS_cudnn_exhaustive_search'] = 1 + RELATED_FLAGS_SETTING['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1 + RELATED_FLAGS_SETTING['FLAGS_max_inplace_grad_add'] = 8 + paddle.fluid.set_flags(RELATED_FLAGS_SETTING) + + if args.seed is not None: + args.seed = args.seed + rank + paddle.seed(args.seed) + np.random.seed(args.seed) + random.seed(args.seed) + if world_size > 1: import paddle.distributed.fleet as fleet @@ -67,7 +77,8 @@ def train(args): rank=rank, world_size=world_size, fp16=args.fp16, - is_bin=args.is_bin) + is_bin=args.is_bin, + seed=args.seed) num_image = trainset.total_num_samples total_batch_size = args.batch_size * world_size @@ -139,6 +150,7 @@ def train(args): callback_verification = CallBackVerification( args.validation_interval_step, rank, + world_size, args.batch_size, args.val_targets, args.data_dir, @@ -146,7 +158,7 @@ def train(args): callback_logging = CallBackLogging(args.log_interval_step, rank, world_size, total_steps, - args.batch_size, writer) + args.batch_size) checkpoint = Checkpoint( rank=rank, @@ -213,7 +225,16 @@ def train(args): loss_avg.update(loss_v.item(), 1) callback_logging(global_step, loss_avg, epoch, lr_value) if 
args.do_validation_while_train: - callback_verification(global_step, backbone) + best_metric = callback_verification(global_step, backbone) + if best_metric is not None and len(best_metric) > 0: + for ver_dataset in best_metric: + checkpoint.save( + backbone, + classifier, + optimizer, + epoch=epoch, + for_train=True, + best_metric=best_metric[ver_dataset]) lr_scheduler.step() if global_step >= total_steps: @@ -222,4 +243,3 @@ def train(args): checkpoint.save( backbone, classifier, optimizer, epoch=epoch, for_train=True) - writer.close() diff --git a/dynamic/utils/io.py b/dynamic/utils/io.py index a30449c86fecb..b98f601ba9ee9 100644 --- a/dynamic/utils/io.py +++ b/dynamic/utils/io.py @@ -19,6 +19,8 @@ import numpy as np import shutil import json + +from paddle.fluid.data_feeder import convert_dtype from utils.rearrange_weight import rearrange_weight @@ -45,9 +47,16 @@ def save(self, classifier: paddle.nn.Layer=None, optimizer=None, epoch=0, - for_train=True): - - model_save_dir = os.path.join(self.model_save_dir, str(epoch)) + for_train=True, + best_metric=None): + + if best_metric is not None: + save_rank = best_metric['rank'] + model_save_dir = os.path.join(self.model_save_dir, 'best_model', str(best_metric['dataset_name'])) + else: + save_rank = 0 # default we only save rank 0 backbone + model_save_dir = os.path.join(self.model_save_dir, str(epoch)) + if not os.path.exists(model_save_dir): # may be more than one processes trying # to create the directory @@ -58,7 +67,7 @@ def save(self, raise pass - if self.rank == 0: + if self.rank == save_rank: # for non dist param, we only save their at rank 0. for name, param in backbone.state_dict().items(): paddle.save( @@ -85,7 +94,7 @@ def save(self, paddle.save(opt, os.path.join(model_save_dir, name + '.pdopt')) - if self.rank == 0: + if self.rank == save_rank: # save some extra info for resume # pretrain_world_size, embedding_size, num_classes are used for # re-split fc weight when gpu setting changed. 
@@ -97,6 +106,8 @@ def save(self, extra_info['num_classes'] = self.num_classes extra_info['epoch'] = epoch extra_info['lr_state'] = lr_state_dict + if best_metric is not None: + extra_info['best_metric'] = best_metric with open(config_file, 'w') as f: json.dump(extra_info, f) @@ -117,7 +128,25 @@ def load(self, assert os.path.exists(self.checkpoint_dir) checkpoint_dir = os.path.abspath(self.checkpoint_dir) - + + type_dict = {} + for name, param in backbone.state_dict().items(): + type_dict[param.name] = convert_dtype(param.dtype) + + if classifier is not None: + # for dist param, we need to save their at all ranks. + for name, param in classifier.state_dict().items(): + type_dict[param.name] = convert_dtype(param.dtype) + + if for_train: + assert optimizer is not None + opt_state_dict = optimizer.state_dict() + lr_state_dict = opt_state_dict['LR_Scheduler'] + for name, opt in opt_state_dict.items(): + if name == 'LR_Scheduler' or '@GRAD' in name: + continue + type_dict[name] = convert_dtype(opt.dtype) + param_state_dict = {} opt_state_dict = {} dist_param_state_dict = {} @@ -139,11 +168,16 @@ def load(self, if not for_train and ext == '.pdopt': continue + + if classifier is None and 'dist@' in name and '@rank@' in name: + continue tensor = paddle.load(path, return_numpy=True) if dtype: assert dtype in ['float32', 'float16'] tensor = tensor.astype(dtype) + else: + tensor = tensor.astype(type_dict[name]) if 'dist@' in name and '@rank@' in name: if '.w' in name and 'velocity' not in name: diff --git a/dynamic/utils/verification.py b/dynamic/utils/verification.py index bafda1eab0dc0..bd9f78e3d3300 100644 --- a/dynamic/utils/verification.py +++ b/dynamic/utils/verification.py @@ -80,6 +80,7 @@ class CallBackVerification(object): def __init__(self, frequent, rank, + world_size, batch_size, val_targets, rec_prefix, @@ -87,6 +88,7 @@ def __init__(self, image_size=(112, 112)): self.frequent: int = frequent self.rank: int = rank + self.world_size: int = world_size 
self.batch_size: int = batch_size self.fp16 = fp16 self.highest_acc_list: List[float] = [0.0] * len(val_targets) @@ -98,6 +100,7 @@ def __init__(self, image_size=image_size) def ver_test(self, backbone: paddle.nn.Layer, global_step: int): + best_metric = {} for i in range(len(self.ver_list)): test_start = time.time() acc1, std1, acc2, std2, xnorm, embeddings_list = test( @@ -110,12 +113,27 @@ def ver_test(self, backbone: paddle.nn.Layer, global_step: int): (self.ver_name_list[i], global_step, xnorm)) logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (self.ver_name_list[i], global_step, acc2, std2)) - if acc2 > self.highest_acc_list[i]: - self.highest_acc_list[i] = acc2 + if self.world_size > 1: + max_acc_tensor = paddle.to_tensor(acc2, dtype='float64') + paddle.distributed.all_reduce(max_acc_tensor, paddle.distributed.ReduceOp.MAX) + max_acc = max_acc_tensor.item() + else: + max_acc = acc2 + if max_acc > self.highest_acc_list[i]: + self.highest_acc_list[i] = max_acc + if abs(max_acc - acc2) < 1e-8: + best_metric[self.ver_name_list[i]] = { + 'global_step': global_step, + 'acc2': acc2, + 'rank': self.rank, + 'dataset_name': self.ver_name_list[i], + } + logging.info('[%s][%d]Accuracy-Highest: %1.5f' % ( self.ver_name_list[i], global_step, self.highest_acc_list[i])) test_end = time.time() logging.info("test time: {:.4f}".format(test_end - test_start)) + return best_metric def init_dataset(self, val_targets, data_dir, image_size): for name in val_targets: @@ -129,5 +147,7 @@ def __call__(self, num_update, backbone: paddle.nn.Layer): if num_update > 0 and num_update % self.frequent == 0: backbone.eval() with paddle.no_grad(): - self.ver_test(backbone, num_update) + best_metric = self.ver_test(backbone, num_update) backbone.train() + return best_metric + return None diff --git a/dynamic/validation.py b/dynamic/validation.py index 52dc9bd875da5..330df0d17a4f3 100644 --- a/dynamic/validation.py +++ b/dynamic/validation.py @@ -30,11 +30,11 @@ def validation(args): 
checkpoint_dir=args.checkpoint_dir, ) backbone = eval("backbones.{}".format(args.backbone))( - num_features=args.embedding_size) + num_features=args.embedding_size, dropout=0.0, data_format="NHWC") checkpoint.load(backbone, for_train=False) backbone.eval() callback_verification = CallBackVerification( - 1, 0, args.batch_size, args.val_targets, args.data_dir) + 1, 0, 1, args.batch_size, args.val_targets, args.data_dir) callback_verification(1, backbone) diff --git a/experiments/images/friends2.jpg b/experiments/images/friends2.jpg index f5dad20a7fcf9..5a2ba40ed2b5a 100644 Binary files a/experiments/images/friends2.jpg and b/experiments/images/friends2.jpg differ diff --git a/experiments/images/throughtput.png b/experiments/images/throughtput.png index 8d81dee589563..f94f6092a7786 100644 Binary files a/experiments/images/throughtput.png and b/experiments/images/throughtput.png differ diff --git a/requirement.txt b/requirement.txt index 25de974effc99..17669c34bf1f3 100644 --- a/requirement.txt +++ b/requirement.txt @@ -1,4 +1,3 @@ -visualdl opencv-python pillow numpy @@ -14,3 +13,4 @@ opencv-python==4.4.0.46 onnxruntime onnx paddle2onnx +paddlepaddle-gpu>=2.2.2 diff --git a/scripts/find_maximum_classes_dynamic.sh b/scripts/find_maximum_classes_dynamic.sh index 53cd9d1f1591b..8628fd6d4b1e7 100644 --- a/scripts/find_maximum_classes_dynamic.sh +++ b/scripts/find_maximum_classes_dynamic.sh @@ -13,6 +13,7 @@ # limitations under the License. 
export FLAGS_allocator_strategy=naive_best_fit +export FLAGS_fraction_of_gpu_memory_to_use=0.9999 python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ --config_file configs/ms1mv3_r50.py \ --is_static False \ @@ -22,7 +23,7 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ --sample_ratio 0.1 \ --loss ArcFace \ --batch_size 64 \ - --num_classes 67000000 \ + --num_classes 87000000 \ --use_synthetic_dataset True \ --do_validation_while_train False \ --log_interval_step 1 \ diff --git a/scripts/find_maximum_classes_static.sh b/scripts/find_maximum_classes_static.sh index 4bc72f58fbf1e..e3fc91014f1c0 100644 --- a/scripts/find_maximum_classes_static.sh +++ b/scripts/find_maximum_classes_static.sh @@ -13,6 +13,7 @@ # limitations under the License. export FLAGS_allocator_strategy=naive_best_fit +export FLAGS_fraction_of_gpu_memory_to_use=0.999 python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ --config_file configs/ms1mv3_r50.py \ --is_static True \ @@ -22,10 +23,11 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ --sample_ratio 0.1 \ --loss ArcFace \ --batch_size 64 \ - --num_classes 60000000 \ + --num_classes 92000000 \ --use_synthetic_dataset True \ --do_validation_while_train False \ --log_interval_step 1 \ --fp16 True \ --lsc_init_from_numpy False \ --output fp16_arcface_static_0.1_maximum_classes + diff --git a/scripts/perf_runner.sh b/scripts/perf_runner.sh index 267c8d3cb7982..002a5e703f982 100644 --- a/scripts/perf_runner.sh +++ b/scripts/perf_runner.sh @@ -32,9 +32,7 @@ fi if [ $dtype = "fp16" ]; then fp16=True - data_format=NCHW -# TODO(GuoxiaWang): remove NCHW when PRelu support NHWC -# data_format=NHWC + data_format=NHWC else fp16=False data_format=NCHW diff --git a/scripts/train_dynamic.sh b/scripts/train_dynamic.sh index a623945992f51..dc4b502b730d0 100644 --- a/scripts/train_dynamic.sh +++ b/scripts/train_dynamic.sh @@ -12,7 +12,13 @@ # See the License 
for the specific language governing permissions and # limitations under the License. -python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ +# multi node ip list, e.g: +# TRAINER_IP_LIST=10.11.12.1,10.11.12.2 +TRAINER_IP_LIST=127.0.0.1 +# other gpus training, e.g: +# CUDA_VISIBLE_DEVICES=2 +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python -m paddle.distributed.launch --ips=$TRAINER_IP_LIST --gpus=$CUDA_VISIBLE_DEVICES tools/train.py \ --config_file configs/ms1mv3_r50.py \ --is_static False \ --backbone FresResNet50 \ @@ -38,4 +44,4 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ --warmup_num 0 \ --train_num 25 \ --decay_boundaries "10,16,22" \ - --output MS1M_v3_arcface_dynamic_0.1 + --output MS1M_v3_arcface_dynamic_0.1_NHWC_FP16 diff --git a/scripts/train_static.sh b/scripts/train_static.sh index 52ca2be1eb12d..4c027097b0c96 100644 --- a/scripts/train_static.sh +++ b/scripts/train_static.sh @@ -12,7 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \ +# multi node ip list, e.g: +# TRAINER_IP_LIST=10.11.12.1,10.11.12.2 +TRAINER_IP_LIST=127.0.0.1 +# other gpus training, e.g: +# CUDA_VISIBLE_DEVICES=2 +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python -m paddle.distributed.launch --ips=$TRAINER_IP_LIST --gpus=$CUDA_VISIBLE_DEVICES tools/train.py \ --config_file configs/ms1mv3_r50.py \ --is_static True \ --backbone FresResNet50 \ diff --git a/scripts/validation_dynamic.sh b/scripts/validation_dynamic.sh index 3b635c5f05896..c927e28290137 100644 --- a/scripts/validation_dynamic.sh +++ b/scripts/validation_dynamic.sh @@ -16,7 +16,7 @@ python tools/validation.py \ --is_static False \ --backbone FresResNet50 \ --embedding_size 512 \ - --checkpoint_dir MS1M_v3_arcface_dynamic_128_fp16_0.1/FresResNet50/24 \ + --checkpoint_dir MS1M_v3_arcface_dynamic_0.1_NHWC/FresResNet50/best_model/cfp_fp \ --data_dir MS1M_v3/ \ --val_targets lfw,cfp_fp,agedb_30 \ --batch_size 128 diff --git a/scripts/validation_static.sh b/scripts/validation_static.sh index 64227f7debd86..1f7a586be5cdc 100644 --- a/scripts/validation_static.sh +++ b/scripts/validation_static.sh @@ -16,7 +16,7 @@ python tools/validation.py \ --is_static True \ --backbone FresResNet50 \ --embedding_size 512 \ - --checkpoint_dir MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/24 \ + --checkpoint_dir MS1M_v3_arcface_static_0.1_NHWC_FP16/FresResNet50/best_model/cfp_fp/ \ --data_dir MS1M_v3/ \ --val_targets lfw,cfp_fp,agedb_30 \ --batch_size 128 diff --git a/static/backbones/iresnet.py b/static/backbones/iresnet.py index 7396c52e708ed..02aaaf1164e36 100644 --- a/static/backbones/iresnet.py +++ b/static/backbones/iresnet.py @@ -86,10 +86,10 @@ def __init__(self, momentum=0.9, data_layout=data_format, is_test=False if is_train else True) - # TODO(GuoxiaWang): add data_format attr input_blob = paddle.static.nn.prelu( input_blob, mode="channel", + data_format=data_format, param_attr=paddle.ParamAttr( 
initializer=paddle.nn.initializer.Constant(0.25))) @@ -98,8 +98,12 @@ def __init__(self, input_blob = self.residual_unit_v3( input_blob, filter_list[i + 1], 3, 2 if j == 0 else 1, 1, is_train, data_format) - fc1 = self.get_fc1(input_blob, is_train, dropout, data_format) + if data_format == 'NHWC': + input_blob = paddle.tensor.transpose(input_blob, [0, 3, 1, 2]) + #NOTE(GuoxiaWang): don't use NHWC for last fc, + # thus we can train using NHWC and test using NCHW + fc1 = self.get_fc1(input_blob, is_train, dropout, data_format="NCHW") self.output_dict['feature'] = fc1 def residual_unit_v3(self, @@ -135,10 +139,10 @@ def residual_unit_v3(self, momentum=0.9, data_layout=data_format, is_test=False if is_train else True) - # TODO(GuoxiaWang): add data_format attr prelu = paddle.static.nn.prelu( bn2, mode="channel", + data_format=data_format, param_attr=paddle.ParamAttr( initializer=paddle.nn.initializer.Constant(0.25))) conv2 = paddle.static.nn.conv2d( diff --git a/static/train.py b/static/train.py index 98031df3d5ae4..296aae415458a 100644 --- a/static/train.py +++ b/static/train.py @@ -17,6 +17,7 @@ import sys import numpy as np import logging +import random import paddle from visualdl import LogWriter @@ -32,31 +33,39 @@ from . 
import backbones from .static_model import StaticModel -RELATED_FLAGS_SETTING = { - 'FLAGS_cudnn_exhaustive_search': 1, - 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, - 'FLAGS_max_inplace_grad_add': 8, - 'FLAGS_fraction_of_gpu_memory_to_use': 0.9999, -} -paddle.fluid.set_flags(RELATED_FLAGS_SETTING) - def train(args): - writer = LogWriter(logdir=args.logdir) - rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) gpu_id = int(os.getenv("FLAGS_selected_gpus", 0)) place = paddle.CUDAPlace(gpu_id) + RELATED_FLAGS_SETTING = {} + if args.seed == 0: + RELATED_FLAGS_SETTING['FLAGS_cudnn_deterministic'] = 1 + RELATED_FLAGS_SETTING['FLAGS_benchmark'] = 1 + args.num_workers = 0 + else: + # args.seed == None or args.seed != 0 + RELATED_FLAGS_SETTING['FLAGS_cudnn_exhaustive_search'] = 1 + RELATED_FLAGS_SETTING['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1 + RELATED_FLAGS_SETTING['FLAGS_max_inplace_grad_add'] = 8 + paddle.fluid.set_flags(RELATED_FLAGS_SETTING) + + if args.seed is not None: + args.seed = args.seed + rank + paddle.seed(args.seed) + np.random.seed(args.seed) + random.seed(args.seed) + if world_size > 1: import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.without_graph_optimization = True fleet.init(is_collective=True, strategy=strategy) - + if args.use_synthetic_dataset: trainset = datasets.SyntheticDataset(args.num_classes, fp16=args.fp16) else: @@ -131,9 +140,9 @@ def train(args): 'custom_black_list': args.custom_black_list, }, margin_loss_params=margin_loss_params, - data_format=args.data_format, + data_format=args.data_format, lsc_init_from_numpy=args.lsc_init_from_numpy, ) - + if rank == 0: with open(os.path.join(args.output, 'main_program.txt'), 'w') as f: f.write(str(train_program)) @@ -150,14 +159,15 @@ def train(args): data_format=args.data_format, ) callback_verification = CallBackVerification( - args.validation_interval_step, rank, args.batch_size, test_program, + 
args.validation_interval_step, rank, world_size, args.batch_size, + test_program, list(test_model.backbone.input_dict.values()), list(test_model.backbone.output_dict.values()), args.val_targets, args.data_dir) callback_logging = CallBackLogging(args.log_interval_step, rank, world_size, total_steps, - args.batch_size, writer) + args.batch_size) checkpoint = Checkpoint( rank=rank, world_size=world_size, @@ -181,7 +191,7 @@ def train(args): # since we always use step style for lr_scheduler global_step = lr_state['last_epoch'] train_model.lr_scheduler.set_state_dict(lr_state) - + batch_sampler = eval("paddle.io.{}".format(args.batch_sampler))( dataset=trainset, batch_size=args.batch_size, @@ -210,7 +220,16 @@ def train(args): lr_value = train_model.optimizer.get_lr() callback_logging(global_step, loss_avg, epoch, lr_value) if args.do_validation_while_train: - callback_verification(global_step) + best_metric = callback_verification(global_step) + if best_metric is not None and len(best_metric) > 0: + for ver_dataset in best_metric: + checkpoint.save( + train_program, + lr_scheduler=train_model.lr_scheduler, + epoch=epoch, + for_train=True, + best_metric=best_metric[ver_dataset]) + train_model.lr_scheduler.step() if global_step >= total_steps: @@ -222,4 +241,3 @@ def train(args): lr_scheduler=train_model.lr_scheduler, epoch=epoch, for_train=True) - writer.close() diff --git a/static/utils/io.py b/static/utils/io.py index acc98ce38109a..1dbff6118832a 100644 --- a/static/utils/io.py +++ b/static/utils/io.py @@ -19,6 +19,8 @@ import numpy as np import shutil import json + +from paddle.fluid.data_feeder import convert_dtype from utils.rearrange_weight import rearrange_weight @@ -40,8 +42,21 @@ def __init__(self, self.checkpoint_dir: str = checkpoint_dir self.max_num_last_checkpoint: int = max_num_last_checkpoint - def save(self, program, lr_scheduler=None, epoch=0, for_train=True): - model_save_dir = os.path.join(self.model_save_dir, str(epoch)) + def save(self, + program, + 
lr_scheduler=None, + epoch=0, + for_train=True, + best_metric=None): + + if best_metric is not None: + save_rank = best_metric['rank'] + model_save_dir = os.path.join(self.model_save_dir, 'best_model', + str(best_metric['dataset_name'])) + else: + save_rank = 0 # default we only save rank 0 backbone + model_save_dir = os.path.join(self.model_save_dir, str(epoch)) + if not os.path.exists(model_save_dir): # may be more than one processes trying # to create the directory @@ -56,7 +71,7 @@ def save(self, program, lr_scheduler=None, epoch=0, for_train=True): for name, param in param_state_dict.items(): # for non dist param, we only save their at rank 0, # but for dist param, we need to save their at all ranks. - if 'dist@' in name and '@rank@' in name or self.rank == 0: + if 'dist@' in name and '@rank@' in name or self.rank == save_rank: paddle.save(param, os.path.join(model_save_dir, name + '.pdparam')) @@ -71,7 +86,7 @@ def save(self, program, lr_scheduler=None, epoch=0, for_train=True): paddle.save(opt, os.path.join(model_save_dir, name + '.pdopt')) - if self.rank == 0: + if self.rank == save_rank: # save some extra info for resume # pretrain_world_size, embedding_size, num_classes are used for # re-split fc weight when gpu setting changed. 
@@ -83,6 +98,8 @@ def save(self, program, lr_scheduler=None, epoch=0, for_train=True): extra_info['num_classes'] = self.num_classes extra_info['epoch'] = epoch extra_info['lr_state'] = lr_scheduler.state_dict() + if best_metric is not None: + extra_info['best_metric'] = best_metric with open(config_file, 'w') as f: json.dump(extra_info, f) @@ -98,6 +115,17 @@ def load(self, program, for_train=True, dtype=None): assert os.path.exists(self.checkpoint_dir) checkpoint_dir = os.path.abspath(self.checkpoint_dir) + param_state_dict = program.state_dict(mode='param') + opt_state_dict = program.state_dict(mode='opt') + type_dict = {} + shape_dict = {} + for name, param in param_state_dict.items(): + type_dict[name] = convert_dtype(param._dtype()) + shape_dict[name] = param.shape() + for name, opt in opt_state_dict.items(): + type_dict[name] = convert_dtype(opt._dtype()) + shape_dict[name] = opt.shape() + state_dict = {} dist_weight_state_dict = {} dist_weight_velocity_state_dict = {} @@ -117,10 +145,30 @@ def load(self, program, for_train=True, dtype=None): if not for_train and ext == '.pdopt': continue + if name not in type_dict: + continue + tensor = paddle.load(path, return_numpy=True) if dtype: assert dtype in ['float32', 'float16'] tensor = tensor.astype(dtype) + elif name in type_dict: + tensor = tensor.astype(type_dict[name]) + else: + pass + + if list(shape_dict[name]) != list(tensor.shape): + # for prelu NHWC[1, 1, 1, C] and NCHW [1, C, 1, 1] + expect_shape = list(shape_dict[name]) + actual_shape = list(tensor.shape) + if len(expect_shape) == len(actual_shape) and \ + expect_shape[0] == actual_shape[0] and expect_shape[0] == 1 and \ + expect_shape[2] == actual_shape[2] and expect_shape[2] == 1 and \ + expect_shape[1] == actual_shape[3]: + if actual_shape[3] != 1: + tensor = tensor.transpose([0, 3, 1, 2]) + elif actual_shape[1] != 1: + tensor = tensor.transpose([0, 2, 3, 1]) if 'dist@' in name and '@rank@' in name: if '.w' in name and 'velocity' not in name: diff 
--git a/static/utils/verification.py b/static/utils/verification.py index 3040d2cd74e0a..c0ce0bb490561 100644 --- a/static/utils/verification.py +++ b/static/utils/verification.py @@ -66,11 +66,12 @@ def test(rank, batch_size, data_set, executor, test_program, data_feeder, acc, std = np.mean(accuracy), np.std(accuracy) return acc, std, xnorm - +# Ref: https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/utils/utils_callbacks.py class CallBackVerification(object): def __init__(self, frequent, rank, + world_size, batch_size, test_program, feed_list, @@ -80,12 +81,13 @@ def __init__(self, image_size=(112, 112)): self.frequent: int = frequent self.rank: int = rank + self.world_size: int = world_size self.batch_size: int = batch_size self.test_program: paddle.static.Program = test_program self.feed_list: List[paddle.fluid.framework.Variable] = feed_list self.fetch_list: List[paddle.fluid.framework.Variable] = fetch_list - + self.highest_acc_list: List[float] = [0.0] * len(val_targets) self.ver_list: List[object] = [] self.ver_name_list: List[str] = [] @@ -95,12 +97,19 @@ def __init__(self, image_size=image_size) gpu_id = int(os.getenv("FLAGS_selected_gpus", 0)) - place = paddle.CUDAPlace(gpu_id) - self.executor = paddle.static.Executor(place) + self.place = paddle.CUDAPlace(gpu_id) + self.executor = paddle.static.Executor(self.place) self.data_feeder = paddle.fluid.DataFeeder( - place=place, feed_list=self.feed_list, program=self.test_program) + place=self.place, feed_list=self.feed_list, program=self.test_program) + + if self.world_size > 1: + self.test_scope = paddle.static.Scope() + with paddle.fluid.scope_guard(self.test_scope): + self.max_acc_var = paddle.static.create_global_var(shape=[1], value=0.0, dtype='float64', persistable=False, name='acc2') + self.executor.run(paddle.static.default_startup_program()) def ver_test(self, global_step: int): + best_metric = {} for i in range(len(self.ver_list)): test_start = time.time() acc2, std2, 
xnorm = test( @@ -110,12 +119,30 @@ def ver_test(self, global_step: int): (self.ver_name_list[i], global_step, xnorm)) logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (self.ver_name_list[i], global_step, acc2, std2)) - if acc2 > self.highest_acc_list[i]: - self.highest_acc_list[i] = acc2 + if self.world_size > 1: + with paddle.fluid.scope_guard(self.test_scope): + max_acc_tensor = self.max_acc_var.get_value() + max_acc_tensor.set(np.array([acc2], dtype='float64'), self.place) + paddle.distributed.all_reduce(self.max_acc_var, paddle.distributed.ReduceOp.MAX) + max_acc = np.asscalar(np.array(max_acc_tensor)) + else: + max_acc = acc2 + + if max_acc > self.highest_acc_list[i]: + self.highest_acc_list[i] = max_acc + if abs(max_acc - acc2) < 1e-8: + best_metric[self.ver_name_list[i]] = { + 'global_step': global_step, + 'acc2': acc2, + 'rank': self.rank, + 'dataset_name': self.ver_name_list[i], + } + logging.info('[%s][%d]Accuracy-Highest: %1.5f' % ( self.ver_name_list[i], global_step, self.highest_acc_list[i])) test_end = time.time() logging.info("test time: {:.4f}".format(test_end - test_start)) + return best_metric def init_dataset(self, val_targets, data_dir, image_size): for name in val_targets: @@ -127,4 +154,6 @@ def init_dataset(self, val_targets, data_dir, image_size): def __call__(self, num_update): if num_update > 0 and num_update % self.frequent == 0: - self.ver_test(num_update) + best_metric = self.ver_test(num_update) + return best_metric + return None diff --git a/static/validation.py b/static/validation.py index 0ab6ac661d2b2..b8b38ec4be4d2 100644 --- a/static/validation.py +++ b/static/validation.py @@ -50,7 +50,7 @@ def validation(args): checkpoint.load(program=test_program, for_train=False) callback_verification = CallBackVerification( - 1, 0, args.batch_size, test_program, + 1, 0, 1, args.batch_size, test_program, list(test_model.backbone.input_dict.values()), list(test_model.backbone.output_dict.values()), args.val_targets, args.data_dir) diff 
--git a/tools/test_recognition.py b/tools/test_recognition.py index b64ac6fdf0051..ef14c28554ae0 100644 --- a/tools/test_recognition.py +++ b/tools/test_recognition.py @@ -20,6 +20,7 @@ import pickle import tarfile from functools import partial +from collections import defaultdict import cv2 import numpy as np @@ -108,8 +109,8 @@ def str2bool(v): parser.add_argument( "--rec_thresh", type=float, - default=0.45, - help="The threshold of recognition postprocess. Default by 0.45.") + default=0.35, + help="The threshold of recognition postprocess. Default by 0.35.") parser.add_argument( "--max_batch_size", type=int, @@ -141,7 +142,7 @@ def print_config(args): table.add_row([param, args[param]]) width = len(str(table).split("\n")[0]) print("{}".format("-" * width)) - print("PaddleFace".center(width)) + print("PLSC".center(width)) print(table) print("Powered by PaddlePaddle!".rjust(width)) print("{}".format("-" * width)) @@ -473,23 +474,30 @@ def postprocess(self): pass def retrieval(self, np_feature): - labels = [] + id_score_list = [] for feature in np_feature: similarity = cosine_similarity(self.index_feature, feature).squeeze() abs_similarity = np.abs(similarity) candidate_idx = np.argpartition(abs_similarity, -self.cdd_num)[-self.cdd_num:] + remove_idx = np.where(abs_similarity[candidate_idx] < self.thresh) candidate_idx = np.delete(candidate_idx, remove_idx) candidate_label_list = list(np.array(self.label)[candidate_idx]) + candidate_score_list = abs_similarity[candidate_idx] + candidate_score_dict = defaultdict(list) + for lb, score in zip(candidate_label_list, candidate_score_list): + candidate_score_dict[lb].append(score) if len(candidate_label_list) == 0: - maxlabel = "" + maxlabel = "unknown" + maxscore = -1.0 else: maxlabel = max(candidate_label_list, key=candidate_label_list.count) - labels.append(maxlabel) - return labels + maxscore = max(candidate_score_dict[maxlabel]) + id_score_list.append((maxlabel, maxscore)) + return id_score_list def load_index(self, 
file_path): with open(file_path, "rb") as f: @@ -549,14 +557,14 @@ def preprocess(self, img): img = img.astype(np.float32, copy=False) return img - def draw(self, img, box_list, labels): - self.color_map.update(labels) + def draw(self, img, box_list, id_score_list): + self.color_map.update([id for id, score in id_score_list]) im = Image.fromarray(img) draw = ImageDraw.Draw(im) for i, dt in enumerate(box_list): bbox, score = dt[2:], dt[1] - label = labels[i] + label, idscore = id_score_list[i] color = tuple(self.color_map[label]) xmin, ymin, xmax, ymax = bbox @@ -564,19 +572,31 @@ def draw(self, img, box_list, labels): font_size = max(int((xmax - xmin) // 6), 10) font = ImageFont.truetype(self.font_path, font_size) - text = "{} {:.4f}".format(label, score) + face_text = "{} {:.4f}".format('face', score) th = sum(font.getmetrics()) - tw = font.getsize(text)[0] + tw = font.getsize(face_text)[0] start_y = max(0, ymin - th) + id_text = "{} {:.4f}".format(label, idscore) + tw = max(tw, font.getsize(id_text)[0]) + draw.rectangle( + [(xmin, start_y - th), (xmin + tw + 1, start_y)], fill=color) + draw.text( + (xmin + 1, start_y - th), + id_text, + fill=(255, 255, 255), + font=font, + anchor="la") + draw.rectangle( [(xmin, start_y), (xmin + tw + 1, start_y + th)], fill=color) draw.text( (xmin + 1, start_y), - text, + face_text, fill=(255, 255, 255), font=font, anchor="la") + draw.rectangle( [(xmin, ymin), (xmax, ymax)], width=2, outline=color) return np.array(im) @@ -631,18 +651,20 @@ def predict(self, input_data, print_info=False): continue box_list, np_feature = self.predict_np_img(img) if np_feature is not None: - labels = self.rec_predictor.retrieval(np_feature) + id_score_list = self.rec_predictor.retrieval(np_feature) else: - labels = ["face"] * len(box_list) + id_score_list = [("unknown", -1.0)] * len(box_list) if box_list is not None: - result = self.draw(img, box_list, labels=labels) + result = self.draw(img, box_list, id_score_list=id_score_list) 
self.output_writer.write(result, file_name) if print_info: - logging.info(f"File: {file_name}, predict label(s): {labels}") + logging.info( + f"File: {file_name}, predict id_score_list(s): {id_score_list}" + ) yield { "box_list": box_list, "features": np_feature, - "labels": labels + "id_score_list": id_score_list } logging.info(f"Predict complete!")