diff --git a/README.md b/README.md
index e4f9d012cd7ea..c064709410e92 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,33 @@
-# PLSC
+# PLSC (Paddle Large Scale Classification)
## 1. Introduction
-[PLSC](https://github.com/PaddlePaddle/PLSC) is an open source Paddle Large Scale Classification Tools, which supports 60 million classes on single node 8 NVIDIA V100 (32G).
+[PLSC](https://github.com/PaddlePaddle/PLSC) is an open source repository of Paddle Large Scale Classification tools, which supports 92 million classes on a single node with 8 NVIDIA V100 (32G) GPUs and achieves high training throughput. It implements [ArcFace](https://arxiv.org/abs/1801.07698), [CosFace](https://arxiv.org/abs/1801.09414), [PartialFC](https://arxiv.org/abs/2010.05222), SparseMomentum, DataParallel + ModelParallel distributed training, and FP16 training.
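+
+ArcFace and CosFace are both instances of the combined-margin form popularized by insightface, `cos(m1 * theta + m2) - m3`; a minimal NumPy sketch (the margin values shown here are the commonly used ones, not necessarily PLSC's defaults):
+
+```python
+import numpy as np
+
+def combined_margin(cos_theta, m1=1.0, m2=0.5, m3=0.0):
+    # ArcFace uses (m1, m2, m3) = (1.0, 0.5, 0.0);
+    # CosFace uses (1.0, 0.0, 0.35)
+    theta = np.arccos(np.clip(cos_theta, -1.0, 1.0))
+    return np.cos(m1 * theta + m2) - m3
+```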
-## 2. Environment Preparation
+## 2. Top News
-### 2.1 Install Paddle from Source Code
+**Update (2022-01-11):** Added the NHWC data format for FP16 training, improving throughput by 10% and reducing GPU memory usage by 30%. PLSC now supports 92 million classes on a single node with 8 NVIDIA V100 (32G) GPUs with high training throughput. Added best-checkpoint saving, and released 18 pretrained models together with PLSC v2.2.
+
+**Update (2021-12-11):** Released a [Zhihu Technical Article](https://zhuanlan.zhihu.com/p/443091282) and a [Bilibili Open Class](https://www.bilibili.com/video/BV1VP4y1G73X).
+
+**Update (2021-10-10):** Added FP16 training, improving throughput and reducing GPU memory usage. Supports 60 million classes on a single node with 8 NVIDIA V100 (32G) GPUs with high training throughput.
+
+**Update (2021-09-10):** Supports both ``static`` and ``dynamic`` graph modes with paddlepaddle v2.2, handling 48 million classes on a single node with 8 NVIDIA V100 (32G) GPUs. Added PartialFC, SparseMomentum, and the margin-based losses [ArcFace](https://arxiv.org/abs/1801.07698), [CosFace](https://arxiv.org/abs/1801.09414), and [PartialFC](https://arxiv.org/abs/2010.05222) (which we collectively refer to as MarginLoss). Backbones include IResNet and MobileNet.
+
+
+## 3. Environment Preparation
+
+### 3.1 Install Paddle from PyPI
+
+```shell
+# requires Python 3.x
+# requires paddlepaddle 2.2.2 or later
+pip install paddlepaddle-gpu==2.2.2
+```
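+
+After installing, you can sanity-check the GPU wheel (`paddle.utils.run_check()` is PaddlePaddle's built-in self-test):
+
+```python
+import paddle
+
+print(paddle.__version__)  # expect 2.2.2 or later
+paddle.utils.run_check()   # verifies that CUDA devices are usable
+```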
+
+### 3.2 Install Paddle from Source Code
+
+For more installation instructions, refer to [PaddlePaddle](https://www.paddlepaddle.org.cn/).
```shell
@@ -14,25 +35,21 @@ git clone https://github.com/PaddlePaddle/Paddle.git
cd /path/to/Paddle/
+# [optional] checkout release/2.2 branch
+git checkout -b release/2.2 upstream/release/2.2
+
mkdir build && cd build
-cmake .. -DWITH_TESTING=ON -DWITH_GPU=ON -DWITH_GOLANG=OFF -DWITH_STYLE_CHECK=ON -DCMAKE_INSTALL_PREFIX=$PWD/output -DWITH_DISTRIBUTE=ON -DCMAKE_BUILD_TYPE=Release -DPY_VERSION=3.7 -DCUDA_ARCH_NAME=All -DPADDLE_VERSION=2.2.0
+cmake .. -DWITH_TESTING=ON -DWITH_GPU=ON -DWITH_GOLANG=OFF -DWITH_STYLE_CHECK=ON -DCMAKE_INSTALL_PREFIX=$PWD/output -DWITH_DISTRIBUTE=ON -DCMAKE_BUILD_TYPE=Release -DPY_VERSION=3.7 -DCUDA_ARCH_NAME=All -DPADDLE_VERSION=2.2.2
make -j20 && make install -j20
-pip install output/opt/paddle/share/wheels/paddlepaddle_gpu-2.2.0-cp37-cp37m-linux_x86_64.whl
+pip install output/opt/paddle/share/wheels/paddlepaddle_gpu-2.2.2-cp37-cp37m-linux_x86_64.whl
```
-### 2.2 Install Paddle from PyPI
-
-```shell
-# python required 3.x or later
-# paddlepaddle required 2.2.0rc0 or later
-pip install paddlepaddle-gpu==2.2.0rc0
-```
-### 2.3 Download PLSC
+### 3.3 Download PLSC
```shell
git clone https://github.com/PaddlePaddle/PLSC.git
@@ -41,16 +58,16 @@ cd /path/to/PLSC/
```
-## 3. Data Preparation
+## 4. Data Preparation
-### 3.1 Download Dataset
+### 4.1 Download Dataset
Download the dataset from [insightface datasets](https://github.com/deepinsight/insightface/tree/master/recognition/_datasets_).
* MS1M_v2: MS1M-ArcFace
* MS1M_v3: MS1M-RetinaFace
-### 3.2 Extract MXNet Dataset to Images
+### 4.2 Extract MXNet Dataset to Images
```shell
python tools/mx_recordio_2_images.py --root_dir ms1m-retinaface-t1/ --output_dir MS1M_v3/
@@ -82,23 +99,10 @@ images/00000001.jpg 0
If you want to use a custom dataset, arrange your data according to the above format; a parsing sketch follows below.
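+
+For reference, a minimal sketch of parsing such a label file into `(path, label)` pairs, which is roughly what `datasets/common_dataset.py` does with `label.txt`:
+
+```python
+def read_label_file(label_file):
+    # each line is "<relative/image/path> <integer class id>"
+    samples = []
+    with open(label_file, "r") as fin:
+        for line in fin:
+            path, label = line.strip().split(" ")
+            samples.append((path, int(label)))
+    return samples
+```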
-### 3.3 Transform Between Original Image Files and Bin Files
-
-If you want to convert original image files to `bin` files used directly for training process, you can use the following command to finish the conversion.
-
-```shell
-python tools/convert_image_bin.py --image_path="your/input/image/path" --bin_path="your/output/bin/path" --mode="image2bin"
-```
-
-If you want to convert `bin` files to original image files, you can use the following command to finish the conversion.
+## 5. How to Train
-```shell
-python tools/convert_image_bin.py --image_path="your/input/bin/path" --bin_path="your/output/image/path" --mode="bin2image"
-```
-
-## 4. How to Training
-### 4.1 Single Node, 8 GPUs:
+### 5.1 Single Node, 8 GPUs:
#### Static Mode
@@ -112,11 +116,26 @@ sh scripts/train_static.sh
sh scripts/train_dynamic.sh
```
+### 5.2 Single Node, 1 GPU:
-During training, you can view loss changes in real time through `VisualDL`, For more information, please refer to [VisualDL](https://github.com/PaddlePaddle/VisualDL/).
+Modify the ``CUDA_VISIBLE_DEVICES`` environment variable in the training script:
+```bash
+TRAINER_IP_LIST=127.0.0.1
+CUDA_VISIBLE_DEVICES=3
+```
+
+### 5.3 Multi Node, Multi GPUs:
-## 5. Model Evaluation
+Modify the ``TRAINER_IP_LIST`` and ``CUDA_VISIBLE_DEVICES`` environment variables, and then run the training shell script on each node.
+
+```bash
+TRAINER_IP_LIST=10.11.12.1,10.11.12.2
+CUDA_VISIBLE_DEVICES=0,1,2,3
+```
+
+
+## 6. Model Evaluation
The model evaluation process can be started as follows.
@@ -132,7 +151,7 @@ sh scripts/validation_static.sh
sh scripts/validation_dynamic.sh
```
-## 6. Export Model
+## 7. Export Model
PaddlePaddle supports inference using prediction engines. Firstly, you should export the inference model.
#### Static Mode
@@ -149,7 +168,7 @@ sh scripts/export_dynamic.sh
We also support exporting to an ONNX model; you only need to set `--export_type onnx`.
-## 7. Model Inference
+## 8. Model Inference
The model inference process supports both Paddle inference models and ONNX models.
@@ -157,40 +176,64 @@ The model inference process supports paddle save inference model and onnx model.
sh scripts/inference.sh
```
-## 8. Model Performance
+## 9. Model Performance
-### 8.1 Accuracy on Verification Datasets
+### 9.1 Accuracy on Verification Datasets
**Configuration:**
* GPU: 8 NVIDIA Tesla V100 32G
- * Precison: FP16
 * BatchSize: 128 per GPU / 1024 global
-| Mode | Datasets | backbone | Ratio | agedb30 | cfp_fp | lfw | log | last checkpoint |
-| ------- | :------: | :------- | ----- | :------ | :----- | :--- | :--- | :--- |
-| Static | MS1MV3 | r50 | 0.1 | 0.98317 | 0.98943| 0.99850 | [log](experiments/logs/static/ms1mv3_r50_static_128_fp16_0.1/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz) |
-| Static | MS1MV3 | r50 | 1.0 | 0.98283 | 0.98843| 0.99850 | [log](experiments/logs/static/ms1mv3_r50_static_128_fp16_1.0/training.log) | [checkpoint](https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_1.0_epoch_24.tgz) |
-| Dynamic | MS1MV3 | r50 | 0.1 | 0.98367 | 0.98971| 0.99850 | [log](experiments/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_0.1/training.log) | [checkpoint](https://plsc.bj.bcebos.com/pretrained_model/ms1mv3_r50_dynamic_128_fp16_0.1_eopch_24.tgz) |
-| Dynamic | MS1MV3 | r50 | 1.0 | 0.98333 | 0.99043| 0.99850 | [log](experiments/logs/dynamic/ms1mv3_r50_dynamic_128_fp16_1.0/training.log) | [checkpoint](https://plsc.bj.bcebos.com/pretrained_model/ms1mv3_r50_dynamic_128_fp16_1.0_eopch_24.tgz) |
+| Mode | Datasets | Backbone | Precision | DataFormat | Ratio | agedb30 | cfp_fp | lfw | checkpoint&log |
+| ------- | :------: | :------------ | --------- | ---------- | ----- | ------- | ------- | :------ | :----------------------------------------------------------- |
+| Static | MS1MV3 | Res50 | FP16 | NHWC | 0.1 | 0.98200 | 0.98943 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_0.1_NHWC_FP16_v2.2.tgz) |
+| Static | MS1MV3 | Res50 | FP32 | NCHW | 0.1 | 0.98267 | 0.98986 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_0.1_NCHW_FP32_v2.2.tgz) |
+| Static | MS1MV3 | Res50 | FP16 | NHWC | 1.0 | 0.98300 | 0.98929 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_1.0_NHWC_FP16_v2.2.tgz) |
+| Static | MS1MV3 | Res50 | FP32 | NCHW | 1.0 | 0.98400 | 0.98929 | 0.99833 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_static_1.0_NCHW_FP32_v2.2.tgz) |
+| Static | MS1MV3 | Res100 | FP16 | NHWC | 0.1 | 0.98383 | 0.99200 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_0.1_NHWC_FP16_v2.2.tgz) |
+| Static | MS1MV3 | Res100 | FP32 | NCHW | 0.1 | 0.98317 | 0.99157 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_0.1_NCHW_FP32_v2.2.tgz) |
+| Static | MS1MV3 | Res100 | FP16 | NHWC | 1.0 | 0.98367 | 0.99086 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_1.0_NHWC_FP16_v2.2.tgz) |
+| Static | MS1MV3 | Res100 | FP32 | NCHW | 1.0 | 0.98417 | 0.99129 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_static_1.0_NCHW_FP32_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res50 | FP16 | NHWC | 0.1 | 0.98367 | 0.99029 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res50 | FP32 | NCHW | 0.1 | 0.98400 | 0.98986 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_0.1_NCHW_FP32_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res50 | FP16 | NHWC | 1.0 | 0.98317 | 0.98971 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_1.0_NHWC_FP16_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res50 | FP32 | NCHW | 1.0 | 0.98350 | 0.99000 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_1.0_NCHW_FP32_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res100 | FP16 | NHWC | 0.1 | 0.98500 | 0.99143 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_0.1_NHWC_FP16_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res100 | FP32 | NCHW | 0.1 | 0.98383 | 0.99114 | 0.99867 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_0.1_NCHW_FP32_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res100 | FP16 | NHWC | 1.0 | 0.98500 | 0.99214 | 0.99883 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_1.0_NHWC_FP16_v2.2.tgz) |
+| Dynamic | MS1MV3 | Res100 | FP32 | NCHW | 1.0 | 0.98400 | 0.99100 | 0.99850 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res100_dynamic_1.0_NCHW_FP32_v2.2.tgz) |
+| Dynamic | MS1MV3 | MobileFaceNet | FP32 | NCHW | 0.1 | 0.96200 | 0.96571 | 0.99567 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_MobileFaceNet_dynamic_0.1_NCHW_FP32_v2.2.tgz) |
+| Dynamic | MS1MV3 | MobileFaceNet | FP32 | NCHW | 1.0 | 0.96167 | 0.96657 | 0.99533 | [download](https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_MobileFaceNet_dynamic_1.0_NCHW_FP32_v2.2.tgz) |
-### 8.2 Maximum Number of Identities
+### 9.2 Maximum Number of Identities
+
+#### Static Mode
+
+```bash
+sh scripts/find_maximum_classes_static.sh
+```
+
+#### Dynamic Mode
+
+```bash
+sh scripts/find_maximum_classes_dynamic.sh
+```
**Configuration:**
* GPU: 8 NVIDIA Tesla V100 32G (32510MiB)
 * BatchSize: 64 per GPU / 512 global
* SampleRatio: 0.1
-| Mode | Precision | Res50 | Res100 |
-| ------------------------- | --------- | -------- | -------- |
+| Mode | Precision | Res50 | Res100 |
+| ------------------------- | --------- | ------------------ | ------------------ |
| Framework1 (static) | AMP | 42000000 (31792MiB)| 39000000 (31938MiB)|
| Framework2 (dynamic) | AMP | 30000000 (31702MiB)| 29000000 (32286MiB)|
-| Paddle (static) | FP16 | 60000000 (32018MiB)| 60000000 (32018MiB)|
-| Paddle (dynamic) | FP16 | 67000000 (31970MiB)| 67000000 (31970MiB)|
+| Paddle (static) | FP16 | 92000000 (32298MiB)| 88000000 (32298MiB)|
+| Paddle (dynamic) | FP16 | 87000000 (31978MiB)| 84000000 (31978MiB)|
-**Note:** config environment variable by ``export FLAGS_allocator_strategy=naive_best_fit``
-### 8.3 Throughtput
+### 9.3 Throughput
**Configuration:**
 * BatchSize: 128 per GPU / 1024 global
@@ -199,9 +242,12 @@ sh scripts/inference.sh
![insightface_throughput](experiments/images/throughtput.png)
-## 9. Demo
+**Note:** please click the image to view it in high definition.
+
+## 10. Demo
Combined with a face detection model, we can complete the whole face recognition pipeline.
+**Note: This is only a demo and may not be used for commercial applications.**
Firstly, use the following commands to download the models.
@@ -214,36 +260,35 @@ wget https://paddle-model-ecology.bj.bcebos.com/model/insight-face/blazeface_fpn
tar -xzf models/blazeface_fpn_ssh_1000e_v1.0_infer.tar -C models/
rm -rf models/blazeface_fpn_ssh_1000e_v1.0_infer.tar
-# Download static ResNet50 PartialFC 0.1 model and extract it
-wget https://paddle-model-ecology.bj.bcebos.com/model/insight-face/distributed/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz -P models/
-tar -xf models/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz -C models/
-rm -rf models/ms1mv3_r50_static_128_fp16_0.1_epoch_24.tgz
+# Download dynamic ResNet50 PartialFC 0.1 model and extract it
+wget https://plsc.bj.bcebos.com/pretrained_model/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2.tgz -P models/
+tar -xzf models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2.tgz -C models/
-# Export static save inference model
-python tools/export.py --is_static True --export_type paddle --backbone FresResNet50 --embedding_size 512 --checkpoint_dir models/ms1mv3_r50_static_128_fp16_0.1_epoch_24 --output_dir models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer
-rm -rf models/ms1mv3_r50_static_128_fp16_0.1_epoch_24
+# Export the dynamic-graph inference model from the best cfp_fp checkpoint
+python tools/export.py --is_static False --export_type paddle --backbone FresResNet50 --embedding_size 512 --checkpoint_dir models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2/FresResNet50/best_model/cfp_fp/ --output_dir models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer
```
Then, use the following commands to download the gallery, the query images, and the font file for visualization, and to generate the gallery features.
```bash
-# Download gallery, query and font file
mkdir -p images/
-git clone https://github.com/littletomatodonkey/insight-face-paddle /tmp/insight-face-paddle
-cp -r /tmp/insight-face-paddle/demo/friends/gallery/ images/
-cp -r /tmp/insight-face-paddle/demo/friends/query/ images/
+
+# Download gallery, query
+wget https://plsc.bj.bcebos.com/Friends.tgz -P images/
+tar -xzf images/Friends.tgz -C images/
+
+# Download font file
mkdir -p assets
-cp /tmp/insight-face-paddle/SourceHanSansCN-Medium.otf assets/
-rm -rf /tmp/insight-face-paddle
+wget https://plsc.bj.bcebos.com/SourceHanSansCN-Medium.otf -P assets/
# Build index file
python tools/test_recognition.py \
--rec \
- --rec_model_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdmodel \
- --rec_params_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdiparams \
- --build_index=images/gallery/index.bin \
- --img_dir=images/gallery \
- --label=images/gallery/label.txt
+ --rec_model_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdmodel \
+ --rec_params_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdiparams \
+ --build_index=images/Friends/gallery/index.bin \
+ --img_dir=images/Friends/gallery \
+ --label=images/Friends/gallery/label.txt
```
Use the following command to run the whole face recognition demo.
@@ -255,16 +300,15 @@ python tools/test_recognition.py \
--det_model_file_path models/blazeface_fpn_ssh_1000e_v1.0_infer/inference.pdmodel \
--det_params_file_path models/blazeface_fpn_ssh_1000e_v1.0_infer/inference.pdiparams \
--rec \
- --rec_model_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdmodel \
- --rec_params_file_path models/ms1mv3_r50_static_128_fp16_0.1_epoch_24_infer/FresResNet50.pdiparams \
- --index=images/gallery/index.bin \
- --input=images/query/friends2.jpg \
+ --rec_model_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdmodel \
+ --rec_params_file_path models/MS1M_v3_arcface_Res50_dynamic_0.1_NHWC_FP16_v2.2_infer/FresResNet50.pdiparams \
+ --index=images/Friends/gallery/index.bin \
+ --input=images/Friends/query/friends2.jpg \
--cdd_num 10 \
- --rec_thresh 0.4 \
--output="./output"
```
-The final result is save in folder `output/`, which is shown as follows.
+The final result is saved in the folder `output/`, as shown below. **Note:** the recognition threshold differs between recognition models. Since we do not use landmark detection to align the faces, the threshold is lower.
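+
+For reference, the retrieval step keeps only gallery candidates whose absolute cosine similarity clears `--rec_thresh` and then majority-votes their labels; a simplified sketch of `retrieval()` from `tools/test_recognition.py`:
+
+```python
+import numpy as np
+
+def retrieve(feature, gallery, labels, thresh=0.35, cdd_num=10):
+    # cosine similarity of one query feature against all gallery features
+    sim = gallery @ feature / (
+        np.linalg.norm(gallery, axis=1) * np.linalg.norm(feature) + 1e-12)
+    cand = np.argpartition(np.abs(sim), -cdd_num)[-cdd_num:]
+    cand = cand[np.abs(sim[cand]) >= thresh]
+    if cand.size == 0:
+        return "unknown", -1.0
+    votes = [labels[i] for i in cand]
+    best = max(set(votes), key=votes.count)
+    return best, float(max(abs(sim[i]) for i in cand if labels[i] == best))
+```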
diff --git a/configs/argparser.py b/configs/argparser.py
index dd220eb3efee9..5942d50f11c5e 100644
--- a/configs/argparser.py
+++ b/configs/argparser.py
@@ -71,6 +71,11 @@ def parse_args():
parser.parse_known_args(namespace=user_namespace)
cfg = get_config(user_namespace.config_file)
+ parser.add_argument(
+ '--seed',
+ type=int,
+ default=cfg.seed,
+        help='global seed; None means the seed is not fixed, an int value makes runs reproducible')
# Model setting
parser.add_argument(
'--is_static',
@@ -81,7 +86,7 @@ def parse_args():
'--data_format',
type=str,
default=cfg.data_format,
- help='model data layout, "NCHW" or "NHWC"')
+ help='model data layout, "NCHW" for FP32 or "NHWC" for FP16')
parser.add_argument(
'--backbone', type=str, default=cfg.backbone, help='backbone network')
parser.add_argument(
diff --git a/configs/config.py b/configs/config.py
index d30c6fcc58877..c417a343f51f1 100644
--- a/configs/config.py
+++ b/configs/config.py
@@ -15,8 +15,10 @@
from easydict import EasyDict as edict
config = edict()
+config.seed = None  # global seed; None means the seed is not fixed, an int value makes runs reproducible
+
config.is_static = True
-config.data_format = 'NCHW' # 'NCHW' or 'NHWC'
+config.data_format = 'NHWC' # 'NCHW' for FP32 or 'NHWC' for FP16
config.backbone = 'FresResNet100'
config.classifier = 'LargeScaleClassifier'
config.embedding_size = 512
diff --git a/configs/ms1mv2_mobileface.py b/configs/ms1mv3_mobileface.py
similarity index 81%
rename from configs/ms1mv2_mobileface.py
rename to configs/ms1mv3_mobileface.py
index e29a062a54911..ff7b4738ed20d 100644
--- a/configs/ms1mv2_mobileface.py
+++ b/configs/ms1mv3_mobileface.py
@@ -16,14 +16,17 @@
config = edict()
config.is_static = False
+config.data_format = 'NCHW'
config.backbone = 'MobileFaceNet_128'
config.classifier = 'LargeScaleClassifier'
config.embedding_size = 128
config.model_parallel = True
-config.sample_ratio = 1.0
+config.sample_ratio = 0.1
config.loss = 'ArcFace'
config.dropout = 0.0
+config.fp16 = False
+
config.lr = 0.1 # for global batch size = 512
config.lr_decay = 0.1
config.weight_decay = 5e-4
@@ -34,11 +37,11 @@
config.decay_boundaries = [10, 16, 22]
config.use_synthetic_dataset = False
-config.dataset = "MS1M_v2"
-config.data_dir = "./MS1M_v2"
-config.label_file = "./MS1M_v2/label.txt"
+config.dataset = "MS1M_v3"
+config.data_dir = "./MS1M_v3"
+config.label_file = "./MS1M_v3/label.txt"
config.is_bin = False
-config.num_classes = 85742 # 85742 for MS1M_v2, 93431 for MS1M_v3
+config.num_classes = 93431 # 85742 for MS1M_v2, 93431 for MS1M_v3
config.batch_size = 128 # global batch size 1024 of 8 GPU
config.num_workers = 8
@@ -48,7 +51,7 @@
config.logdir = './log'
config.log_interval_step = 100
-config.output = './MS1M_v2_arcface_MobileFaceNet_128_0.1'
+config.output = './MS1M_v3_arcface_MobileFaceNet_128_0.1'
config.resume = False
config.checkpoint_dir = None
config.max_num_last_checkpoint = 1
diff --git a/datasets/common_dataset.py b/datasets/common_dataset.py
index 33ba8df498c25..5764279cebfb7 100644
--- a/datasets/common_dataset.py
+++ b/datasets/common_dataset.py
@@ -26,7 +26,6 @@
from datasets.kv_helper import read_img_from_bin
-
def transform(img):
# random horizontal flip
if random.randint(0, 1) == 0:
@@ -40,11 +39,14 @@ def transform(img):
class CommonDataset(paddle.io.Dataset):
- def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True):
+ def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True, seed=0):
super(CommonDataset, self).__init__()
self.root_dir = root_dir
self.label_file = label_file
self.fp16 = fp16
+ self.seed = seed
+ if self.seed != 0:
+ random.seed(self.seed)
with open(label_file, "r") as fin:
self.full_lines = fin.readlines()
@@ -78,13 +80,17 @@ def __len__(self):
return self.num_samples
class SplitDataset(paddle.io.Dataset):
- def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True):
+ def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True, seed=0):
super(SplitDataset, self).__init__()
self.root_dir = root_dir
self.label_file = label_file
self.rank = rank
self.world_size = world_size
self.fp16 = fp16
+ self.seed = seed
+ if self.seed != 0:
+ random.seed(self.seed)
+
with open(label_file, "r") as fin:
self.full_lines = fin.readlines()
diff --git a/dynamic/backbones/iresnet.py b/dynamic/backbones/iresnet.py
index 3c087fe0a2c6a..37bc145bd772a 100644
--- a/dynamic/backbones/iresnet.py
+++ b/dynamic/backbones/iresnet.py
@@ -101,7 +101,7 @@ def __init__(self,
act=None,
name=name + "_branch2a",
data_format=data_format)
- self.prelu = PReLU(num_parameters=num_filters, name=name + "_branch2a_prelu")
+ self.prelu = PReLU(num_parameters=num_filters, data_format=data_format, name=name + "_branch2a_prelu")
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
@@ -283,7 +283,7 @@ def __init__(self,
act=None,
name="conv1",
data_format=self.data_format)
- self.prelu = PReLU(num_parameters=64, name="prelu1")
+ self.prelu = PReLU(num_parameters=64, data_format=self.data_format, name="prelu1")
self.block_list = paddle.nn.LayerList()
for block in range(len(units)):
@@ -308,13 +308,15 @@ def __init__(self,
feat_w = input_image_width // 16
feat_h = input_image_height // 16
self.fc_channels = num_filters[-1] * feat_w * feat_h
+ #NOTE(GuoxiaWang): don't use NHWC for last fc,
+ # thus we can train using NHWC and test using NCHW
self.fc = FC(num_filters[-1],
self.fc_channels,
num_features,
fc_type,
dropout,
name='fc',
- data_format=self.data_format)
+ data_format="NCHW")
def forward(self, inputs):
if self.data_format == "NHWC":
@@ -324,6 +326,8 @@ def forward(self, inputs):
y = self.prelu(y)
for block in self.block_list:
y = block(y)
+ if self.data_format == "NHWC":
+ y = paddle.tensor.transpose(y, [0, 3, 1, 2])
y = self.fc(y)
return y
diff --git a/dynamic/backbones/mobilefacenet.py b/dynamic/backbones/mobilefacenet.py
index 5b7bfd1d919a6..56a4490a1497a 100644
--- a/dynamic/backbones/mobilefacenet.py
+++ b/dynamic/backbones/mobilefacenet.py
@@ -42,7 +42,7 @@ def __init__(self, inp, oup, stride, expansion, data_format="NCHW"):
nn.Conv2D(
inp, inp * expansion, 1, 1, 0, bias_attr=False, data_format=data_format),
nn.BatchNorm2D(inp * expansion, data_format=data_format),
- nn.PReLU(inp * expansion),
+ nn.PReLU(inp * expansion, data_format=data_format),
# 3*3 depth wise conv
nn.Conv2D(
@@ -56,7 +56,7 @@ def __init__(self, inp, oup, stride, expansion, data_format="NCHW"):
data_format=data_format
),
nn.BatchNorm2D(inp * expansion, data_format=data_format),
- nn.PReLU(inp * expansion),
+ nn.PReLU(inp * expansion, data_format=data_format),
# 1*1 conv
nn.Conv2D(
@@ -82,7 +82,7 @@ def __init__(self, inp, oup, k, s, p, dw=False, linear=False, data_format="NCHW"
self.bn = nn.BatchNorm2D(oup, data_format=data_format)
if not linear:
- self.prelu = nn.PReLU(oup)
+ self.prelu = nn.PReLU(oup, data_format=data_format)
def forward(self, x):
x = self.conv(x)
@@ -155,6 +155,8 @@ def forward(self, x):
x = self.conv2(x)
x = self.linear7(x)
x = self.linear1(x)
+ if self.data_format == "NHWC":
+ x = paddle.tensor.transpose(x, [0, 3, 1, 2])
x = x.reshape([x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]])
return x
diff --git a/dynamic/export.py b/dynamic/export.py
index a41d30aeb1282..1b91262c00ece 100644
--- a/dynamic/export.py
+++ b/dynamic/export.py
@@ -30,7 +30,7 @@ def export(args):
backbone = eval("backbones.{}".format(args.backbone))(
num_features=args.embedding_size)
- checkpoint.load(backbone, for_train=False, dtype='float32')
+ checkpoint.load(backbone, for_train=False)
print("Load checkpoint from '{}'.".format(args.checkpoint_dir))
backbone.eval()
diff --git a/dynamic/train.py b/dynamic/train.py
index 047fd92cf863d..87804faff4e56 100644
--- a/dynamic/train.py
+++ b/dynamic/train.py
@@ -17,6 +17,7 @@
import sys
import numpy as np
import logging
+import random
import paddle
from visualdl import LogWriter
@@ -33,17 +34,8 @@
from . import classifiers
from . import backbones
-RELATED_FLAGS_SETTING = {
- 'FLAGS_cudnn_exhaustive_search': 1,
- 'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
- 'FLAGS_max_inplace_grad_add': 8,
- 'FLAGS_fraction_of_gpu_memory_to_use': 0.9999,
-}
-paddle.fluid.set_flags(RELATED_FLAGS_SETTING)
-
def train(args):
- writer = LogWriter(logdir=args.logdir)
rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
@@ -51,6 +43,24 @@ def train(args):
gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
place = paddle.CUDAPlace(gpu_id)
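+    # NOTE: seed == 0 requests a reproducible run: deterministic cuDNN kernels
+    # and single-process data loading. Any other int seeds the RNGs with a
+    # per-rank offset so each GPU draws a different sample stream; None leaves
+    # the seed unfixed and keeps the performance-oriented cuDNN flags.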
+ RELATED_FLAGS_SETTING = {}
+ if args.seed == 0:
+ RELATED_FLAGS_SETTING['FLAGS_cudnn_deterministic'] = 1
+ RELATED_FLAGS_SETTING['FLAGS_benchmark'] = 1
+ args.num_workers = 0
+ else:
+ # args.seed == None or args.seed != 0
+ RELATED_FLAGS_SETTING['FLAGS_cudnn_exhaustive_search'] = 1
+ RELATED_FLAGS_SETTING['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1
+ RELATED_FLAGS_SETTING['FLAGS_max_inplace_grad_add'] = 8
+ paddle.fluid.set_flags(RELATED_FLAGS_SETTING)
+
+ if args.seed is not None:
+ args.seed = args.seed + rank
+ paddle.seed(args.seed)
+ np.random.seed(args.seed)
+ random.seed(args.seed)
+
if world_size > 1:
import paddle.distributed.fleet as fleet
@@ -67,7 +77,8 @@ def train(args):
rank=rank,
world_size=world_size,
fp16=args.fp16,
- is_bin=args.is_bin)
+ is_bin=args.is_bin,
+ seed=args.seed)
num_image = trainset.total_num_samples
total_batch_size = args.batch_size * world_size
@@ -139,6 +150,7 @@ def train(args):
callback_verification = CallBackVerification(
args.validation_interval_step,
rank,
+ world_size,
args.batch_size,
args.val_targets,
args.data_dir,
@@ -146,7 +158,7 @@ def train(args):
callback_logging = CallBackLogging(args.log_interval_step, rank,
world_size, total_steps,
- args.batch_size, writer)
+ args.batch_size)
checkpoint = Checkpoint(
rank=rank,
@@ -213,7 +225,16 @@ def train(args):
loss_avg.update(loss_v.item(), 1)
callback_logging(global_step, loss_avg, epoch, lr_value)
if args.do_validation_while_train:
- callback_verification(global_step, backbone)
+ best_metric = callback_verification(global_step, backbone)
+ if best_metric is not None and len(best_metric) > 0:
+ for ver_dataset in best_metric:
+ checkpoint.save(
+ backbone,
+ classifier,
+ optimizer,
+ epoch=epoch,
+ for_train=True,
+ best_metric=best_metric[ver_dataset])
lr_scheduler.step()
if global_step >= total_steps:
@@ -222,4 +243,3 @@ def train(args):
checkpoint.save(
backbone, classifier, optimizer, epoch=epoch, for_train=True)
- writer.close()
diff --git a/dynamic/utils/io.py b/dynamic/utils/io.py
index a30449c86fecb..b98f601ba9ee9 100644
--- a/dynamic/utils/io.py
+++ b/dynamic/utils/io.py
@@ -19,6 +19,8 @@
import numpy as np
import shutil
import json
+
+from paddle.fluid.data_feeder import convert_dtype
from utils.rearrange_weight import rearrange_weight
@@ -45,9 +47,16 @@ def save(self,
classifier: paddle.nn.Layer=None,
optimizer=None,
epoch=0,
- for_train=True):
-
- model_save_dir = os.path.join(self.model_save_dir, str(epoch))
+ for_train=True,
+ best_metric=None):
+
+ if best_metric is not None:
+ save_rank = best_metric['rank']
+ model_save_dir = os.path.join(self.model_save_dir, 'best_model', str(best_metric['dataset_name']))
+ else:
+        save_rank = 0  # by default, only rank 0 saves the backbone
+ model_save_dir = os.path.join(self.model_save_dir, str(epoch))
+
if not os.path.exists(model_save_dir):
# may be more than one processes trying
# to create the directory
@@ -58,7 +67,7 @@ def save(self,
raise
pass
- if self.rank == 0:
+ if self.rank == save_rank:
# for non dist param, we only save their at rank 0.
for name, param in backbone.state_dict().items():
paddle.save(
@@ -85,7 +94,7 @@ def save(self,
paddle.save(opt,
os.path.join(model_save_dir, name + '.pdopt'))
- if self.rank == 0:
+ if self.rank == save_rank:
# save some extra info for resume
# pretrain_world_size, embedding_size, num_classes are used for
# re-split fc weight when gpu setting changed.
@@ -97,6 +106,8 @@ def save(self,
extra_info['num_classes'] = self.num_classes
extra_info['epoch'] = epoch
extra_info['lr_state'] = lr_state_dict
+ if best_metric is not None:
+ extra_info['best_metric'] = best_metric
with open(config_file, 'w') as f:
json.dump(extra_info, f)
@@ -117,7 +128,25 @@ def load(self,
assert os.path.exists(self.checkpoint_dir)
checkpoint_dir = os.path.abspath(self.checkpoint_dir)
-
+
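+        # record the dtypes the current model and optimizer expect, so tensors
+        # loaded from disk can be cast back (e.g. an FP32 checkpoint restored
+        # into an FP16 run)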
+ type_dict = {}
+ for name, param in backbone.state_dict().items():
+ type_dict[param.name] = convert_dtype(param.dtype)
+
+ if classifier is not None:
+            # dist params exist on every rank, so record their dtypes too
+ for name, param in classifier.state_dict().items():
+ type_dict[param.name] = convert_dtype(param.dtype)
+
+ if for_train:
+ assert optimizer is not None
+ opt_state_dict = optimizer.state_dict()
+ lr_state_dict = opt_state_dict['LR_Scheduler']
+ for name, opt in opt_state_dict.items():
+ if name == 'LR_Scheduler' or '@GRAD' in name:
+ continue
+ type_dict[name] = convert_dtype(opt.dtype)
+
param_state_dict = {}
opt_state_dict = {}
dist_param_state_dict = {}
@@ -139,11 +168,16 @@ def load(self,
if not for_train and ext == '.pdopt':
continue
+
+ if classifier is None and 'dist@' in name and '@rank@' in name:
+ continue
tensor = paddle.load(path, return_numpy=True)
if dtype:
assert dtype in ['float32', 'float16']
tensor = tensor.astype(dtype)
+ else:
+ tensor = tensor.astype(type_dict[name])
if 'dist@' in name and '@rank@' in name:
if '.w' in name and 'velocity' not in name:
diff --git a/dynamic/utils/verification.py b/dynamic/utils/verification.py
index bafda1eab0dc0..bd9f78e3d3300 100644
--- a/dynamic/utils/verification.py
+++ b/dynamic/utils/verification.py
@@ -80,6 +80,7 @@ class CallBackVerification(object):
def __init__(self,
frequent,
rank,
+ world_size,
batch_size,
val_targets,
rec_prefix,
@@ -87,6 +88,7 @@ def __init__(self,
image_size=(112, 112)):
self.frequent: int = frequent
self.rank: int = rank
+ self.world_size: int = world_size
self.batch_size: int = batch_size
self.fp16 = fp16
self.highest_acc_list: List[float] = [0.0] * len(val_targets)
@@ -98,6 +100,7 @@ def __init__(self,
image_size=image_size)
def ver_test(self, backbone: paddle.nn.Layer, global_step: int):
+ best_metric = {}
for i in range(len(self.ver_list)):
test_start = time.time()
acc1, std1, acc2, std2, xnorm, embeddings_list = test(
@@ -110,12 +113,27 @@ def ver_test(self, backbone: paddle.nn.Layer, global_step: int):
(self.ver_name_list[i], global_step, xnorm))
logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
(self.ver_name_list[i], global_step, acc2, std2))
- if acc2 > self.highest_acc_list[i]:
- self.highest_acc_list[i] = acc2
+ if self.world_size > 1:
+ max_acc_tensor = paddle.to_tensor(acc2, dtype='float64')
+ paddle.distributed.all_reduce(max_acc_tensor, paddle.distributed.ReduceOp.MAX)
+ max_acc = max_acc_tensor.item()
+ else:
+ max_acc = acc2
+ if max_acc > self.highest_acc_list[i]:
+ self.highest_acc_list[i] = max_acc
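+            # only the rank(s) whose local acc2 equals the all-reduced global
+            # maximum record a best_metric entry, so the best checkpoint for
+            # this dataset is saved by the rank that actually achieved it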
+ if abs(max_acc - acc2) < 1e-8:
+ best_metric[self.ver_name_list[i]] = {
+ 'global_step': global_step,
+ 'acc2': acc2,
+ 'rank': self.rank,
+ 'dataset_name': self.ver_name_list[i],
+ }
+
logging.info('[%s][%d]Accuracy-Highest: %1.5f' % (
self.ver_name_list[i], global_step, self.highest_acc_list[i]))
test_end = time.time()
logging.info("test time: {:.4f}".format(test_end - test_start))
+ return best_metric
def init_dataset(self, val_targets, data_dir, image_size):
for name in val_targets:
@@ -129,5 +147,7 @@ def __call__(self, num_update, backbone: paddle.nn.Layer):
if num_update > 0 and num_update % self.frequent == 0:
backbone.eval()
with paddle.no_grad():
- self.ver_test(backbone, num_update)
+ best_metric = self.ver_test(backbone, num_update)
backbone.train()
+ return best_metric
+ return None
diff --git a/dynamic/validation.py b/dynamic/validation.py
index 52dc9bd875da5..330df0d17a4f3 100644
--- a/dynamic/validation.py
+++ b/dynamic/validation.py
@@ -30,11 +30,11 @@ def validation(args):
checkpoint_dir=args.checkpoint_dir, )
backbone = eval("backbones.{}".format(args.backbone))(
- num_features=args.embedding_size)
+ num_features=args.embedding_size, dropout=0.0, data_format="NHWC")
checkpoint.load(backbone, for_train=False)
backbone.eval()
callback_verification = CallBackVerification(
- 1, 0, args.batch_size, args.val_targets, args.data_dir)
+ 1, 0, 1, args.batch_size, args.val_targets, args.data_dir)
callback_verification(1, backbone)
diff --git a/experiments/images/friends2.jpg b/experiments/images/friends2.jpg
index f5dad20a7fcf9..5a2ba40ed2b5a 100644
Binary files a/experiments/images/friends2.jpg and b/experiments/images/friends2.jpg differ
diff --git a/experiments/images/throughtput.png b/experiments/images/throughtput.png
index 8d81dee589563..f94f6092a7786 100644
Binary files a/experiments/images/throughtput.png and b/experiments/images/throughtput.png differ
diff --git a/requirement.txt b/requirement.txt
index 25de974effc99..17669c34bf1f3 100644
--- a/requirement.txt
+++ b/requirement.txt
@@ -1,4 +1,3 @@
-visualdl
opencv-python
pillow
numpy
@@ -14,3 +13,4 @@ opencv-python==4.4.0.46
onnxruntime
onnx
paddle2onnx
+paddlepaddle-gpu>=2.2.2
diff --git a/scripts/find_maximum_classes_dynamic.sh b/scripts/find_maximum_classes_dynamic.sh
index 53cd9d1f1591b..8628fd6d4b1e7 100644
--- a/scripts/find_maximum_classes_dynamic.sh
+++ b/scripts/find_maximum_classes_dynamic.sh
@@ -13,6 +13,7 @@
# limitations under the License.
export FLAGS_allocator_strategy=naive_best_fit
+export FLAGS_fraction_of_gpu_memory_to_use=0.9999
python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \
--config_file configs/ms1mv3_r50.py \
--is_static False \
@@ -22,7 +23,7 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \
--sample_ratio 0.1 \
--loss ArcFace \
--batch_size 64 \
- --num_classes 67000000 \
+ --num_classes 87000000 \
--use_synthetic_dataset True \
--do_validation_while_train False \
--log_interval_step 1 \
diff --git a/scripts/find_maximum_classes_static.sh b/scripts/find_maximum_classes_static.sh
index 4bc72f58fbf1e..e3fc91014f1c0 100644
--- a/scripts/find_maximum_classes_static.sh
+++ b/scripts/find_maximum_classes_static.sh
@@ -13,6 +13,7 @@
# limitations under the License.
export FLAGS_allocator_strategy=naive_best_fit
+export FLAGS_fraction_of_gpu_memory_to_use=0.999
python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \
--config_file configs/ms1mv3_r50.py \
--is_static True \
@@ -22,10 +23,11 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \
--sample_ratio 0.1 \
--loss ArcFace \
--batch_size 64 \
- --num_classes 60000000 \
+ --num_classes 92000000 \
--use_synthetic_dataset True \
--do_validation_while_train False \
--log_interval_step 1 \
--fp16 True \
--lsc_init_from_numpy False \
--output fp16_arcface_static_0.1_maximum_classes
+
diff --git a/scripts/perf_runner.sh b/scripts/perf_runner.sh
index 267c8d3cb7982..002a5e703f982 100644
--- a/scripts/perf_runner.sh
+++ b/scripts/perf_runner.sh
@@ -32,9 +32,7 @@ fi
if [ $dtype = "fp16" ]; then
fp16=True
- data_format=NCHW
-# TODO(GuoxiaWang): remove NCHW when PRelu support NHWC
-# data_format=NHWC
+ data_format=NHWC
else
fp16=False
data_format=NCHW
diff --git a/scripts/train_dynamic.sh b/scripts/train_dynamic.sh
index a623945992f51..dc4b502b730d0 100644
--- a/scripts/train_dynamic.sh
+++ b/scripts/train_dynamic.sh
@@ -12,7 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \
+# multi-node IP list, e.g.:
+# TRAINER_IP_LIST=10.11.12.1,10.11.12.2
+TRAINER_IP_LIST=127.0.0.1
+# to train on other GPUs, e.g.:
+# CUDA_VISIBLE_DEVICES=2
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+python -m paddle.distributed.launch --ips=$TRAINER_IP_LIST --gpus=$CUDA_VISIBLE_DEVICES tools/train.py \
--config_file configs/ms1mv3_r50.py \
--is_static False \
--backbone FresResNet50 \
@@ -38,4 +44,4 @@ python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \
--warmup_num 0 \
--train_num 25 \
--decay_boundaries "10,16,22" \
- --output MS1M_v3_arcface_dynamic_0.1
+ --output MS1M_v3_arcface_dynamic_0.1_NHWC_FP16
diff --git a/scripts/train_static.sh b/scripts/train_static.sh
index 52ca2be1eb12d..4c027097b0c96 100644
--- a/scripts/train_static.sh
+++ b/scripts/train_static.sh
@@ -12,7 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 tools/train.py \
+# multi-node IP list, e.g.:
+# TRAINER_IP_LIST=10.11.12.1,10.11.12.2
+TRAINER_IP_LIST=127.0.0.1
+# to train on other GPUs, e.g.:
+# CUDA_VISIBLE_DEVICES=2
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+python -m paddle.distributed.launch --ips=$TRAINER_IP_LIST --gpus=$CUDA_VISIBLE_DEVICES tools/train.py \
--config_file configs/ms1mv3_r50.py \
--is_static True \
--backbone FresResNet50 \
diff --git a/scripts/validation_dynamic.sh b/scripts/validation_dynamic.sh
index 3b635c5f05896..c927e28290137 100644
--- a/scripts/validation_dynamic.sh
+++ b/scripts/validation_dynamic.sh
@@ -16,7 +16,7 @@ python tools/validation.py \
--is_static False \
--backbone FresResNet50 \
--embedding_size 512 \
- --checkpoint_dir MS1M_v3_arcface_dynamic_128_fp16_0.1/FresResNet50/24 \
+    --checkpoint_dir MS1M_v3_arcface_dynamic_0.1_NHWC_FP16/FresResNet50/best_model/cfp_fp \
--data_dir MS1M_v3/ \
--val_targets lfw,cfp_fp,agedb_30 \
--batch_size 128
diff --git a/scripts/validation_static.sh b/scripts/validation_static.sh
index 64227f7debd86..1f7a586be5cdc 100644
--- a/scripts/validation_static.sh
+++ b/scripts/validation_static.sh
@@ -16,7 +16,7 @@ python tools/validation.py \
--is_static True \
--backbone FresResNet50 \
--embedding_size 512 \
- --checkpoint_dir MS1M_v3_arcface_static_128_fp16_0.1/FresResNet50/24 \
+ --checkpoint_dir MS1M_v3_arcface_static_0.1_NHWC_FP16/FresResNet50/best_model/cfp_fp/ \
--data_dir MS1M_v3/ \
--val_targets lfw,cfp_fp,agedb_30 \
--batch_size 128
diff --git a/static/backbones/iresnet.py b/static/backbones/iresnet.py
index 7396c52e708ed..02aaaf1164e36 100644
--- a/static/backbones/iresnet.py
+++ b/static/backbones/iresnet.py
@@ -86,10 +86,10 @@ def __init__(self,
momentum=0.9,
data_layout=data_format,
is_test=False if is_train else True)
- # TODO(GuoxiaWang): add data_format attr
input_blob = paddle.static.nn.prelu(
input_blob,
mode="channel",
+ data_format=data_format,
param_attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Constant(0.25)))
@@ -98,8 +98,12 @@ def __init__(self,
input_blob = self.residual_unit_v3(
input_blob, filter_list[i + 1], 3, 2
if j == 0 else 1, 1, is_train, data_format)
- fc1 = self.get_fc1(input_blob, is_train, dropout, data_format)
+ if data_format == 'NHWC':
+ input_blob = paddle.tensor.transpose(input_blob, [0, 3, 1, 2])
+ #NOTE(GuoxiaWang): don't use NHWC for last fc,
+ # thus we can train using NHWC and test using NCHW
+ fc1 = self.get_fc1(input_blob, is_train, dropout, data_format="NCHW")
self.output_dict['feature'] = fc1
def residual_unit_v3(self,
@@ -135,10 +139,10 @@ def residual_unit_v3(self,
momentum=0.9,
data_layout=data_format,
is_test=False if is_train else True)
- # TODO(GuoxiaWang): add data_format attr
prelu = paddle.static.nn.prelu(
bn2,
mode="channel",
+ data_format=data_format,
param_attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Constant(0.25)))
conv2 = paddle.static.nn.conv2d(
diff --git a/static/train.py b/static/train.py
index 98031df3d5ae4..296aae415458a 100644
--- a/static/train.py
+++ b/static/train.py
@@ -17,6 +17,7 @@
import sys
import numpy as np
import logging
+import random
import paddle
from visualdl import LogWriter
@@ -32,31 +33,39 @@
from . import backbones
from .static_model import StaticModel
-RELATED_FLAGS_SETTING = {
- 'FLAGS_cudnn_exhaustive_search': 1,
- 'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
- 'FLAGS_max_inplace_grad_add': 8,
- 'FLAGS_fraction_of_gpu_memory_to_use': 0.9999,
-}
-paddle.fluid.set_flags(RELATED_FLAGS_SETTING)
-
def train(args):
- writer = LogWriter(logdir=args.logdir)
-
rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
place = paddle.CUDAPlace(gpu_id)
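+    # NOTE: seed == 0 turns on deterministic cuDNN kernels and disables data
+    # loader workers for reproducibility; other int values seed the RNGs with
+    # a per-rank offset, and None leaves the seed unfixed.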
+ RELATED_FLAGS_SETTING = {}
+ if args.seed == 0:
+ RELATED_FLAGS_SETTING['FLAGS_cudnn_deterministic'] = 1
+ RELATED_FLAGS_SETTING['FLAGS_benchmark'] = 1
+ args.num_workers = 0
+ else:
+ # args.seed == None or args.seed != 0
+ RELATED_FLAGS_SETTING['FLAGS_cudnn_exhaustive_search'] = 1
+ RELATED_FLAGS_SETTING['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1
+ RELATED_FLAGS_SETTING['FLAGS_max_inplace_grad_add'] = 8
+ paddle.fluid.set_flags(RELATED_FLAGS_SETTING)
+
+ if args.seed is not None:
+ args.seed = args.seed + rank
+ paddle.seed(args.seed)
+ np.random.seed(args.seed)
+ random.seed(args.seed)
+
if world_size > 1:
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.without_graph_optimization = True
fleet.init(is_collective=True, strategy=strategy)
-
+
if args.use_synthetic_dataset:
trainset = datasets.SyntheticDataset(args.num_classes, fp16=args.fp16)
else:
@@ -131,9 +140,9 @@ def train(args):
'custom_black_list': args.custom_black_list,
},
margin_loss_params=margin_loss_params,
- data_format=args.data_format,
+ data_format=args.data_format,
lsc_init_from_numpy=args.lsc_init_from_numpy, )
-
+
if rank == 0:
with open(os.path.join(args.output, 'main_program.txt'), 'w') as f:
f.write(str(train_program))
@@ -150,14 +159,15 @@ def train(args):
data_format=args.data_format, )
callback_verification = CallBackVerification(
- args.validation_interval_step, rank, args.batch_size, test_program,
+ args.validation_interval_step, rank, world_size, args.batch_size,
+ test_program,
list(test_model.backbone.input_dict.values()),
list(test_model.backbone.output_dict.values()), args.val_targets,
args.data_dir)
callback_logging = CallBackLogging(args.log_interval_step, rank,
world_size, total_steps,
- args.batch_size, writer)
+ args.batch_size)
checkpoint = Checkpoint(
rank=rank,
world_size=world_size,
@@ -181,7 +191,7 @@ def train(args):
# since we always use step style for lr_scheduler
global_step = lr_state['last_epoch']
train_model.lr_scheduler.set_state_dict(lr_state)
-
+
batch_sampler = eval("paddle.io.{}".format(args.batch_sampler))(
dataset=trainset,
batch_size=args.batch_size,
@@ -210,7 +220,16 @@ def train(args):
lr_value = train_model.optimizer.get_lr()
callback_logging(global_step, loss_avg, epoch, lr_value)
if args.do_validation_while_train:
- callback_verification(global_step)
+ best_metric = callback_verification(global_step)
+ if best_metric is not None and len(best_metric) > 0:
+ for ver_dataset in best_metric:
+ checkpoint.save(
+ train_program,
+ lr_scheduler=train_model.lr_scheduler,
+ epoch=epoch,
+ for_train=True,
+ best_metric=best_metric[ver_dataset])
+
train_model.lr_scheduler.step()
if global_step >= total_steps:
@@ -222,4 +241,3 @@ def train(args):
lr_scheduler=train_model.lr_scheduler,
epoch=epoch,
for_train=True)
- writer.close()
diff --git a/static/utils/io.py b/static/utils/io.py
index acc98ce38109a..1dbff6118832a 100644
--- a/static/utils/io.py
+++ b/static/utils/io.py
@@ -19,6 +19,8 @@
import numpy as np
import shutil
import json
+
+from paddle.fluid.data_feeder import convert_dtype
from utils.rearrange_weight import rearrange_weight
@@ -40,8 +42,21 @@ def __init__(self,
self.checkpoint_dir: str = checkpoint_dir
self.max_num_last_checkpoint: int = max_num_last_checkpoint
- def save(self, program, lr_scheduler=None, epoch=0, for_train=True):
- model_save_dir = os.path.join(self.model_save_dir, str(epoch))
+ def save(self,
+ program,
+ lr_scheduler=None,
+ epoch=0,
+ for_train=True,
+ best_metric=None):
+
+ if best_metric is not None:
+ save_rank = best_metric['rank']
+ model_save_dir = os.path.join(self.model_save_dir, 'best_model',
+ str(best_metric['dataset_name']))
+ else:
+            save_rank = 0  # by default, only rank 0 saves the backbone
+ model_save_dir = os.path.join(self.model_save_dir, str(epoch))
+
if not os.path.exists(model_save_dir):
# may be more than one processes trying
# to create the directory
@@ -56,7 +71,7 @@ def save(self, program, lr_scheduler=None, epoch=0, for_train=True):
for name, param in param_state_dict.items():
# for non dist param, we only save their at rank 0,
# but for dist param, we need to save their at all ranks.
- if 'dist@' in name and '@rank@' in name or self.rank == 0:
+ if 'dist@' in name and '@rank@' in name or self.rank == save_rank:
paddle.save(param,
os.path.join(model_save_dir, name + '.pdparam'))
@@ -71,7 +86,7 @@ def save(self, program, lr_scheduler=None, epoch=0, for_train=True):
paddle.save(opt,
os.path.join(model_save_dir, name + '.pdopt'))
- if self.rank == 0:
+ if self.rank == save_rank:
# save some extra info for resume
# pretrain_world_size, embedding_size, num_classes are used for
# re-split fc weight when gpu setting changed.
@@ -83,6 +98,8 @@ def save(self, program, lr_scheduler=None, epoch=0, for_train=True):
extra_info['num_classes'] = self.num_classes
extra_info['epoch'] = epoch
extra_info['lr_state'] = lr_scheduler.state_dict()
+ if best_metric is not None:
+ extra_info['best_metric'] = best_metric
with open(config_file, 'w') as f:
json.dump(extra_info, f)
@@ -98,6 +115,17 @@ def load(self, program, for_train=True, dtype=None):
assert os.path.exists(self.checkpoint_dir)
checkpoint_dir = os.path.abspath(self.checkpoint_dir)
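+        # record the expected dtype and shape of every parameter and optimizer
+        # state, so loaded tensors can be cast back and PReLU weights can be
+        # transposed between NHWC [1, 1, 1, C] and NCHW [1, C, 1, 1]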
+ param_state_dict = program.state_dict(mode='param')
+ opt_state_dict = program.state_dict(mode='opt')
+ type_dict = {}
+ shape_dict = {}
+ for name, param in param_state_dict.items():
+ type_dict[name] = convert_dtype(param._dtype())
+ shape_dict[name] = param.shape()
+ for name, opt in opt_state_dict.items():
+ type_dict[name] = convert_dtype(opt._dtype())
+ shape_dict[name] = opt.shape()
+
state_dict = {}
dist_weight_state_dict = {}
dist_weight_velocity_state_dict = {}
@@ -117,10 +145,30 @@ def load(self, program, for_train=True, dtype=None):
if not for_train and ext == '.pdopt':
continue
+ if name not in type_dict:
+ continue
+
tensor = paddle.load(path, return_numpy=True)
if dtype:
assert dtype in ['float32', 'float16']
tensor = tensor.astype(dtype)
+                else:
+                    tensor = tensor.astype(type_dict[name])
+
+ if list(shape_dict[name]) != list(tensor.shape):
+ # for prelu NHWC[1, 1, 1, C] and NCHW [1, C, 1, 1]
+ expect_shape = list(shape_dict[name])
+ actual_shape = list(tensor.shape)
+ if len(expect_shape) == len(actual_shape) and \
+ expect_shape[0] == actual_shape[0] and expect_shape[0] == 1 and \
+ expect_shape[2] == actual_shape[2] and expect_shape[2] == 1 and \
+ expect_shape[1] == actual_shape[3]:
+ if actual_shape[3] != 1:
+ tensor = tensor.transpose([0, 3, 1, 2])
+ elif actual_shape[1] != 1:
+ tensor = tensor.transpose([0, 2, 3, 1])
if 'dist@' in name and '@rank@' in name:
if '.w' in name and 'velocity' not in name:
diff --git a/static/utils/verification.py b/static/utils/verification.py
index 3040d2cd74e0a..c0ce0bb490561 100644
--- a/static/utils/verification.py
+++ b/static/utils/verification.py
@@ -66,11 +66,12 @@ def test(rank, batch_size, data_set, executor, test_program, data_feeder,
acc, std = np.mean(accuracy), np.std(accuracy)
return acc, std, xnorm
-
+# Ref: https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/utils/utils_callbacks.py
class CallBackVerification(object):
def __init__(self,
frequent,
rank,
+ world_size,
batch_size,
test_program,
feed_list,
@@ -80,12 +81,13 @@ def __init__(self,
image_size=(112, 112)):
self.frequent: int = frequent
self.rank: int = rank
+ self.world_size: int = world_size
self.batch_size: int = batch_size
self.test_program: paddle.static.Program = test_program
self.feed_list: List[paddle.fluid.framework.Variable] = feed_list
self.fetch_list: List[paddle.fluid.framework.Variable] = fetch_list
-
+
self.highest_acc_list: List[float] = [0.0] * len(val_targets)
self.ver_list: List[object] = []
self.ver_name_list: List[str] = []
@@ -95,12 +97,19 @@ def __init__(self,
image_size=image_size)
gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
- place = paddle.CUDAPlace(gpu_id)
- self.executor = paddle.static.Executor(place)
+ self.place = paddle.CUDAPlace(gpu_id)
+ self.executor = paddle.static.Executor(self.place)
self.data_feeder = paddle.fluid.DataFeeder(
- place=place, feed_list=self.feed_list, program=self.test_program)
+ place=self.place, feed_list=self.feed_list, program=self.test_program)
+
+ if self.world_size > 1:
+ self.test_scope = paddle.static.Scope()
+ with paddle.fluid.scope_guard(self.test_scope):
+ self.max_acc_var = paddle.static.create_global_var(shape=[1], value=0.0, dtype='float64', persistable=False, name='acc2')
+ self.executor.run(paddle.static.default_startup_program())
def ver_test(self, global_step: int):
+ best_metric = {}
for i in range(len(self.ver_list)):
test_start = time.time()
acc2, std2, xnorm = test(
@@ -110,12 +119,30 @@ def ver_test(self, global_step: int):
(self.ver_name_list[i], global_step, xnorm))
logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
(self.ver_name_list[i], global_step, acc2, std2))
- if acc2 > self.highest_acc_list[i]:
- self.highest_acc_list[i] = acc2
+ if self.world_size > 1:
+ with paddle.fluid.scope_guard(self.test_scope):
+ max_acc_tensor = self.max_acc_var.get_value()
+ max_acc_tensor.set(np.array([acc2], dtype='float64'), self.place)
+ paddle.distributed.all_reduce(self.max_acc_var, paddle.distributed.ReduceOp.MAX)
+                    max_acc = np.array(max_acc_tensor).item()
+ else:
+ max_acc = acc2
+
+ if max_acc > self.highest_acc_list[i]:
+ self.highest_acc_list[i] = max_acc
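+            # the rank whose local acc2 matches the all-reduced maximum records
+            # a best_metric entry and will save the corresponding checkpoint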
+ if abs(max_acc - acc2) < 1e-8:
+ best_metric[self.ver_name_list[i]] = {
+ 'global_step': global_step,
+ 'acc2': acc2,
+ 'rank': self.rank,
+ 'dataset_name': self.ver_name_list[i],
+ }
+
logging.info('[%s][%d]Accuracy-Highest: %1.5f' % (
self.ver_name_list[i], global_step, self.highest_acc_list[i]))
test_end = time.time()
logging.info("test time: {:.4f}".format(test_end - test_start))
+ return best_metric
def init_dataset(self, val_targets, data_dir, image_size):
for name in val_targets:
@@ -127,4 +154,6 @@ def init_dataset(self, val_targets, data_dir, image_size):
def __call__(self, num_update):
if num_update > 0 and num_update % self.frequent == 0:
- self.ver_test(num_update)
+ best_metric = self.ver_test(num_update)
+ return best_metric
+ return None
diff --git a/static/validation.py b/static/validation.py
index 0ab6ac661d2b2..b8b38ec4be4d2 100644
--- a/static/validation.py
+++ b/static/validation.py
@@ -50,7 +50,7 @@ def validation(args):
checkpoint.load(program=test_program, for_train=False)
callback_verification = CallBackVerification(
- 1, 0, args.batch_size, test_program,
+ 1, 0, 1, args.batch_size, test_program,
list(test_model.backbone.input_dict.values()),
list(test_model.backbone.output_dict.values()), args.val_targets,
args.data_dir)
diff --git a/tools/test_recognition.py b/tools/test_recognition.py
index b64ac6fdf0051..ef14c28554ae0 100644
--- a/tools/test_recognition.py
+++ b/tools/test_recognition.py
@@ -20,6 +20,7 @@
import pickle
import tarfile
from functools import partial
+from collections import defaultdict
import cv2
import numpy as np
@@ -108,8 +109,8 @@ def str2bool(v):
parser.add_argument(
"--rec_thresh",
type=float,
- default=0.45,
- help="The threshold of recognition postprocess. Default by 0.45.")
+ default=0.35,
+ help="The threshold of recognition postprocess. Default by 0.35.")
parser.add_argument(
"--max_batch_size",
type=int,
@@ -141,7 +142,7 @@ def print_config(args):
table.add_row([param, args[param]])
width = len(str(table).split("\n")[0])
print("{}".format("-" * width))
- print("PaddleFace".center(width))
+ print("PLSC".center(width))
print(table)
print("Powered by PaddlePaddle!".rjust(width))
print("{}".format("-" * width))
@@ -473,23 +474,30 @@ def postprocess(self):
pass
def retrieval(self, np_feature):
- labels = []
+ id_score_list = []
for feature in np_feature:
similarity = cosine_similarity(self.index_feature,
feature).squeeze()
abs_similarity = np.abs(similarity)
candidate_idx = np.argpartition(abs_similarity,
-self.cdd_num)[-self.cdd_num:]
+
remove_idx = np.where(abs_similarity[candidate_idx] < self.thresh)
candidate_idx = np.delete(candidate_idx, remove_idx)
candidate_label_list = list(np.array(self.label)[candidate_idx])
+ candidate_score_list = abs_similarity[candidate_idx]
+ candidate_score_dict = defaultdict(list)
+ for lb, score in zip(candidate_label_list, candidate_score_list):
+ candidate_score_dict[lb].append(score)
if len(candidate_label_list) == 0:
- maxlabel = ""
+ maxlabel = "unknown"
+ maxscore = -1.0
else:
maxlabel = max(candidate_label_list,
key=candidate_label_list.count)
- labels.append(maxlabel)
- return labels
+ maxscore = max(candidate_score_dict[maxlabel])
+ id_score_list.append((maxlabel, maxscore))
+ return id_score_list
def load_index(self, file_path):
with open(file_path, "rb") as f:
@@ -549,14 +557,14 @@ def preprocess(self, img):
img = img.astype(np.float32, copy=False)
return img
- def draw(self, img, box_list, labels):
- self.color_map.update(labels)
+ def draw(self, img, box_list, id_score_list):
+ self.color_map.update([id for id, score in id_score_list])
im = Image.fromarray(img)
draw = ImageDraw.Draw(im)
for i, dt in enumerate(box_list):
bbox, score = dt[2:], dt[1]
- label = labels[i]
+ label, idscore = id_score_list[i]
color = tuple(self.color_map[label])
xmin, ymin, xmax, ymax = bbox
@@ -564,19 +572,31 @@ def draw(self, img, box_list, labels):
font_size = max(int((xmax - xmin) // 6), 10)
font = ImageFont.truetype(self.font_path, font_size)
- text = "{} {:.4f}".format(label, score)
+ face_text = "{} {:.4f}".format('face', score)
th = sum(font.getmetrics())
- tw = font.getsize(text)[0]
+ tw = font.getsize(face_text)[0]
start_y = max(0, ymin - th)
+ id_text = "{} {:.4f}".format(label, idscore)
+ tw = max(tw, font.getsize(id_text)[0])
+ draw.rectangle(
+ [(xmin, start_y - th), (xmin + tw + 1, start_y)], fill=color)
+ draw.text(
+ (xmin + 1, start_y - th),
+ id_text,
+ fill=(255, 255, 255),
+ font=font,
+ anchor="la")
+
draw.rectangle(
[(xmin, start_y), (xmin + tw + 1, start_y + th)], fill=color)
draw.text(
(xmin + 1, start_y),
- text,
+ face_text,
fill=(255, 255, 255),
font=font,
anchor="la")
+
draw.rectangle(
[(xmin, ymin), (xmax, ymax)], width=2, outline=color)
return np.array(im)
@@ -631,18 +651,20 @@ def predict(self, input_data, print_info=False):
continue
box_list, np_feature = self.predict_np_img(img)
if np_feature is not None:
- labels = self.rec_predictor.retrieval(np_feature)
+ id_score_list = self.rec_predictor.retrieval(np_feature)
else:
- labels = ["face"] * len(box_list)
+ id_score_list = [("unknown", -1.0)] * len(box_list)
if box_list is not None:
- result = self.draw(img, box_list, labels=labels)
+ result = self.draw(img, box_list, id_score_list=id_score_list)
self.output_writer.write(result, file_name)
if print_info:
- logging.info(f"File: {file_name}, predict label(s): {labels}")
+ logging.info(
+                f"File: {file_name}, predicted (label, score) pairs: {id_score_list}"
+ )
yield {
"box_list": box_list,
"features": np_feature,
- "labels": labels
+ "id_score_list": id_score_list
}
logging.info(f"Predict complete!")