From da29c3fab6ad6c517ec642d005a2a5464b0a8ba9 Mon Sep 17 00:00:00 2001 From: Topdu <784990967@qq.com> Date: Wed, 3 Jul 2024 23:32:36 +0800 Subject: [PATCH 1/3] Updated Recognition Competition Model Link --- configs/rec/SVTRv2/rec_repsvtr_ch.yml | 134 +++++++++++ configs/rec/SVTRv2/rec_svtrv2_ch.yml | 143 ++++++++++++ .../rec/SVTRv2/rec_svtrv2_ch_distillation.yml | 208 ++++++++++++++++++ doc/doc_ch/algorithm_rec_svtrv2.md | 9 +- 4 files changed, 493 insertions(+), 1 deletion(-) create mode 100644 configs/rec/SVTRv2/rec_repsvtr_ch.yml create mode 100644 configs/rec/SVTRv2/rec_svtrv2_ch.yml create mode 100644 configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml diff --git a/configs/rec/SVTRv2/rec_repsvtr_ch.yml b/configs/rec/SVTRv2/rec_repsvtr_ch.yml new file mode 100644 index 0000000000..4fd643bc98 --- /dev/null +++ b/configs/rec/SVTRv2/rec_repsvtr_ch.yml @@ -0,0 +1,134 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_repsvtr_ch + save_epoch_step: 10 + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_repsvtr.txt + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1.e-8 + weight_decay: 0.025 + no_weight_decay_name: norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.001 # 8gpus 192bs + warmup_epoch: 5 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: RepSVTR + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + num_decoder_layers: 2 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/SVTRv2/rec_svtrv2_ch.yml b/configs/rec/SVTRv2/rec_svtrv2_ch.yml new file mode 100644 index 0000000000..70efe10934 --- /dev/null +++ b/configs/rec/SVTRv2/rec_svtrv2_ch.yml @@ -0,0 +1,143 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_svtrv2_ch + save_epoch_step: 10 + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_svrtv2.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1.e-8 + weight_decay: 0.05 + no_weight_decay_name: norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.001 # 8gpus 192bs + warmup_epoch: 5 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: SVTRv2 + use_pos_embed: False + dims: [128, 256, 384] + depths: [6, 6, 6] + num_heads: [4, 8, 12] + mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','Global','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']] + local_k: [[5, 5], [5, 5], [-1, -1]] + sub_k: [[2, 1], [2, 1], [-1, -1]] + last_stage: False + use_pool: True + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + num_decoder_layers: 2 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml b/configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml new file mode 100644 index 0000000000..071070ed83 --- /dev/null +++ b/configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml @@ -0,0 +1,208 @@ +Global: + debug: false + use_gpu: true + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_svtrv2_ch_distill_lr00002/ + save_epoch_step: 5 + eval_batch_step: + - 0 + - 1000 + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_svtrv2_ch_distill.txt +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 1.e-8 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed patch_embed downsample + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.0002 # 8gpus 192bs + warmup_epoch: 5 +Architecture: + model_type: rec + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: ./output/rec_svtrv2_ch/best_accuracy + freeze_params: true + return_all_feats: true + model_type: rec + algorithm: SVTR_LCNet + Transform: null + Backbone: + name: SVTRv2 + use_pos_embed: False + dims: [128, 256, 384] + depths: [6, 6, 6] + num_heads: [4, 8, 12] + mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','Global','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']] + local_k: [[5, 5], [5, 5], [-1, -1]] + sub_k: [[2, 1], [2, 1], [-1, -1]] + last_stage: False + use_pool: True + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + num_decoder_layers: 2 + max_text_length: *max_text_length + Student: + pretrained: ./output/rec_repsvtr_ch/best_accuracy + freeze_params: false + return_all_feats: true + model_type: rec + algorithm: SVTR_LCNet + Transform: null + Backbone: + name: RepSVTR + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + num_decoder_layers: 2 + max_text_length: *max_text_length +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDKDLoss: + weight: 0.1 + model_name_pairs: + - - Student + - Teacher + key: head_out + multi_head: true + alpha: 1.0 + beta: 2.0 + dis_head: gtc + name: dkd + - DistillationCTCLoss: + weight: 1.0 + model_name_list: + - Student + key: head_out + multi_head: true + - DistillationNRTRLoss: + weight: 1.0 + smoothing: false + model_name_list: + - Student + key: head_out + multi_head: true + - DistillCTCLogits: + weight: 1.0 + reduction: mean + model_name_pairs: + - - Student + - Teacher + key: head_out +PostProcess: + name: DistillationCTCLabelDecode + model_name: + - Student + key: head_out + multi_head: true +Metric: + name: DistillationMetric + base_metric_name: RecMetric + main_indicator: acc + key: Student + + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/doc/doc_ch/algorithm_rec_svtrv2.md b/doc/doc_ch/algorithm_rec_svtrv2.md index a508b4f02c..bcdec6308a 100644 --- a/doc/doc_ch/algorithm_rec_svtrv2.md +++ b/doc/doc_ch/algorithm_rec_svtrv2.md @@ -19,8 +19,15 @@ ### SVTRv2算法简介 -[PaddleOCR 算法模型挑战赛 - 赛题一:OCR 端到端识别任务](https://aistudio.baidu.com/competition/detail/1131/0/introduction)排行榜第一算法。主要思路:1、检测和识别模型的Backbone升级为RepSVTR;2、识别教师模型升级为SVTRv2,可识别长文本。 +🔥 该算法由来自复旦大学视觉与学习实验室([FVL](https://fvl.fudan.edu.cn))的[OpenOCR](https://github.com/Topdu/OpenOCR)团队研发,其在[PaddleOCR 算法模型挑战赛 - 赛题一:OCR 端到端识别任务](https://aistudio.baidu.com/competition/detail/1131/0/introduction)中荣获一等奖,B榜端到端识别精度相比PP-OCRv4提升2.5%,推理速度持平。主要思路:1、检测和识别模型的Backbone升级为RepSVTR;2、识别教师模型升级为SVTRv2,可识别长文本。 +|模型|配置文件|端到端|下载链接| +| --- | --- | --- | --- | +|PP-OCRv4| |A榜 62.77% / B榜62.51%| [Model List](../../doc/doc_ch/models_list.md) | +|SVTRv2(Rec Sever)|[configs/rec/SVTRv2/rec_svtrv2_ch.yml](../../configs/rec/SVTRv2/rec_svtrv2_ch.yml)|A榜 68.81% (使用PP-OCRv4检测模型)| [训练模型]() / [推理模型]() | +|RepSVTR(Mobile)|[识别](../../configs/rec/SVTRv2/rec_repsvtr_ch.yml) / [检测](../../configs/det/det_repsvtr_db.yml)|B榜 65.07%| 识别: [训练模型]() / [推理模型]() 检测: [训练模型]() / [推理模型]() | + +🚀 快速使用:参考PP-OCR推理[说明文档](../../doc/doc_ch/inference_ppocr.md),将检测和识别模型替换为上表中RepSVTR或SVTRv2的对应推理模型即可使用。 ## 2. 环境配置 From 2f2dafc9e368f7998b93ace00e6b3b17299231f6 Mon Sep 17 00:00:00 2001 From: Topdu <784990967@qq.com> Date: Thu, 4 Jul 2024 11:58:39 +0800 Subject: [PATCH 2/3] Updated Recognition Competition Model Link --- doc/doc_ch/algorithm_rec_svtrv2.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/doc_ch/algorithm_rec_svtrv2.md b/doc/doc_ch/algorithm_rec_svtrv2.md index bcdec6308a..ea39453094 100644 --- a/doc/doc_ch/algorithm_rec_svtrv2.md +++ b/doc/doc_ch/algorithm_rec_svtrv2.md @@ -19,15 +19,15 @@ ### SVTRv2算法简介 -🔥 该算法由来自复旦大学视觉与学习实验室([FVL](https://fvl.fudan.edu.cn))的[OpenOCR](https://github.com/Topdu/OpenOCR)团队研发,其在[PaddleOCR 算法模型挑战赛 - 赛题一:OCR 端到端识别任务](https://aistudio.baidu.com/competition/detail/1131/0/introduction)中荣获一等奖,B榜端到端识别精度相比PP-OCRv4提升2.5%,推理速度持平。主要思路:1、检测和识别模型的Backbone升级为RepSVTR;2、识别教师模型升级为SVTRv2,可识别长文本。 +🔥 该算法由来自复旦大学视觉与学习实验室([FVL](https://fvl.fudan.edu.cn))的[OpenOCR](https://github.com/Topdu/OpenOCR)团队研发,其在[PaddleOCR算法模型挑战赛 - 赛题一:OCR端到端识别任务](https://aistudio.baidu.com/competition/detail/1131/0/introduction)中荣获一等奖,B榜端到端识别精度相比PP-OCRv4提升2.5%,推理速度持平。主要思路:1、检测和识别模型的Backbone升级为RepSVTR;2、识别教师模型升级为SVTRv2,可识别长文本。 |模型|配置文件|端到端|下载链接| | --- | --- | --- | --- | -|PP-OCRv4| |A榜 62.77% / B榜62.51%| [Model List](../../doc/doc_ch/models_list.md) | -|SVTRv2(Rec Sever)|[configs/rec/SVTRv2/rec_svtrv2_ch.yml](../../configs/rec/SVTRv2/rec_svtrv2_ch.yml)|A榜 68.81% (使用PP-OCRv4检测模型)| [训练模型]() / [推理模型]() | -|RepSVTR(Mobile)|[识别](../../configs/rec/SVTRv2/rec_repsvtr_ch.yml) / [检测](../../configs/det/det_repsvtr_db.yml)|B榜 65.07%| 识别: [训练模型]() / [推理模型]() 检测: [训练模型]() / [推理模型]() | +|PP-OCRv4| |A榜 62.77%
B榜 62.51%| [Model List](../../doc/doc_ch/models_list.md) | +|SVTRv2(Rec Sever)|[configs/rec/SVTRv2/rec_svtrv2_ch.yml](../../configs/rec/SVTRv2/rec_svtrv2_ch.yml)|A榜 68.81% (使用PP-OCRv4检测模型)| [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_ch_infer.tar) | +|RepSVTR(Mobile)|[识别](../../configs/rec/SVTRv2/rec_repsvtr_ch.yml)
[识别蒸馏](../../configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml)
[检测](../../configs/det/det_repsvtr_db.yml)|B榜 65.07%| 识别: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_repsvtr_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_repsvtr_ch_infer.tar)
识别蒸馏: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_distill_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_distill_ch_infer.tar)
检测: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_det_repsvtr_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_det_repsvtr_ch_infer.tar) | -🚀 快速使用:参考PP-OCR推理[说明文档](../../doc/doc_ch/inference_ppocr.md),将检测和识别模型替换为上表中RepSVTR或SVTRv2的对应推理模型即可使用。 +🚀 快速使用:参考PP-OCR推理[说明文档](../../doc/doc_ch/inference_ppocr.md),将检测和识别模型替换为上表中对应的RepSVTR或SVTRv2推理模型即可使用。 ## 2. 环境配置 @@ -122,7 +122,7 @@ Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104) ### 4.2 C++推理部署 -由于C++预处理后处理还未支持SVTRv2 +准备好推理模型后,参考[cpp infer](../../deploy/cpp_infer/)教程进行操作即可。 ### 4.3 Serving服务化部署 @@ -132,7 +132,7 @@ Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104) ### 4.4 更多推理部署 -暂不支持 +- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../deploy/paddle2onnx/)教程操作。 ## 5. FAQ From e392ec3cbfe518e06c9d32ff1805ff24b8796cd4 Mon Sep 17 00:00:00 2001 From: Topdu <784990967@qq.com> Date: Thu, 4 Jul 2024 13:26:22 +0800 Subject: [PATCH 3/3] Updated Recognition Competition Model Link --- doc/doc_ch/algorithm_rec_svtrv2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/doc_ch/algorithm_rec_svtrv2.md b/doc/doc_ch/algorithm_rec_svtrv2.md index ea39453094..1391cc790c 100644 --- a/doc/doc_ch/algorithm_rec_svtrv2.md +++ b/doc/doc_ch/algorithm_rec_svtrv2.md @@ -23,7 +23,7 @@ |模型|配置文件|端到端|下载链接| | --- | --- | --- | --- | -|PP-OCRv4| |A榜 62.77%
B榜 62.51%| [Model List](../../doc/doc_ch/models_list.md) | +|PP-OCRv4| |A榜 62.77%
B榜 62.51%| [Model List](../../doc/doc_ch/models_list.md) | |SVTRv2(Rec Sever)|[configs/rec/SVTRv2/rec_svtrv2_ch.yml](../../configs/rec/SVTRv2/rec_svtrv2_ch.yml)|A榜 68.81% (使用PP-OCRv4检测模型)| [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_ch_infer.tar) | |RepSVTR(Mobile)|[识别](../../configs/rec/SVTRv2/rec_repsvtr_ch.yml)
[识别蒸馏](../../configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml)
[检测](../../configs/det/det_repsvtr_db.yml)|B榜 65.07%| 识别: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_repsvtr_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_repsvtr_ch_infer.tar)
识别蒸馏: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_distill_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_distill_ch_infer.tar)
检测: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_det_repsvtr_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_det_repsvtr_ch_infer.tar) |