From 2495138a7a55d46b53da0f159c0f6c6c5245a1a8 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 27 Feb 2023 15:41:09 +0200 Subject: [PATCH 01/12] initial commit --- src/super_gradients/common/object_names.py | 6 + .../recipes/cityscapes_segformer_b0.yaml | 107 +++ .../recipes/cityscapes_segformer_b1.yaml | 109 +++ .../recipes/cityscapes_segformer_b2.yaml | 109 +++ .../recipes/cityscapes_segformer_b3.yaml | 109 +++ .../recipes/cityscapes_segformer_b4.yaml | 109 +++ .../recipes/cityscapes_segformer_b5.yaml | 109 +++ .../cityscapes_segformer_dataset_params.yaml | 45 ++ .../training/models/all_architectures.py | 14 + .../models/segmentation_models/segformer.py | 619 ++++++++++++++++++ 10 files changed, 1336 insertions(+) create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b0.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b1.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b2.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b3.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b4.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b5.yaml create mode 100644 src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml create mode 100644 src/super_gradients/training/models/segmentation_models/segformer.py diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 10443ae137..c650911747 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -288,6 +288,12 @@ class Models: PP_YOLOE_M = "ppyoloe_m" PP_YOLOE_L = "ppyoloe_l" PP_YOLOE_X = "ppyoloe_x" + SEGFORMER_B0 = "segformer_b0" + SEGFORMER_B1 = "segformer_b1" + SEGFORMER_B2 = "segformer_b2" + SEGFORMER_B3 = "segformer_b3" + SEGFORMER_B4 = "segformer_b4" + SEGFORMER_B5 = "segformer_b5" DEKR_CUSTOM = "dekr_custom" diff --git a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml new file mode 100644 index 0000000000..19bbc4810e --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml @@ -0,0 +1,107 @@ +# SegFormer-B0 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. 
Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b0 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B0: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b0.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b0 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml new file mode 100644 index 0000000000..6fcf073ff3 --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml @@ -0,0 +1,109 @@ +# SegFormer-B1 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. 
Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b1 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B1: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b1.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b1 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml new file mode 100644 index 0000000000..23e5956ac5 --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml @@ -0,0 +1,109 @@ +# SegFormer-B2 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. 
Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b2 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B2: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b2.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b2 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml new file mode 100644 index 0000000000..4d957502bf --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml @@ -0,0 +1,109 @@ +# SegFormer-B3 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. 
Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b3 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B3: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b3.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b3 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml new file mode 100644 index 0000000000..4d34fd1c95 --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml @@ -0,0 +1,109 @@ +# SegFormer-B4 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. 
Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b4 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B4: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b4.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b4 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml new file mode 100644 index 0000000000..ba8dd776de --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml @@ -0,0 +1,109 @@ +# SegFormer-B5 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. 
We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b5 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B5: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b5.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b5 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. 
where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml new file mode 100644 index 0000000000..7d90027a7f --- /dev/null +++ b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml @@ -0,0 +1,45 @@ +defaults: + - cityscapes_dataset_params + - _self_ + +train_dataset_params: + transforms: + - SegColorJitter: + brightness: 0.5 + contrast: 0.5 + saturation: 0.5 + + - SegRandomFlip: + prob: 0.5 + + - SegRandomRescale: + scales: [ 0.5, 2.0 ] + + - SegPadShortToCropSize: + crop_size: [ 1024, 1024 ] + fill_mask: 19 + + - SegCropImageAndMask: + crop_size: [ 1024, 1024 ] + mode: random + +val_dataset_params: + transforms: + - SegRescale: + short_size: 1024 + + - SegPadShortToCropSize: + crop_size: [ 1024, 1024 ] + fill_mask: 19 + + - SegCropImageAndMask: + crop_size: [ 1024, 1024 ] + mode: center + +train_dataloader_params: + batch_size: 2 + shuffle: True + +val_dataloader_params: + batch_size: 2 + shuffle: False diff --git a/src/super_gradients/training/models/all_architectures.py b/src/super_gradients/training/models/all_architectures.py index c681f39733..9bd0af6778 100755 --- a/src/super_gradients/training/models/all_architectures.py +++ b/src/super_gradients/training/models/all_architectures.py @@ -28,6 +28,14 @@ CustomSTDCSegmentation, STDCClassification, ) +from super_gradients.training.models.segmentation_models.segformer import ( + SegFormerB0, + SegFormerB1, + SegFormerB2, + SegFormerB3, + SegFormerB4, + SegFormerB5 +) from super_gradients.training.models.kd_modules.kd_module import KDModule from super_gradients.training.models.classification_models.beit import BeitBasePatch16_224, BeitLargePatch16_224 @@ -141,6 +149,12 @@ Models.PP_YOLOE_M: PPYoloE_M, Models.PP_YOLOE_L: PPYoloE_L, Models.PP_YOLOE_X: PPYoloE_X, + Models.SEGFORMER_B0: SegFormerB0, + Models.SEGFORMER_B1: SegFormerB1, + Models.SEGFORMER_B2: SegFormerB2, + Models.SEGFORMER_B3: SegFormerB3, + Models.SEGFORMER_B4: SegFormerB4, + Models.SEGFORMER_B5: SegFormerB5, # Models.DEKR_CUSTOM: DEKRPoseEstimationModel, } diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py new file mode 100644 index 0000000000..d66442176d --- /dev/null +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -0,0 +1,619 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from super_gradients.training.models import HpmStruct +from super_gradients.training.utils import get_param +from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule +from super_gradients.common.abstractions.abstract_logger import get_logger + +""" +paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers + ( https://arxiv.org/pdf/2105.15203.pdf ) +code adopted from git repo: https://github.com/sithu31296/semantic-segmentation + +Imagenet-1k pre-trained backbone weights taken and adapted from: https://github.com/sithu31296/semantic-segmentation + +""" + +logger = get_logger(__name__) + + +# TODO: this function (and trunc_normal_) are copy-pasted from BEIT model code. 
We need to consider implementing +# it in a more general location +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + logger.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " "The distribution of values may be incorrect.", stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
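+    In this implementation the values are produced with the inverse-CDF method (a uniform sample
+    mapped through the normal inverse CDF, then clamped to [a, b]) rather than by repeated redrawing.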
+ Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class PatchEmbedding(nn.Module): + def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int): + """ + Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf) + :param in_channels: number of input channels + :param out_channels: number of output channels (embedding dimension) + :param patch_size: patch size (k for size (k, k)) + :param stride: patch stride (k for size (k, k)) + :param padding: patch padding (k for size (k, k)) + """ + + super().__init__() + + self.proj = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=patch_size, + stride=stride, + padding=padding) + self.norm = nn.LayerNorm(out_channels) + + def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]: + x = self.proj(x) + _, _, h, w = x.shape + + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, h, w + + +class EfficientSelfAttention(nn.Module): + def __init__(self, dim: int, head: int, sr_ratio: int): + """ + Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf) + :param dim: embedding dimension + :param head: number of attention heads + :param sr_ratio: the reduction ratio of the efficient self-attention + """ + + super().__init__() + + self.head = head + self.sr_ratio = sr_ratio + self.scale = (dim // head) ** -0.5 + self.q = nn.Linear(dim, dim) + self.kv = nn.Linear(dim, dim * 2) + self.proj = nn.Linear(dim, dim) + + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio) + self.norm = nn.LayerNorm(dim) + + def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: + b, n, c = x.shape + q = self.q(x).reshape(b, n, self.head, c // self.head).permute(0, 2, 1, 3) + + if self.sr_ratio > 1: + x = x.permute(0, 2, 1).reshape(b, c, h, w) + x = self.sr(x).reshape(b, c, -1).permute(0, 2, 1) + x = self.norm(x) + + k, v = self.kv(x).reshape(b, -1, 2, self.head, c // self.head).permute(2, 0, 3, 1, 4) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(b, n, c) + x = self.proj(x) + return x + +class DropPath(nn.Module): + + def __init__(self, drop_p: float = None): + """ + Drop path (stochastic depth). + Taken from: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/layers/common.py + :param drop_p: drop probability + """ + + super().__init__() + + self.drop_p = drop_p + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.drop_p == 0. 
or not self.training: + return x + + kp = 1 - self.drop_p + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + + random_tensor = kp + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + + return x.div(kp) * random_tensor + + +class MixFFN(nn.Module): + def __init__(self, in_dim: int, inter_dim: int): + """ + MixFFN block (https://arxiv.org/pdf/2105.15203.pdf) + :param in_dim: input dimension + :param inter_dim: intermediate dimension + """ + + super().__init__() + + self.fc1 = nn.Linear(in_dim, inter_dim) + self.dwconv = nn.Conv2d( + in_channels=inter_dim, + out_channels=inter_dim, + kernel_size=3, + stride=1, + padding=1, + groups=inter_dim) + self.fc2 = nn.Linear(inter_dim, in_dim) + + def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: + x = self.fc1(x) + + b, _, c = x.shape + x = x.transpose(1, 2).view(b, c, h, w) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + x = self.fc2(F.gelu(x)) + + return x + +class EncoderBlock(nn.Module): + def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float): + """ + A single encoder block (https://arxiv.org/pdf/2105.15203.pdf) + :param dim: embedding dimension + :param head: number of attention heads + :param sr_ratio: the reduction ratio of the efficient self-attention + :param dpr: drop-path ratio + """ + + super().__init__() + + self.attn = EfficientSelfAttention(dim, head, sr_ratio) + + self.drop_path = DropPath(dpr) if dpr > 0. else nn.Identity() + + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + + self.mlp = MixFFN(in_dim=dim, inter_dim=dim*4) + + def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: + x = x + self.drop_path(self.attn(self.norm1(x), h, w)) + x = x + self.drop_path(self.mlp(self.norm2(x), h, w)) + + return x + + +class MiTBackBone(nn.Module): + def __init__( + self, + embed_dims: list, + encoder_layers: list, + eff_self_att_reduction_ratio: list, + eff_self_att_heads: list, + overlap_patch_size: list, + overlap_patch_stride: list, + overlap_patch_pad: list, + in_channels: int + ): + """ + Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf) + :param embed_dims: the patch embedding dimensions (number of output channels in each encoder stage) + :param encoder_layers: the number of encoder layers in each encoder stage + :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage + :param eff_self_att_heads: number of efficient self-attention heads in each stage + :param overlap_patch_size: the patch size of the overlapping patch embedding in each stage + :param overlap_patch_stride: the patch stride of the overlapping patch embedding in each stage + :param overlap_patch_pad: the patch padding of the overlapping patch embedding in each stage + :param in_channels: number of input channels + """ + + super().__init__() + + assert len(embed_dims)==len(encoder_layers)==len(eff_self_att_reduction_ratio)==len(eff_self_att_heads)== \ + len(overlap_patch_size)==len(overlap_patch_stride)==len(overlap_patch_pad), \ + f"All backbone hyper-parameters should be lists of the same length" + + # Patch embeddings + self.patch_embed = [] + for stage_num in range(len(embed_dims)): + self.patch_embed.append( + PatchEmbedding( + in_channels=in_channels if stage_num==0 else embed_dims[stage_num-1], + out_channels=embed_dims[stage_num], + patch_size=overlap_patch_size[stage_num], + stride=overlap_patch_stride[stage_num], + padding=overlap_patch_pad[stage_num] + ) + ) + 
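+            # NOTE: self.patch_embed is a plain Python list rather than an nn.ModuleList, so each stage's
+            # embedding is registered explicitly via add_module below; that is what exposes its parameters
+            # to .parameters() and state_dict().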
self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num]) + + drop_path_rate = 0.1 + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))] + + self.blocks = [] + self.norms = [] + + layer_idx = 0 + for stage_num in range(len(embed_dims)): + self.blocks.append( + nn.ModuleList([ + EncoderBlock( + dim=embed_dims[stage_num], + head=eff_self_att_heads[stage_num], + sr_ratio=eff_self_att_reduction_ratio[stage_num], + dpr=dpr[layer_idx + i]) + for i in range(encoder_layers[stage_num]) + ]) + ) + self.norms.append(nn.LayerNorm(embed_dims[stage_num])) + + self.add_module(f"block{stage_num + 1}", self.blocks[stage_num]) + self.add_module(f"norm{stage_num + 1}", self.norms[stage_num]) + + layer_idx += encoder_layers[stage_num] + + + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: + b_size = x.shape[0] + + features = [] + for stage_num in range(len(self.patch_embed)): + x, h, w = self.patch_embed[stage_num](x) + + for enc_block in self.blocks[stage_num]: + x = enc_block(x, h, w) + x = self.norms[stage_num](x) + x = x.reshape(b_size, h, w, -1).permute(0, 3, 1, 2) + + features.append(x) + + return features + + +class MLP(nn.Module): + def __init__(self, dim, embed_dim): + """ + A single Linear layer, with shape pre-processing + :param dim: input dimension + :param embed_dim: output dimension + """ + + super().__init__() + + self.proj = nn.Linear(dim, embed_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + + return x + +class LinearFuse(nn.Module): + def __init__(self, in_channels: int, out_channels: int): + """ + A linear fusion block (conv + bn + relu) (https://arxiv.org/pdf/2105.15203.pdf) + :param in_channels: number of input channels + :param out_channels: number of output channels + """ + + super().__init__() + + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + bias=False + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class SegFormerHead(nn.Module): + def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): + """ + SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf) + :param encoder_dims: list of encoder embedding dimensions + :param embed_dim: unified embedding dimension + :param num_classes: number of predicted classes + """ + super().__init__() + + self.linear_layers = [] + for idx, dim in enumerate(encoder_dims): + self.linear_layers.append(MLP(dim, embed_dim)) + self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx]) + + self.linear_fuse = LinearFuse(in_channels=embed_dim*len(encoder_dims), out_channels=embed_dim) + self.linear_pred = nn.Conv2d(in_channels=embed_dim, + out_channels=num_classes, + kernel_size=1) + + self.dropout = nn.Dropout2d(0.1) + + def forward(self, features: list[torch.Tensor]) -> torch.Tensor: + b, _, h, w = features[0].shape + + out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])] + + for i, feature in enumerate(features[1:]): + out = self.linear_layers[i+1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:]) + out = F.interpolate(out, size=(h, w), mode='bilinear', align_corners=False) + out_lst.append(out) + + out = self.linear_fuse(torch.cat(out_lst[::-1], dim=1)) + out = self.linear_pred(self.dropout(out)) + + return out + + +# TODO: add support for aux heads? 
(not in original impl) (currently not using) +class SegFormer(SegmentationModule): + def __init__( + self, + num_classes: int, + encoder_embed_dims: list, + encoder_layers: list, + eff_self_att_reduction_ratio: list, + eff_self_att_heads: list, + decoder_embed_dim: int, + overlap_patch_size: list, + overlap_patch_stride: list, + overlap_patch_pad: list, + in_channels: int = 3 + ): + """ + :param num_classes: number of classes + :param encoder_embed_dims: the patch embedding dimensions (number of output channels in each encoder stage) + :param encoder_layers: the number of encoder layers in each encoder stage + :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage + :param eff_self_att_heads: number of efficient self-attention heads in each stage + :param overlap_patch_size: the patch size of the overlapping patch embedding in each stage + :param overlap_patch_stride: the patch stride of the overlapping patch embedding in each stage + :param overlap_patch_pad: the patch padding of the overlapping patch embedding in each stage + :param in_channels: number of input channels + """ + + super().__init__(use_aux_heads=False) + + self.encoder_embed_dims = encoder_embed_dims + + self._backbone = MiTBackBone( + embed_dims=encoder_embed_dims, + encoder_layers=encoder_layers, + eff_self_att_reduction_ratio=eff_self_att_reduction_ratio, + eff_self_att_heads=eff_self_att_heads, + overlap_patch_size=overlap_patch_size, + overlap_patch_stride=overlap_patch_stride, + overlap_patch_pad=overlap_patch_pad, + in_channels=in_channels + ) + + self.decode_head = SegFormerHead( + encoder_dims=encoder_embed_dims, + embed_dim=decoder_embed_dim, + num_classes=num_classes + ) + + self.init_params() + + def init_params(self): + + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d, nn.SyncBatchNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + @property + def backbone(self): + return self._backbone + + def _remove_auxiliary_heads(self): + pass + + def replace_head(self, new_num_classes: int, new_decoder_embed_dim: int): + self.decode_head = SegFormerHead( + encoder_dims=self.encoder_embed_dims, + embed_dim=new_decoder_embed_dim, + num_classes=new_num_classes + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + features = self._backbone(x) + out = self.decode_head(features) + out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) + return out + + def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: + """ + Custom param groups for training: + - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. 
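+        The backbone parameter group keeps the base lr; the remaining parameters (the decode head) use lr * multiply_head_lr.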
+ """ + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() + param_groups = [ + {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, + {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, + ] + return param_groups + + def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + for param_group in param_groups: + param_group["lr"] = lr + if param_group["name"] == "multiply_lr_params": + param_group["lr"] *= multiply_head_lr + return param_groups + + def _separate_lr_multiply_params(self): + """ + Separate backbone params from the rest. + :return: iterators of groups named_parameters. + """ + backbone_names = [n for n, p in self.backbone.named_parameters()] + multiply_lr_params, no_multiply_params = {}, {} + for name, param in self.named_parameters(): + if name in backbone_names: + no_multiply_params[name] = param + else: + multiply_lr_params[name] = param + return multiply_lr_params.items(), no_multiply_params.items() + + +class SegFormerCustom(SegFormer): + def __init__(self, arch_params: HpmStruct): + """Parse arch_params and translate the parameters to build the SegFormer architecture""" + super().__init__( + num_classes=arch_params.num_classes, + encoder_embed_dims=arch_params.encoder_embed_dims, + encoder_layers=arch_params.encoder_layers, + eff_self_att_reduction_ratio=arch_params.eff_self_att_reduction_ratio, + eff_self_att_heads=arch_params.eff_self_att_heads, + decoder_embed_dim=arch_params.decoder_embed_dim, + overlap_patch_size=arch_params.overlap_patch_size, + overlap_patch_stride=arch_params.overlap_patch_stride, + overlap_patch_pad=arch_params.overlap_patch_pad, + in_channels=arch_params.in_channels + ) + + +DEFAULT_SEGFORMER_PARAMS = { + "in_channels": 3, + "overlap_patch_size": [7, 3, 3, 3], + "overlap_patch_stride": [4, 2, 2, 2], + "overlap_patch_pad": [3, 1, 1, 1], + "eff_self_att_reduction_ratio": [8, 4, 2, 1], + "eff_self_att_heads": [1, 2, 5, 8], +} + +DEFAULT_SEGFORMER_B0_PARAMS = { + **DEFAULT_SEGFORMER_PARAMS, + "encoder_embed_dims": [32, 64, 160, 256], + "encoder_layers": [2, 2, 2, 2], + "decoder_embed_dim": 256 +} + +DEFAULT_SEGFORMER_B1_PARAMS = { + **DEFAULT_SEGFORMER_B0_PARAMS, + "encoder_embed_dims": [64, 128, 320, 512], +} + +DEFAULT_SEGFORMER_B2_PARAMS = { + **DEFAULT_SEGFORMER_B1_PARAMS, + "encoder_layers": [3, 4, 6, 3], + "decoder_embed_dim": 768 +} + +DEFAULT_SEGFORMER_B3_PARAMS = { + **DEFAULT_SEGFORMER_B2_PARAMS, + "encoder_layers": [3, 4, 18, 3], +} + +DEFAULT_SEGFORMER_B4_PARAMS = { + **DEFAULT_SEGFORMER_B2_PARAMS, + "encoder_layers": [3, 8, 27, 3], +} + +DEFAULT_SEGFORMER_B5_PARAMS = { + **DEFAULT_SEGFORMER_B2_PARAMS, + "encoder_layers": [3, 6, 40, 3], +} + + +class SegFormerB0(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB1(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB2(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS) + 
_arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB3(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB4(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB5(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) From 932bb738962c13461e56e1fe2c9292fee99d14ff Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 27 Feb 2023 18:54:32 +0200 Subject: [PATCH 02/12] Update segformer.py --- .../models/segmentation_models/segformer.py | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index d66442176d..712295b032 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -486,40 +486,40 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) return out - def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: - """ - Custom param groups for training: - - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. - """ - multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() - param_groups = [ - {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, - {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, - ] - return param_groups - - def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: - multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - for param_group in param_groups: - param_group["lr"] = lr - if param_group["name"] == "multiply_lr_params": - param_group["lr"] *= multiply_head_lr - return param_groups - - def _separate_lr_multiply_params(self): - """ - Separate backbone params from the rest. - :return: iterators of groups named_parameters. - """ - backbone_names = [n for n, p in self.backbone.named_parameters()] - multiply_lr_params, no_multiply_params = {}, {} - for name, param in self.named_parameters(): - if name in backbone_names: - no_multiply_params[name] = param - else: - multiply_lr_params[name] = param - return multiply_lr_params.items(), no_multiply_params.items() + # def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: + # """ + # Custom param groups for training: + # - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. 
+ # """ + # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + # multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() + # param_groups = [ + # {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, + # {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, + # ] + # return param_groups + # + # def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: + # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + # for param_group in param_groups: + # param_group["lr"] = lr + # if param_group["name"] == "multiply_lr_params": + # param_group["lr"] *= multiply_head_lr + # return param_groups + # + # def _separate_lr_multiply_params(self): + # """ + # Separate backbone params from the rest. + # :return: iterators of groups named_parameters. + # """ + # backbone_names = [n for n, p in self.backbone.named_parameters()] + # multiply_lr_params, no_multiply_params = {}, {} + # for name, param in self.named_parameters(): + # if name in backbone_names: + # no_multiply_params[name] = param + # else: + # multiply_lr_params[name] = param + # return multiply_lr_params.items(), no_multiply_params.items() class SegFormerCustom(SegFormer): From 78aa7707528ad97c3c97a0a63b7f60180247959c Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 27 Feb 2023 22:55:40 +0200 Subject: [PATCH 03/12] Update segformer.py --- .../models/segmentation_models/segformer.py | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 712295b032..d66442176d 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -486,40 +486,40 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) return out - # def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: - # """ - # Custom param groups for training: - # - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. - # """ - # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - # multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() - # param_groups = [ - # {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, - # {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, - # ] - # return param_groups - # - # def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: - # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - # for param_group in param_groups: - # param_group["lr"] = lr - # if param_group["name"] == "multiply_lr_params": - # param_group["lr"] *= multiply_head_lr - # return param_groups - # - # def _separate_lr_multiply_params(self): - # """ - # Separate backbone params from the rest. - # :return: iterators of groups named_parameters. 
- # """ - # backbone_names = [n for n, p in self.backbone.named_parameters()] - # multiply_lr_params, no_multiply_params = {}, {} - # for name, param in self.named_parameters(): - # if name in backbone_names: - # no_multiply_params[name] = param - # else: - # multiply_lr_params[name] = param - # return multiply_lr_params.items(), no_multiply_params.items() + def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: + """ + Custom param groups for training: + - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. + """ + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() + param_groups = [ + {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, + {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, + ] + return param_groups + + def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + for param_group in param_groups: + param_group["lr"] = lr + if param_group["name"] == "multiply_lr_params": + param_group["lr"] *= multiply_head_lr + return param_groups + + def _separate_lr_multiply_params(self): + """ + Separate backbone params from the rest. + :return: iterators of groups named_parameters. + """ + backbone_names = [n for n, p in self.backbone.named_parameters()] + multiply_lr_params, no_multiply_params = {}, {} + for name, param in self.named_parameters(): + if name in backbone_names: + no_multiply_params[name] = param + else: + multiply_lr_params[name] = param + return multiply_lr_params.items(), no_multiply_params.items() class SegFormerCustom(SegFormer): From a321c22e512c42ae0eafa2cad0c8ba043d1572bc Mon Sep 17 00:00:00 2001 From: eran-deci Date: Tue, 28 Feb 2023 16:37:00 +0200 Subject: [PATCH 04/12] Update segformer.py --- .../models/segmentation_models/segformer.py | 142 ++++++++---------- 1 file changed, 61 insertions(+), 81 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index d66442176d..5684cfcff2 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -58,7 +58,7 @@ def norm_cdf(x): def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): - # type: (Tensor, float, float, float, float) -> Tensor + # type: (torch.Tensor, float, float, float, float) -> torch.Tensor r"""Fills the input Tensor with values drawn from a truncated normal distribution. 
The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` @@ -91,12 +91,7 @@ def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: super().__init__() - self.proj = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=patch_size, - stride=stride, - padding=padding) + self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding) self.norm = nn.LayerNorm(out_channels) def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]: @@ -149,8 +144,8 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: x = self.proj(x) return x -class DropPath(nn.Module): +class DropPath(nn.Module): def __init__(self, drop_p: float = None): """ Drop path (stochastic depth). @@ -163,7 +158,7 @@ def __init__(self, drop_p: float = None): self.drop_p = drop_p def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.drop_p == 0. or not self.training: + if self.drop_p == 0.0 or not self.training: return x kp = 1 - self.drop_p @@ -186,13 +181,7 @@ def __init__(self, in_dim: int, inter_dim: int): super().__init__() self.fc1 = nn.Linear(in_dim, inter_dim) - self.dwconv = nn.Conv2d( - in_channels=inter_dim, - out_channels=inter_dim, - kernel_size=3, - stride=1, - padding=1, - groups=inter_dim) + self.dwconv = nn.Conv2d(in_channels=inter_dim, out_channels=inter_dim, kernel_size=3, stride=1, padding=1, groups=inter_dim) self.fc2 = nn.Linear(inter_dim, in_dim) def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: @@ -207,6 +196,7 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: return x + class EncoderBlock(nn.Module): def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float): """ @@ -221,12 +211,12 @@ def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float): self.attn = EfficientSelfAttention(dim, head, sr_ratio) - self.drop_path = DropPath(dpr) if dpr > 0. 
else nn.Identity() + self.drop_path = DropPath(dpr) if dpr > 0.0 else nn.Identity() self.norm1 = nn.LayerNorm(dim) self.norm2 = nn.LayerNorm(dim) - self.mlp = MixFFN(in_dim=dim, inter_dim=dim*4) + self.mlp = MixFFN(in_dim=dim, inter_dim=dim * 4) def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: x = x + self.drop_path(self.attn(self.norm1(x), h, w)) @@ -237,15 +227,15 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: class MiTBackBone(nn.Module): def __init__( - self, - embed_dims: list, - encoder_layers: list, - eff_self_att_reduction_ratio: list, - eff_self_att_heads: list, - overlap_patch_size: list, - overlap_patch_stride: list, - overlap_patch_pad: list, - in_channels: int + self, + embed_dims: list, + encoder_layers: list, + eff_self_att_reduction_ratio: list, + eff_self_att_heads: list, + overlap_patch_size: list, + overlap_patch_stride: list, + overlap_patch_pad: list, + in_channels: int, ): """ Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf) @@ -261,41 +251,50 @@ def __init__( super().__init__() - assert len(embed_dims)==len(encoder_layers)==len(eff_self_att_reduction_ratio)==len(eff_self_att_heads)== \ - len(overlap_patch_size)==len(overlap_patch_stride)==len(overlap_patch_pad), \ - f"All backbone hyper-parameters should be lists of the same length" + assert ( + len(embed_dims) + == len(encoder_layers) + == len(eff_self_att_reduction_ratio) + == len(eff_self_att_heads) + == len(overlap_patch_size) + == len(overlap_patch_stride) + == len(overlap_patch_pad) + ), "All backbone hyper-parameters should be lists of the same length" # Patch embeddings self.patch_embed = [] for stage_num in range(len(embed_dims)): self.patch_embed.append( PatchEmbedding( - in_channels=in_channels if stage_num==0 else embed_dims[stage_num-1], + in_channels=in_channels if stage_num == 0 else embed_dims[stage_num - 1], out_channels=embed_dims[stage_num], patch_size=overlap_patch_size[stage_num], stride=overlap_patch_stride[stage_num], - padding=overlap_patch_pad[stage_num] + padding=overlap_patch_pad[stage_num], ) ) self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num]) drop_path_rate = 0.1 dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))] - + self.blocks = [] self.norms = [] - + layer_idx = 0 for stage_num in range(len(embed_dims)): self.blocks.append( - nn.ModuleList([ - EncoderBlock( - dim=embed_dims[stage_num], - head=eff_self_att_heads[stage_num], - sr_ratio=eff_self_att_reduction_ratio[stage_num], - dpr=dpr[layer_idx + i]) - for i in range(encoder_layers[stage_num]) - ]) + nn.ModuleList( + [ + EncoderBlock( + dim=embed_dims[stage_num], + head=eff_self_att_heads[stage_num], + sr_ratio=eff_self_att_reduction_ratio[stage_num], + dpr=dpr[layer_idx + i], + ) + for i in range(encoder_layers[stage_num]) + ] + ) ) self.norms.append(nn.LayerNorm(embed_dims[stage_num])) @@ -304,7 +303,6 @@ def __init__( layer_idx += encoder_layers[stage_num] - def forward(self, x: torch.Tensor) -> list[torch.Tensor]: b_size = x.shape[0] @@ -340,6 +338,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x + class LinearFuse(nn.Module): def __init__(self, in_channels: int, out_channels: int): """ @@ -350,12 +349,7 @@ def __init__(self, in_channels: int, out_channels: int): super().__init__() - self.conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - bias=False - ) + self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False) 
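+        # bias is omitted in the 1x1 fusion conv since the BatchNorm that follows applies its own learned shift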
self.bn = nn.BatchNorm2d(out_channels) self.relu = nn.ReLU(inplace=True) @@ -378,10 +372,8 @@ def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): self.linear_layers.append(MLP(dim, embed_dim)) self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx]) - self.linear_fuse = LinearFuse(in_channels=embed_dim*len(encoder_dims), out_channels=embed_dim) - self.linear_pred = nn.Conv2d(in_channels=embed_dim, - out_channels=num_classes, - kernel_size=1) + self.linear_fuse = LinearFuse(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim) + self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1) self.dropout = nn.Dropout2d(0.1) @@ -391,8 +383,8 @@ def forward(self, features: list[torch.Tensor]) -> torch.Tensor: out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])] for i, feature in enumerate(features[1:]): - out = self.linear_layers[i+1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:]) - out = F.interpolate(out, size=(h, w), mode='bilinear', align_corners=False) + out = self.linear_layers[i + 1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:]) + out = F.interpolate(out, size=(h, w), mode="bilinear", align_corners=False) out_lst.append(out) out = self.linear_fuse(torch.cat(out_lst[::-1], dim=1)) @@ -414,7 +406,7 @@ def __init__( overlap_patch_size: list, overlap_patch_stride: list, overlap_patch_pad: list, - in_channels: int = 3 + in_channels: int = 3, ): """ :param num_classes: number of classes @@ -440,14 +432,10 @@ def __init__( overlap_patch_size=overlap_patch_size, overlap_patch_stride=overlap_patch_stride, overlap_patch_pad=overlap_patch_pad, - in_channels=in_channels + in_channels=in_channels, ) - self.decode_head = SegFormerHead( - encoder_dims=encoder_embed_dims, - embed_dim=decoder_embed_dim, - num_classes=num_classes - ) + self.decode_head = SegFormerHead(encoder_dims=encoder_embed_dims, embed_dim=decoder_embed_dim, num_classes=num_classes) self.init_params() @@ -455,7 +443,7 @@ def init_params(self): for m in self.modules(): if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) + trunc_normal_(m.weight, std=0.02) if m.bias is not None: nn.init.zeros_(m.bias) elif isinstance(m, nn.Conv2d): @@ -474,16 +462,12 @@ def _remove_auxiliary_heads(self): pass def replace_head(self, new_num_classes: int, new_decoder_embed_dim: int): - self.decode_head = SegFormerHead( - encoder_dims=self.encoder_embed_dims, - embed_dim=new_decoder_embed_dim, - num_classes=new_num_classes - ) + self.decode_head = SegFormerHead(encoder_dims=self.encoder_embed_dims, embed_dim=new_decoder_embed_dim, num_classes=new_num_classes) def forward(self, x: torch.Tensor) -> torch.Tensor: features = self._backbone(x) out = self.decode_head(features) - out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) + out = F.interpolate(out, size=x.shape[2:], mode="bilinear", align_corners=False) return out def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: @@ -535,7 +519,7 @@ def __init__(self, arch_params: HpmStruct): overlap_patch_size=arch_params.overlap_patch_size, overlap_patch_stride=arch_params.overlap_patch_stride, overlap_patch_pad=arch_params.overlap_patch_pad, - in_channels=arch_params.in_channels + in_channels=arch_params.in_channels, ) @@ -548,23 +532,14 @@ def __init__(self, arch_params: HpmStruct): "eff_self_att_heads": [1, 2, 5, 8], } -DEFAULT_SEGFORMER_B0_PARAMS = { - **DEFAULT_SEGFORMER_PARAMS, - 
"encoder_embed_dims": [32, 64, 160, 256], - "encoder_layers": [2, 2, 2, 2], - "decoder_embed_dim": 256 -} +DEFAULT_SEGFORMER_B0_PARAMS = {**DEFAULT_SEGFORMER_PARAMS, "encoder_embed_dims": [32, 64, 160, 256], "encoder_layers": [2, 2, 2, 2], "decoder_embed_dim": 256} DEFAULT_SEGFORMER_B1_PARAMS = { **DEFAULT_SEGFORMER_B0_PARAMS, "encoder_embed_dims": [64, 128, 320, 512], } -DEFAULT_SEGFORMER_B2_PARAMS = { - **DEFAULT_SEGFORMER_B1_PARAMS, - "encoder_layers": [3, 4, 6, 3], - "decoder_embed_dim": 768 -} +DEFAULT_SEGFORMER_B2_PARAMS = {**DEFAULT_SEGFORMER_B1_PARAMS, "encoder_layers": [3, 4, 6, 3], "decoder_embed_dim": 768} DEFAULT_SEGFORMER_B3_PARAMS = { **DEFAULT_SEGFORMER_B2_PARAMS, @@ -588,30 +563,35 @@ def __init__(self, arch_params: HpmStruct): _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB1(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB2(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB3(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB4(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB5(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS) From abb99b84fbd1e37aeb9ed54320babc211c53e6c7 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 1 Mar 2023 16:07:47 +0200 Subject: [PATCH 05/12] update segformer.py, cityscapes_segformer_dataset_params.yaml --- .../cityscapes_segformer_dataset_params.yaml | 6 +- .../models/segmentation_models/segformer.py | 95 +------------------ 2 files changed, 6 insertions(+), 95 deletions(-) diff --git a/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml index 7d90027a7f..3e706b97f4 100644 --- a/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml @@ -26,16 +26,12 @@ train_dataset_params: val_dataset_params: transforms: - SegRescale: - short_size: 1024 + long_size: 1024 - SegPadShortToCropSize: crop_size: [ 1024, 1024 ] fill_mask: 19 - - SegCropImageAndMask: - crop_size: [ 1024, 1024 ] - mode: center - train_dataloader_params: batch_size: 2 shuffle: True diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 5684cfcff2..fb35ae94b3 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -1,5 +1,3 @@ -import math - import torch import torch.nn as nn import torch.nn.functional as F @@ -7,7 +5,7 @@ from super_gradients.training.models import HpmStruct from super_gradients.training.utils import get_param from 
super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule -from super_gradients.common.abstractions.abstract_logger import get_logger +from super_gradients.training.utils.regularization_utils import DropPath """ paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers @@ -18,65 +16,6 @@ """ -logger = get_logger(__name__) - - -# TODO: this function (and trunc_normal_) are copy-pasted from BEIT model code. We need to consider implementing -# it in a more general location -def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 - - if (mean < a - 2 * std) or (mean > b + 2 * std): - logger.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " "The distribution of values may be incorrect.", stacklevel=2) - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - lower = norm_cdf((a - mean) / std) - upper = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * lower - 1, 2 * upper - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.0)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): - # type: (torch.Tensor, float, float, float, float) -> torch.Tensor - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. - Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - return _no_grad_trunc_normal_(tensor, mean, std, a, b) - class PatchEmbedding(nn.Module): def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int): @@ -145,31 +84,6 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: return x -class DropPath(nn.Module): - def __init__(self, drop_p: float = None): - """ - Drop path (stochastic depth). 
- Taken from: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/layers/common.py - :param drop_p: drop probability - """ - - super().__init__() - - self.drop_p = drop_p - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.drop_p == 0.0 or not self.training: - return x - - kp = 1 - self.drop_p - shape = (x.shape[0],) + (1,) * (x.ndim - 1) - - random_tensor = kp + torch.rand(shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - - return x.div(kp) * random_tensor - - class MixFFN(nn.Module): def __init__(self, in_dim: int, inter_dim: int): """ @@ -251,7 +165,7 @@ def __init__( super().__init__() - assert ( + if not ( len(embed_dims) == len(encoder_layers) == len(eff_self_att_reduction_ratio) @@ -259,7 +173,8 @@ def __init__( == len(overlap_patch_size) == len(overlap_patch_stride) == len(overlap_patch_pad) - ), "All backbone hyper-parameters should be lists of the same length" + ): + raise ValueError("All backbone hyper-parameters should be lists of the same length") # Patch embeddings self.patch_embed = [] @@ -443,7 +358,7 @@ def init_params(self): for m in self.modules(): if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) + torch.nn.init.trunc_normal_(m.weight, std=0.02) if m.bias is not None: nn.init.zeros_(m.bias) elif isinstance(m, nn.Conv2d): From e30b6bff26ded006bc9f0bd34459114010c67f99 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Fri, 10 Mar 2023 12:33:45 +0200 Subject: [PATCH 06/12] Update all_architectures.py --- src/super_gradients/training/models/all_architectures.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/super_gradients/training/models/all_architectures.py b/src/super_gradients/training/models/all_architectures.py index 3bda4b2418..50d5884101 100755 --- a/src/super_gradients/training/models/all_architectures.py +++ b/src/super_gradients/training/models/all_architectures.py @@ -36,7 +36,7 @@ SegFormerB2, SegFormerB3, SegFormerB4, - SegFormerB5 + SegFormerB5, ) from super_gradients.training.models.kd_modules.kd_module import KDModule @@ -158,7 +158,6 @@ Models.SEGFORMER_B3: SegFormerB3, Models.SEGFORMER_B4: SegFormerB4, Models.SEGFORMER_B5: SegFormerB5, - # Models.DEKR_CUSTOM: DEKRPoseEstimationModel, Models.DEKR_W32_NO_DC: DEKRW32, Models.POSE_PP_YOLO_L: PosePPYoloL, From 6bc26a2cc8ad56ed779a5cb6aa9ca249dee8312e Mon Sep 17 00:00:00 2001 From: eran-deci Date: Fri, 10 Mar 2023 12:46:02 +0200 Subject: [PATCH 07/12] Update segformer.py --- .../training/models/segmentation_models/segformer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index fb35ae94b3..773820dfa9 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -7,6 +7,8 @@ from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath +from typing import List, Tuple + """ paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers ( https://arxiv.org/pdf/2105.15203.pdf ) @@ -33,7 +35,7 @@ def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding) self.norm = 
nn.LayerNorm(out_channels) - def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]: + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, int, int]: x = self.proj(x) _, _, h, w = x.shape @@ -218,7 +220,7 @@ def __init__( layer_idx += encoder_layers[stage_num] - def forward(self, x: torch.Tensor) -> list[torch.Tensor]: + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: b_size = x.shape[0] features = [] @@ -292,7 +294,7 @@ def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): self.dropout = nn.Dropout2d(0.1) - def forward(self, features: list[torch.Tensor]) -> torch.Tensor: + def forward(self, features: List[torch.Tensor]) -> torch.Tensor: b, _, h, w = features[0].shape out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])] From 73c40c1cac41a36df65b38fbbb41be5ed0a16a0a Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 15 Mar 2023 15:00:13 +0200 Subject: [PATCH 08/12] update segformer.py, unite all segformer recipes --- ...rmer_b0.yaml => cityscapes_segformer.yaml} | 36 +++--- .../recipes/cityscapes_segformer_b1.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b2.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b3.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b4.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b5.yaml | 109 ------------------ .../models/segmentation_models/segformer.py | 101 +++++++++------- 7 files changed, 85 insertions(+), 597 deletions(-) rename src/super_gradients/recipes/{cityscapes_segformer_b0.yaml => cityscapes_segformer.yaml} (62%) delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b1.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b2.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b3.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b4.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b5.yaml diff --git a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml b/src/super_gradients/recipes/cityscapes_segformer.yaml similarity index 62% rename from src/super_gradients/recipes/cityscapes_segformer_b0.yaml rename to src/super_gradients/recipes/cityscapes_segformer.yaml index 19bbc4810e..b78fb932df 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer.yaml @@ -1,4 +1,4 @@ -# SegFormer-B0 segmentation training example with Cityscapes dataset. +# SegFormer segmentation training example with Cityscapes dataset. # Reproduction of paper: # Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo # "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" @@ -7,27 +7,37 @@ # Official git repo: # https://github.com/NVlabs/SegFormer # -# Imagenet-1k pre-trained backbone weights taken and adapted from: +# Code and Imagenet-1k pre-trained backbone weights taken and adapted from: # https://github.com/sithu31296/semantic-segmentation # # Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: +# 1. Choose SegFormer architecture (b0 - b5) by changing the value of the "architecture" field below +# 2. We recommend preparing the data according to SG's CityScapes readme file: # https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. 
Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# 3. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and # "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b0 +# 4. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 5. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 6. Move to the project root (where you will find the ReadMe and src folder) +# 7. Run the command (change: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer # # # Imagenet-1K pre-trained backbone: # MiT (Mix Transformer) B0: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b0.pth +# B1: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b1.pth +# B2: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b2.pth +# B3: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b3.pth +# B4: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b4.pth +# B5: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b5.pth # # 1. Download the weights from the above link and put them in a directory of your choice # 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path # 3. Ensure checkpoint_params.load_backbone: True +# +# Performance and training details: +# SegFormer-B0: mIoU (sliding-window inference) on validation set: 76.25 +# training time: 17 hours with 3 A10G GPUs with DDP, ~3 minuets / epoch defaults: @@ -36,7 +46,7 @@ defaults: - checkpoint_params: default_checkpoint_params - _self_ -architecture: segformer_b0 +architecture: segformer_b0 # segformer_b1, segformer_b2, segformer_b3, segformer_b4, segformer_b5 data_root_dir: /data/cityscapes dataset_params: @@ -46,7 +56,7 @@ dataset_params: root_dir: ${data_root_dir} experiment_name: ${architecture}_cityscapes -ckpt_root_dir: +ckpt_root_dir: /home/eran.shachar/PycharmProjects/super-gradients/checkpoints train_dataloader: cityscapes_train val_dataloader: cityscapes_val @@ -57,7 +67,7 @@ arch_params: num_classes: 19 checkpoint_params: - checkpoint_path: + checkpoint_path: /home/eran.shachar/data/segformer_pretrained_weights/mit_b0.pth load_backbone: True load_weights_only: True strict_load: no_key_matching @@ -97,7 +107,7 @@ training_hyperparams: greater_metric_to_watch_is_better: True multi_gpu: DDP -num_gpus: 4 +num_gpus: 3 # THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA diff --git a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml deleted file mode 100644 index 6fcf073ff3..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B1 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. 
Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b1 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B1: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b1.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b1 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml deleted file mode 100644 index 23e5956ac5..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B2 segmentation training example with Cityscapes dataset. 
-# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b2 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B2: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b2.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b2 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. 
where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml deleted file mode 100644 index 4d957502bf..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B3 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b3 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B3: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b3.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. 
Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b3 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml deleted file mode 100644 index 4d34fd1c95..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B4 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b4 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B4: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b4.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. 
Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b4 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml deleted file mode 100644 index ba8dd776de..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B5 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b5 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B5: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b5.pth -# -# 1. 
Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b5 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 773820dfa9..c4422b1610 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -6,19 +6,22 @@ from super_gradients.training.utils import get_param from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath +from super_gradients.modules.conv_bn_relu_block import ConvBNReLU + from typing import List, Tuple """ paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers ( https://arxiv.org/pdf/2105.15203.pdf ) -code adopted from git repo: https://github.com/sithu31296/semantic-segmentation - -Imagenet-1k pre-trained backbone weights taken and adapted from: https://github.com/sithu31296/semantic-segmentation +Code and Imagenet-1k pre-trained backbone weights adopted from GitHub repo: +https://github.com/sithu31296/semantic-segmentation """ +# TODO: extract this block to src/super_gradients/modules/transformer_modules and reuse the same module of Beit and +# other ViTs class PatchEmbedding(nn.Module): def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int): """ @@ -45,6 +48,8 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, int, int]: return x, h, w +# TODO: extract this block to src/super_gradients/modules/transformer_modules and reuse the same module of Beit and +# other ViTs class EfficientSelfAttention(nn.Module): def 
__init__(self, dim: int, head: int, sr_ratio: int): """ @@ -144,13 +149,13 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: class MiTBackBone(nn.Module): def __init__( self, - embed_dims: list, - encoder_layers: list, - eff_self_att_reduction_ratio: list, - eff_self_att_heads: list, - overlap_patch_size: list, - overlap_patch_stride: list, - overlap_patch_pad: list, + embed_dims: List[int], + encoder_layers: List[int], + eff_self_att_reduction_ratio: List[int], + eff_self_att_heads: List[int], + overlap_patch_size: List[int], + overlap_patch_stride: List[int], + overlap_patch_pad: List[int], in_channels: int, ): """ @@ -237,8 +242,10 @@ def forward(self, x: torch.Tensor) -> List[torch.Tensor]: return features +# TODO: extract this block to src/super_gradients/modules/transformer_modules and reuse the same module of Beit and +# other ViTs class MLP(nn.Module): - def __init__(self, dim, embed_dim): + def __init__(self, dim: int, embed_dim: int): """ A single Linear layer, with shape pre-processing :param dim: input dimension @@ -256,26 +263,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class LinearFuse(nn.Module): - def __init__(self, in_channels: int, out_channels: int): - """ - A linear fusion block (conv + bn + relu) (https://arxiv.org/pdf/2105.15203.pdf) - :param in_channels: number of input channels - :param out_channels: number of output channels - """ - - super().__init__() - - self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - return self.relu(self.bn(self.conv(x))) - - class SegFormerHead(nn.Module): - def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): + def __init__(self, encoder_dims: List[int], embed_dim: int, num_classes: int): """ SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf) :param encoder_dims: list of encoder embedding dimensions @@ -289,7 +278,7 @@ def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): self.linear_layers.append(MLP(dim, embed_dim)) self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx]) - self.linear_fuse = LinearFuse(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim) + self.linear_fuse = ConvBNReLU(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim, kernel_size=1, bias=False, inplace=True) self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1) self.dropout = nn.Dropout2d(0.1) @@ -315,14 +304,14 @@ class SegFormer(SegmentationModule): def __init__( self, num_classes: int, - encoder_embed_dims: list, - encoder_layers: list, - eff_self_att_reduction_ratio: list, - eff_self_att_heads: list, + encoder_embed_dims: List[int], + encoder_layers: List[int], + eff_self_att_reduction_ratio: List[int], + eff_self_att_heads: List[int], decoder_embed_dim: int, - overlap_patch_size: list, - overlap_patch_stride: list, - overlap_patch_pad: list, + overlap_patch_size: List[int], + overlap_patch_stride: List[int], + overlap_patch_pad: List[int], in_channels: int = 3, ): """ @@ -425,7 +414,11 @@ def _separate_lr_multiply_params(self): class SegFormerCustom(SegFormer): def __init__(self, arch_params: HpmStruct): - """Parse arch_params and translate the parameters to build the SegFormer architecture""" + """ + Parse arch_params and translate the parameters to build the SegFormer architecture + :param arch_params: architecture parameters + """ + 
super().__init__( num_classes=arch_params.num_classes, encoder_embed_dims=arch_params.encoder_embed_dims, @@ -476,6 +469,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB0(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B0 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -483,6 +481,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB1(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B1 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -490,6 +493,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB2(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B2 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -497,6 +505,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB3(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B3 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -504,6 +517,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB4(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B4 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -511,6 +529,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB5(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B5 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) From 75ca2ad73d8d8d236ce89669d0bcfdbc60136dc9 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 15 Mar 2023 15:03:46 +0200 Subject: [PATCH 09/12] Update cityscapes_segformer.yaml --- src/super_gradients/recipes/cityscapes_segformer.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/recipes/cityscapes_segformer.yaml b/src/super_gradients/recipes/cityscapes_segformer.yaml index b78fb932df..5f895a736c 100644 --- a/src/super_gradients/recipes/cityscapes_segformer.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer.yaml @@ -56,7 +56,7 @@ dataset_params: root_dir: ${data_root_dir} experiment_name: ${architecture}_cityscapes -ckpt_root_dir: /home/eran.shachar/PycharmProjects/super-gradients/checkpoints +ckpt_root_dir: train_dataloader: cityscapes_train val_dataloader: cityscapes_val @@ -67,7 +67,7 @@ arch_params: num_classes: 19 checkpoint_params: - checkpoint_path: /home/eran.shachar/data/segformer_pretrained_weights/mit_b0.pth + checkpoint_path: load_backbone: True load_weights_only: True strict_load: no_key_matching @@ -107,7 +107,7 @@ training_hyperparams: greater_metric_to_watch_is_better: True multi_gpu: DDP -num_gpus: 3 +num_gpus: 4 # THE FOLLOWING PARAMS ARE DIRECTLY 
USED BY HYDRA From 35908195f7a04ae9b2e5ddcb5e59be0349f19c24 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 15 Mar 2023 16:16:24 +0200 Subject: [PATCH 10/12] Update segformer.py --- .../training/models/segmentation_models/segformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index c4422b1610..6d4ae3c632 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from super_gradients.training.models import HpmStruct +from super_gradients.training.utils.utils import HpmStruct from super_gradients.training.utils import get_param from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath From 9c5f1e2b3f9effe0af77339c8be6eaf340db6796 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 17 Apr 2023 15:36:58 +0300 Subject: [PATCH 11/12] Update segformer.py --- .../training/models/segmentation_models/segformer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 6d4ae3c632..9ebadb30dc 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -7,6 +7,8 @@ from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath from super_gradients.modules.conv_bn_relu_block import ConvBNReLU +from super_gradients.common.object_names import Models +from super_gradients.common.registry.registry import register_model from typing import List, Tuple @@ -467,6 +469,7 @@ def __init__(self, arch_params: HpmStruct): } +@register_model(Models.SEGFORMER_B0) class SegFormerB0(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -479,6 +482,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B1) class SegFormerB1(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -491,6 +495,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B2) class SegFormerB2(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -503,6 +508,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B3) class SegFormerB3(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -515,6 +521,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B4) class SegFormerB4(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -527,6 +534,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B5) class SegFormerB5(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ From 0e853ede104bdb443041699be9bcd1ca569a1e53 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 17 Apr 2023 15:56:25 +0300 Subject: [PATCH 12/12] Update __init__.py --- 
src/super_gradients/training/models/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/super_gradients/training/models/__init__.py b/src/super_gradients/training/models/__init__.py index c6582e30aa..280de77b61 100755 --- a/src/super_gradients/training/models/__init__.py +++ b/src/super_gradients/training/models/__init__.py @@ -96,6 +96,7 @@ STDCSegmentationBase, CustomSTDCSegmentation, ) +from super_gradients.training.models.segmentation_models.segformer import SegFormerB0, SegFormerB1, SegFormerB2, SegFormerB3, SegFormerB4, SegFormerB5 # Pose estimation from super_gradients.training.models.pose_estimation_models.pose_ppyolo import PosePPYoloL @@ -258,4 +259,10 @@ "ARCHITECTURES", "Models", "user_models", + "SegFormerB0", + "SegFormerB1", + "SegFormerB2", + "SegFormerB3", + "SegFormerB4", + "SegFormerB5", ]
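
For reference, a minimal usage sketch of the models these patches add. This is not part of the patch series itself: it assumes the DEFAULT_SEGFORMER_B0_PARAMS dict supplies every backbone/decoder hyper-parameter except num_classes, and the 512x512 dummy input size is purely illustrative (the Cityscapes recipe above trains on 1024x1024 crops).

    import torch
    from super_gradients.training.models import SegFormerB0      # exported by the final patch
    from super_gradients.training.utils.utils import HpmStruct   # same import the patch uses internally

    # Build SegFormer-B0 from its default MiT-B0 hyper-parameters, overriding only
    # num_classes (19 matches the Cityscapes recipe above).
    model = SegFormerB0(HpmStruct(num_classes=19)).eval()

    x = torch.randn(1, 3, 512, 512)  # dummy batch
    with torch.no_grad():
        out = model(x)

    # The decode-head output is bilinearly upsampled to the input resolution,
    # so the logits come back as [1, 19, 512, 512].
    print(out.shape)

Since each variant is also decorated with @register_model under the names added to object_names.py (e.g. "segformer_b0"), the same model should be reachable through SG's model factory (models.get with Models.SEGFORMER_B0), which is how the "architecture" field of the cityscapes_segformer recipe is resolved at training time.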