From 2495138a7a55d46b53da0f159c0f6c6c5245a1a8 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 27 Feb 2023 15:41:09 +0200 Subject: [PATCH 01/12] initial commit --- src/super_gradients/common/object_names.py | 6 + .../recipes/cityscapes_segformer_b0.yaml | 107 +++ .../recipes/cityscapes_segformer_b1.yaml | 109 +++ .../recipes/cityscapes_segformer_b2.yaml | 109 +++ .../recipes/cityscapes_segformer_b3.yaml | 109 +++ .../recipes/cityscapes_segformer_b4.yaml | 109 +++ .../recipes/cityscapes_segformer_b5.yaml | 109 +++ .../cityscapes_segformer_dataset_params.yaml | 45 ++ .../training/models/all_architectures.py | 14 + .../models/segmentation_models/segformer.py | 619 ++++++++++++++++++ 10 files changed, 1336 insertions(+) create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b0.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b1.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b2.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b3.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b4.yaml create mode 100644 src/super_gradients/recipes/cityscapes_segformer_b5.yaml create mode 100644 src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml create mode 100644 src/super_gradients/training/models/segmentation_models/segformer.py diff --git a/src/super_gradients/common/object_names.py b/src/super_gradients/common/object_names.py index 10443ae137..c650911747 100644 --- a/src/super_gradients/common/object_names.py +++ b/src/super_gradients/common/object_names.py @@ -288,6 +288,12 @@ class Models: PP_YOLOE_M = "ppyoloe_m" PP_YOLOE_L = "ppyoloe_l" PP_YOLOE_X = "ppyoloe_x" + SEGFORMER_B0 = "segformer_b0" + SEGFORMER_B1 = "segformer_b1" + SEGFORMER_B2 = "segformer_b2" + SEGFORMER_B3 = "segformer_b3" + SEGFORMER_B4 = "segformer_b4" + SEGFORMER_B5 = "segformer_b5" DEKR_CUSTOM = "dekr_custom" diff --git a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml new file mode 100644 index 0000000000..19bbc4810e --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b0.yaml @@ -0,0 +1,107 @@ +# SegFormer-B0 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. 
Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b0 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B0: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b0.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b0 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml new file mode 100644 index 0000000000..6fcf073ff3 --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml @@ -0,0 +1,109 @@ +# SegFormer-B1 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. 
Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b1 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B1: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b1.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b1 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml new file mode 100644 index 0000000000..23e5956ac5 --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml @@ -0,0 +1,109 @@ +# SegFormer-B2 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. 
Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b2 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B2: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b2.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b2 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml new file mode 100644 index 0000000000..4d957502bf --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml @@ -0,0 +1,109 @@ +# SegFormer-B3 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. 
Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b3 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B3: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b3.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b3 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml new file mode 100644 index 0000000000..4d34fd1c95 --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml @@ -0,0 +1,109 @@ +# SegFormer-B4 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. 
Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b4 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B4: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b4.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b4 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml new file mode 100644 index 0000000000..ba8dd776de --- /dev/null +++ b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml @@ -0,0 +1,109 @@ +# SegFormer-B5 segmentation training example with Cityscapes dataset. +# Reproduction of paper: +# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo +# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" +# ( https://arxiv.org/pdf/2105.15203.pdf ) +# +# Official git repo: +# https://github.com/NVlabs/SegFormer +# +# +# Imagenet-1k pre-trained backbone weights taken and adapted from: +# https://github.com/sithu31296/semantic-segmentation +# +# +# Instructions: +# 1. 
We recommend preparing the data according to SG's CityScapes readme file: +# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md +# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly +# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 5. Move to the project root (where you will find the ReadMe and src folder) +# 6. Run the command: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b5 +# +# +# Imagenet-1K pre-trained backbone: +# MiT (Mix Transformer) B5: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b5.pth +# +# 1. Download the weights from the above link and put them in a directory of your choice +# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path +# 3. Ensure checkpoint_params.load_backbone: True + + +defaults: + - training_hyperparams: default_train_params + - dataset_params: cityscapes_segformer_dataset_params + - checkpoint_params: default_checkpoint_params + - _self_ + +architecture: segformer_b5 + +data_root_dir: /data/cityscapes +dataset_params: + train_dataset_params: + root_dir: ${data_root_dir} + val_dataset_params: + root_dir: ${data_root_dir} + +experiment_name: ${architecture}_cityscapes +ckpt_root_dir: + +train_dataloader: cityscapes_train +val_dataloader: cityscapes_val + +cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML + +arch_params: + num_classes: 19 + +checkpoint_params: + checkpoint_path: + load_backbone: True + load_weights_only: True + strict_load: no_key_matching + +load_checkpoint: False + +resume: False +training_hyperparams: + + resume: ${resume} + + max_epochs: 400 + + lr_mode: poly + initial_lr: 0.0002 # for effective batch_size=8 + + optimizer: AdamW + zero_weight_decay_on_bias_and_bn: True + + sync_bn: True + + loss: cross_entropy + criterion_params: + ignore_index: ${cityscapes_ignored_label} + + train_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + valid_metrics_list: + - IoU: + num_classes: 20 + ignore_index: ${cityscapes_ignored_label} + + metric_to_watch: IoU + greater_metric_to_watch_is_better: True + +multi_gpu: DDP +num_gpus: 4 + + +# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA +hydra: + run: + # Set the output directory (i.e. 
where .hydra folder that logs all the input params will be generated) + dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml new file mode 100644 index 0000000000..7d90027a7f --- /dev/null +++ b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml @@ -0,0 +1,45 @@ +defaults: + - cityscapes_dataset_params + - _self_ + +train_dataset_params: + transforms: + - SegColorJitter: + brightness: 0.5 + contrast: 0.5 + saturation: 0.5 + + - SegRandomFlip: + prob: 0.5 + + - SegRandomRescale: + scales: [ 0.5, 2.0 ] + + - SegPadShortToCropSize: + crop_size: [ 1024, 1024 ] + fill_mask: 19 + + - SegCropImageAndMask: + crop_size: [ 1024, 1024 ] + mode: random + +val_dataset_params: + transforms: + - SegRescale: + short_size: 1024 + + - SegPadShortToCropSize: + crop_size: [ 1024, 1024 ] + fill_mask: 19 + + - SegCropImageAndMask: + crop_size: [ 1024, 1024 ] + mode: center + +train_dataloader_params: + batch_size: 2 + shuffle: True + +val_dataloader_params: + batch_size: 2 + shuffle: False diff --git a/src/super_gradients/training/models/all_architectures.py b/src/super_gradients/training/models/all_architectures.py index c681f39733..9bd0af6778 100755 --- a/src/super_gradients/training/models/all_architectures.py +++ b/src/super_gradients/training/models/all_architectures.py @@ -28,6 +28,14 @@ CustomSTDCSegmentation, STDCClassification, ) +from super_gradients.training.models.segmentation_models.segformer import ( + SegFormerB0, + SegFormerB1, + SegFormerB2, + SegFormerB3, + SegFormerB4, + SegFormerB5 +) from super_gradients.training.models.kd_modules.kd_module import KDModule from super_gradients.training.models.classification_models.beit import BeitBasePatch16_224, BeitLargePatch16_224 @@ -141,6 +149,12 @@ Models.PP_YOLOE_M: PPYoloE_M, Models.PP_YOLOE_L: PPYoloE_L, Models.PP_YOLOE_X: PPYoloE_X, + Models.SEGFORMER_B0: SegFormerB0, + Models.SEGFORMER_B1: SegFormerB1, + Models.SEGFORMER_B2: SegFormerB2, + Models.SEGFORMER_B3: SegFormerB3, + Models.SEGFORMER_B4: SegFormerB4, + Models.SEGFORMER_B5: SegFormerB5, # Models.DEKR_CUSTOM: DEKRPoseEstimationModel, } diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py new file mode 100644 index 0000000000..d66442176d --- /dev/null +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -0,0 +1,619 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from super_gradients.training.models import HpmStruct +from super_gradients.training.utils import get_param +from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule +from super_gradients.common.abstractions.abstract_logger import get_logger + +""" +paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers + ( https://arxiv.org/pdf/2105.15203.pdf ) +code adopted from git repo: https://github.com/sithu31296/semantic-segmentation + +Imagenet-1k pre-trained backbone weights taken and adapted from: https://github.com/sithu31296/semantic-segmentation + +""" + +logger = get_logger(__name__) + + +# TODO: this function (and trunc_normal_) are copy-pasted from BEIT model code. 
We need to consider implementing +# it in a more general location +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + logger.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " "The distribution of values may be incorrect.", stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
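+    In this implementation the values are produced with the inverse-CDF method (a uniform sample
+    mapped through the normal inverse CDF, then clamped to [a, b]) rather than by repeated redrawing.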
+ Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class PatchEmbedding(nn.Module): + def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int): + """ + Overlapped patch merging (https://arxiv.org/pdf/2105.15203.pdf) + :param in_channels: number of input channels + :param out_channels: number of output channels (embedding dimension) + :param patch_size: patch size (k for size (k, k)) + :param stride: patch stride (k for size (k, k)) + :param padding: patch padding (k for size (k, k)) + """ + + super().__init__() + + self.proj = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=patch_size, + stride=stride, + padding=padding) + self.norm = nn.LayerNorm(out_channels) + + def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]: + x = self.proj(x) + _, _, h, w = x.shape + + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, h, w + + +class EfficientSelfAttention(nn.Module): + def __init__(self, dim: int, head: int, sr_ratio: int): + """ + Efficient self-attention (https://arxiv.org/pdf/2105.15203.pdf) + :param dim: embedding dimension + :param head: number of attention heads + :param sr_ratio: the reduction ratio of the efficient self-attention + """ + + super().__init__() + + self.head = head + self.sr_ratio = sr_ratio + self.scale = (dim // head) ** -0.5 + self.q = nn.Linear(dim, dim) + self.kv = nn.Linear(dim, dim * 2) + self.proj = nn.Linear(dim, dim) + + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, sr_ratio, sr_ratio) + self.norm = nn.LayerNorm(dim) + + def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: + b, n, c = x.shape + q = self.q(x).reshape(b, n, self.head, c // self.head).permute(0, 2, 1, 3) + + if self.sr_ratio > 1: + x = x.permute(0, 2, 1).reshape(b, c, h, w) + x = self.sr(x).reshape(b, c, -1).permute(0, 2, 1) + x = self.norm(x) + + k, v = self.kv(x).reshape(b, -1, 2, self.head, c // self.head).permute(2, 0, 3, 1, 4) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(b, n, c) + x = self.proj(x) + return x + +class DropPath(nn.Module): + + def __init__(self, drop_p: float = None): + """ + Drop path (stochastic depth). + Taken from: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/layers/common.py + :param drop_p: drop probability + """ + + super().__init__() + + self.drop_p = drop_p + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.drop_p == 0. 
or not self.training: + return x + + kp = 1 - self.drop_p + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + + random_tensor = kp + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + + return x.div(kp) * random_tensor + + +class MixFFN(nn.Module): + def __init__(self, in_dim: int, inter_dim: int): + """ + MixFFN block (https://arxiv.org/pdf/2105.15203.pdf) + :param in_dim: input dimension + :param inter_dim: intermediate dimension + """ + + super().__init__() + + self.fc1 = nn.Linear(in_dim, inter_dim) + self.dwconv = nn.Conv2d( + in_channels=inter_dim, + out_channels=inter_dim, + kernel_size=3, + stride=1, + padding=1, + groups=inter_dim) + self.fc2 = nn.Linear(inter_dim, in_dim) + + def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: + x = self.fc1(x) + + b, _, c = x.shape + x = x.transpose(1, 2).view(b, c, h, w) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + x = self.fc2(F.gelu(x)) + + return x + +class EncoderBlock(nn.Module): + def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float): + """ + A single encoder block (https://arxiv.org/pdf/2105.15203.pdf) + :param dim: embedding dimension + :param head: number of attention heads + :param sr_ratio: the reduction ratio of the efficient self-attention + :param dpr: drop-path ratio + """ + + super().__init__() + + self.attn = EfficientSelfAttention(dim, head, sr_ratio) + + self.drop_path = DropPath(dpr) if dpr > 0. else nn.Identity() + + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + + self.mlp = MixFFN(in_dim=dim, inter_dim=dim*4) + + def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: + x = x + self.drop_path(self.attn(self.norm1(x), h, w)) + x = x + self.drop_path(self.mlp(self.norm2(x), h, w)) + + return x + + +class MiTBackBone(nn.Module): + def __init__( + self, + embed_dims: list, + encoder_layers: list, + eff_self_att_reduction_ratio: list, + eff_self_att_heads: list, + overlap_patch_size: list, + overlap_patch_stride: list, + overlap_patch_pad: list, + in_channels: int + ): + """ + Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf) + :param embed_dims: the patch embedding dimensions (number of output channels in each encoder stage) + :param encoder_layers: the number of encoder layers in each encoder stage + :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage + :param eff_self_att_heads: number of efficient self-attention heads in each stage + :param overlap_patch_size: the patch size of the overlapping patch embedding in each stage + :param overlap_patch_stride: the patch stride of the overlapping patch embedding in each stage + :param overlap_patch_pad: the patch padding of the overlapping patch embedding in each stage + :param in_channels: number of input channels + """ + + super().__init__() + + assert len(embed_dims)==len(encoder_layers)==len(eff_self_att_reduction_ratio)==len(eff_self_att_heads)== \ + len(overlap_patch_size)==len(overlap_patch_stride)==len(overlap_patch_pad), \ + f"All backbone hyper-parameters should be lists of the same length" + + # Patch embeddings + self.patch_embed = [] + for stage_num in range(len(embed_dims)): + self.patch_embed.append( + PatchEmbedding( + in_channels=in_channels if stage_num==0 else embed_dims[stage_num-1], + out_channels=embed_dims[stage_num], + patch_size=overlap_patch_size[stage_num], + stride=overlap_patch_stride[stage_num], + padding=overlap_patch_pad[stage_num] + ) + ) + 
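+            # NOTE: self.patch_embed is a plain Python list rather than an nn.ModuleList, so each stage's
+            # embedding is registered explicitly via add_module below; that is what exposes its parameters
+            # to .parameters() and state_dict().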
self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num]) + + drop_path_rate = 0.1 + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))] + + self.blocks = [] + self.norms = [] + + layer_idx = 0 + for stage_num in range(len(embed_dims)): + self.blocks.append( + nn.ModuleList([ + EncoderBlock( + dim=embed_dims[stage_num], + head=eff_self_att_heads[stage_num], + sr_ratio=eff_self_att_reduction_ratio[stage_num], + dpr=dpr[layer_idx + i]) + for i in range(encoder_layers[stage_num]) + ]) + ) + self.norms.append(nn.LayerNorm(embed_dims[stage_num])) + + self.add_module(f"block{stage_num + 1}", self.blocks[stage_num]) + self.add_module(f"norm{stage_num + 1}", self.norms[stage_num]) + + layer_idx += encoder_layers[stage_num] + + + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: + b_size = x.shape[0] + + features = [] + for stage_num in range(len(self.patch_embed)): + x, h, w = self.patch_embed[stage_num](x) + + for enc_block in self.blocks[stage_num]: + x = enc_block(x, h, w) + x = self.norms[stage_num](x) + x = x.reshape(b_size, h, w, -1).permute(0, 3, 1, 2) + + features.append(x) + + return features + + +class MLP(nn.Module): + def __init__(self, dim, embed_dim): + """ + A single Linear layer, with shape pre-processing + :param dim: input dimension + :param embed_dim: output dimension + """ + + super().__init__() + + self.proj = nn.Linear(dim, embed_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + + return x + +class LinearFuse(nn.Module): + def __init__(self, in_channels: int, out_channels: int): + """ + A linear fusion block (conv + bn + relu) (https://arxiv.org/pdf/2105.15203.pdf) + :param in_channels: number of input channels + :param out_channels: number of output channels + """ + + super().__init__() + + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + bias=False + ) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class SegFormerHead(nn.Module): + def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): + """ + SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf) + :param encoder_dims: list of encoder embedding dimensions + :param embed_dim: unified embedding dimension + :param num_classes: number of predicted classes + """ + super().__init__() + + self.linear_layers = [] + for idx, dim in enumerate(encoder_dims): + self.linear_layers.append(MLP(dim, embed_dim)) + self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx]) + + self.linear_fuse = LinearFuse(in_channels=embed_dim*len(encoder_dims), out_channels=embed_dim) + self.linear_pred = nn.Conv2d(in_channels=embed_dim, + out_channels=num_classes, + kernel_size=1) + + self.dropout = nn.Dropout2d(0.1) + + def forward(self, features: list[torch.Tensor]) -> torch.Tensor: + b, _, h, w = features[0].shape + + out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])] + + for i, feature in enumerate(features[1:]): + out = self.linear_layers[i+1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:]) + out = F.interpolate(out, size=(h, w), mode='bilinear', align_corners=False) + out_lst.append(out) + + out = self.linear_fuse(torch.cat(out_lst[::-1], dim=1)) + out = self.linear_pred(self.dropout(out)) + + return out + + +# TODO: add support for aux heads? 
(not in original impl) (currently not using) +class SegFormer(SegmentationModule): + def __init__( + self, + num_classes: int, + encoder_embed_dims: list, + encoder_layers: list, + eff_self_att_reduction_ratio: list, + eff_self_att_heads: list, + decoder_embed_dim: int, + overlap_patch_size: list, + overlap_patch_stride: list, + overlap_patch_pad: list, + in_channels: int = 3 + ): + """ + :param num_classes: number of classes + :param encoder_embed_dims: the patch embedding dimensions (number of output channels in each encoder stage) + :param encoder_layers: the number of encoder layers in each encoder stage + :param eff_self_att_reduction_ratio: the reduction ratios of the efficient self-attention in each stage + :param eff_self_att_heads: number of efficient self-attention heads in each stage + :param overlap_patch_size: the patch size of the overlapping patch embedding in each stage + :param overlap_patch_stride: the patch stride of the overlapping patch embedding in each stage + :param overlap_patch_pad: the patch padding of the overlapping patch embedding in each stage + :param in_channels: number of input channels + """ + + super().__init__(use_aux_heads=False) + + self.encoder_embed_dims = encoder_embed_dims + + self._backbone = MiTBackBone( + embed_dims=encoder_embed_dims, + encoder_layers=encoder_layers, + eff_self_att_reduction_ratio=eff_self_att_reduction_ratio, + eff_self_att_heads=eff_self_att_heads, + overlap_patch_size=overlap_patch_size, + overlap_patch_stride=overlap_patch_stride, + overlap_patch_pad=overlap_patch_pad, + in_channels=in_channels + ) + + self.decode_head = SegFormerHead( + encoder_dims=encoder_embed_dims, + embed_dim=decoder_embed_dim, + num_classes=num_classes + ) + + self.init_params() + + def init_params(self): + + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d, nn.SyncBatchNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + @property + def backbone(self): + return self._backbone + + def _remove_auxiliary_heads(self): + pass + + def replace_head(self, new_num_classes: int, new_decoder_embed_dim: int): + self.decode_head = SegFormerHead( + encoder_dims=self.encoder_embed_dims, + embed_dim=new_decoder_embed_dim, + num_classes=new_num_classes + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + features = self._backbone(x) + out = self.decode_head(features) + out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) + return out + + def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: + """ + Custom param groups for training: + - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. 
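+        The backbone parameter group keeps the base lr; the remaining parameters (the decode head) use lr * multiply_head_lr.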
+ """ + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() + param_groups = [ + {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, + {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, + ] + return param_groups + + def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + for param_group in param_groups: + param_group["lr"] = lr + if param_group["name"] == "multiply_lr_params": + param_group["lr"] *= multiply_head_lr + return param_groups + + def _separate_lr_multiply_params(self): + """ + Separate backbone params from the rest. + :return: iterators of groups named_parameters. + """ + backbone_names = [n for n, p in self.backbone.named_parameters()] + multiply_lr_params, no_multiply_params = {}, {} + for name, param in self.named_parameters(): + if name in backbone_names: + no_multiply_params[name] = param + else: + multiply_lr_params[name] = param + return multiply_lr_params.items(), no_multiply_params.items() + + +class SegFormerCustom(SegFormer): + def __init__(self, arch_params: HpmStruct): + """Parse arch_params and translate the parameters to build the SegFormer architecture""" + super().__init__( + num_classes=arch_params.num_classes, + encoder_embed_dims=arch_params.encoder_embed_dims, + encoder_layers=arch_params.encoder_layers, + eff_self_att_reduction_ratio=arch_params.eff_self_att_reduction_ratio, + eff_self_att_heads=arch_params.eff_self_att_heads, + decoder_embed_dim=arch_params.decoder_embed_dim, + overlap_patch_size=arch_params.overlap_patch_size, + overlap_patch_stride=arch_params.overlap_patch_stride, + overlap_patch_pad=arch_params.overlap_patch_pad, + in_channels=arch_params.in_channels + ) + + +DEFAULT_SEGFORMER_PARAMS = { + "in_channels": 3, + "overlap_patch_size": [7, 3, 3, 3], + "overlap_patch_stride": [4, 2, 2, 2], + "overlap_patch_pad": [3, 1, 1, 1], + "eff_self_att_reduction_ratio": [8, 4, 2, 1], + "eff_self_att_heads": [1, 2, 5, 8], +} + +DEFAULT_SEGFORMER_B0_PARAMS = { + **DEFAULT_SEGFORMER_PARAMS, + "encoder_embed_dims": [32, 64, 160, 256], + "encoder_layers": [2, 2, 2, 2], + "decoder_embed_dim": 256 +} + +DEFAULT_SEGFORMER_B1_PARAMS = { + **DEFAULT_SEGFORMER_B0_PARAMS, + "encoder_embed_dims": [64, 128, 320, 512], +} + +DEFAULT_SEGFORMER_B2_PARAMS = { + **DEFAULT_SEGFORMER_B1_PARAMS, + "encoder_layers": [3, 4, 6, 3], + "decoder_embed_dim": 768 +} + +DEFAULT_SEGFORMER_B3_PARAMS = { + **DEFAULT_SEGFORMER_B2_PARAMS, + "encoder_layers": [3, 4, 18, 3], +} + +DEFAULT_SEGFORMER_B4_PARAMS = { + **DEFAULT_SEGFORMER_B2_PARAMS, + "encoder_layers": [3, 8, 27, 3], +} + +DEFAULT_SEGFORMER_B5_PARAMS = { + **DEFAULT_SEGFORMER_B2_PARAMS, + "encoder_layers": [3, 6, 40, 3], +} + + +class SegFormerB0(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB1(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB2(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS) + 
_arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB3(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB4(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) + +class SegFormerB5(SegFormerCustom): + def __init__(self, arch_params: HpmStruct): + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS) + _arch_params.override(**arch_params.to_dict()) + super().__init__(_arch_params) From 932bb738962c13461e56e1fe2c9292fee99d14ff Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 27 Feb 2023 18:54:32 +0200 Subject: [PATCH 02/12] Update segformer.py --- .../models/segmentation_models/segformer.py | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index d66442176d..712295b032 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -486,40 +486,40 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) return out - def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: - """ - Custom param groups for training: - - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. - """ - multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() - param_groups = [ - {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, - {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, - ] - return param_groups - - def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: - multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - for param_group in param_groups: - param_group["lr"] = lr - if param_group["name"] == "multiply_lr_params": - param_group["lr"] *= multiply_head_lr - return param_groups - - def _separate_lr_multiply_params(self): - """ - Separate backbone params from the rest. - :return: iterators of groups named_parameters. - """ - backbone_names = [n for n, p in self.backbone.named_parameters()] - multiply_lr_params, no_multiply_params = {}, {} - for name, param in self.named_parameters(): - if name in backbone_names: - no_multiply_params[name] = param - else: - multiply_lr_params[name] = param - return multiply_lr_params.items(), no_multiply_params.items() + # def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: + # """ + # Custom param groups for training: + # - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. 
+ # """ + # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + # multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() + # param_groups = [ + # {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, + # {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, + # ] + # return param_groups + # + # def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: + # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + # for param_group in param_groups: + # param_group["lr"] = lr + # if param_group["name"] == "multiply_lr_params": + # param_group["lr"] *= multiply_head_lr + # return param_groups + # + # def _separate_lr_multiply_params(self): + # """ + # Separate backbone params from the rest. + # :return: iterators of groups named_parameters. + # """ + # backbone_names = [n for n, p in self.backbone.named_parameters()] + # multiply_lr_params, no_multiply_params = {}, {} + # for name, param in self.named_parameters(): + # if name in backbone_names: + # no_multiply_params[name] = param + # else: + # multiply_lr_params[name] = param + # return multiply_lr_params.items(), no_multiply_params.items() class SegFormerCustom(SegFormer): From 78aa7707528ad97c3c97a0a63b7f60180247959c Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 27 Feb 2023 22:55:40 +0200 Subject: [PATCH 03/12] Update segformer.py --- .../models/segmentation_models/segformer.py | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 712295b032..d66442176d 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -486,40 +486,40 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) return out - # def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: - # """ - # Custom param groups for training: - # - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. - # """ - # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - # multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() - # param_groups = [ - # {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, - # {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, - # ] - # return param_groups - # - # def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: - # multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) - # for param_group in param_groups: - # param_group["lr"] = lr - # if param_group["name"] == "multiply_lr_params": - # param_group["lr"] *= multiply_head_lr - # return param_groups - # - # def _separate_lr_multiply_params(self): - # """ - # Separate backbone params from the rest. - # :return: iterators of groups named_parameters. 
- # """ - # backbone_names = [n for n, p in self.backbone.named_parameters()] - # multiply_lr_params, no_multiply_params = {}, {} - # for name, param in self.named_parameters(): - # if name in backbone_names: - # no_multiply_params[name] = param - # else: - # multiply_lr_params[name] = param - # return multiply_lr_params.items(), no_multiply_params.items() + def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: + """ + Custom param groups for training: + - Different lr for backbone and the rest, if `multiply_head_lr` key is in `training_params`. + """ + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + multiply_lr_params, no_multiply_params = self._separate_lr_multiply_params() + param_groups = [ + {"named_params": no_multiply_params, "lr": lr, "name": "no_multiply_params"}, + {"named_params": multiply_lr_params, "lr": lr * multiply_head_lr, "name": "multiply_lr_params"}, + ] + return param_groups + + def update_param_groups(self, param_groups: list, lr: float, epoch: int, iter: int, training_params: HpmStruct, total_batch: int) -> list: + multiply_head_lr = get_param(training_params, "multiply_head_lr", 1) + for param_group in param_groups: + param_group["lr"] = lr + if param_group["name"] == "multiply_lr_params": + param_group["lr"] *= multiply_head_lr + return param_groups + + def _separate_lr_multiply_params(self): + """ + Separate backbone params from the rest. + :return: iterators of groups named_parameters. + """ + backbone_names = [n for n, p in self.backbone.named_parameters()] + multiply_lr_params, no_multiply_params = {}, {} + for name, param in self.named_parameters(): + if name in backbone_names: + no_multiply_params[name] = param + else: + multiply_lr_params[name] = param + return multiply_lr_params.items(), no_multiply_params.items() class SegFormerCustom(SegFormer): From a321c22e512c42ae0eafa2cad0c8ba043d1572bc Mon Sep 17 00:00:00 2001 From: eran-deci Date: Tue, 28 Feb 2023 16:37:00 +0200 Subject: [PATCH 04/12] Update segformer.py --- .../models/segmentation_models/segformer.py | 142 ++++++++---------- 1 file changed, 61 insertions(+), 81 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index d66442176d..5684cfcff2 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -58,7 +58,7 @@ def norm_cdf(x): def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): - # type: (Tensor, float, float, float, float) -> Tensor + # type: (torch.Tensor, float, float, float, float) -> torch.Tensor r"""Fills the input Tensor with values drawn from a truncated normal distribution. 
The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` @@ -91,12 +91,7 @@ def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: super().__init__() - self.proj = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=patch_size, - stride=stride, - padding=padding) + self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding) self.norm = nn.LayerNorm(out_channels) def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]: @@ -149,8 +144,8 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: x = self.proj(x) return x -class DropPath(nn.Module): +class DropPath(nn.Module): def __init__(self, drop_p: float = None): """ Drop path (stochastic depth). @@ -163,7 +158,7 @@ def __init__(self, drop_p: float = None): self.drop_p = drop_p def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.drop_p == 0. or not self.training: + if self.drop_p == 0.0 or not self.training: return x kp = 1 - self.drop_p @@ -186,13 +181,7 @@ def __init__(self, in_dim: int, inter_dim: int): super().__init__() self.fc1 = nn.Linear(in_dim, inter_dim) - self.dwconv = nn.Conv2d( - in_channels=inter_dim, - out_channels=inter_dim, - kernel_size=3, - stride=1, - padding=1, - groups=inter_dim) + self.dwconv = nn.Conv2d(in_channels=inter_dim, out_channels=inter_dim, kernel_size=3, stride=1, padding=1, groups=inter_dim) self.fc2 = nn.Linear(inter_dim, in_dim) def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: @@ -207,6 +196,7 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: return x + class EncoderBlock(nn.Module): def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float): """ @@ -221,12 +211,12 @@ def __init__(self, dim: int, head: int, sr_ratio: int, dpr: float): self.attn = EfficientSelfAttention(dim, head, sr_ratio) - self.drop_path = DropPath(dpr) if dpr > 0. 
else nn.Identity() + self.drop_path = DropPath(dpr) if dpr > 0.0 else nn.Identity() self.norm1 = nn.LayerNorm(dim) self.norm2 = nn.LayerNorm(dim) - self.mlp = MixFFN(in_dim=dim, inter_dim=dim*4) + self.mlp = MixFFN(in_dim=dim, inter_dim=dim * 4) def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: x = x + self.drop_path(self.attn(self.norm1(x), h, w)) @@ -237,15 +227,15 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: class MiTBackBone(nn.Module): def __init__( - self, - embed_dims: list, - encoder_layers: list, - eff_self_att_reduction_ratio: list, - eff_self_att_heads: list, - overlap_patch_size: list, - overlap_patch_stride: list, - overlap_patch_pad: list, - in_channels: int + self, + embed_dims: list, + encoder_layers: list, + eff_self_att_reduction_ratio: list, + eff_self_att_heads: list, + overlap_patch_size: list, + overlap_patch_stride: list, + overlap_patch_pad: list, + in_channels: int, ): """ Mixed Transformer backbone encoder (https://arxiv.org/pdf/2105.15203.pdf) @@ -261,41 +251,50 @@ def __init__( super().__init__() - assert len(embed_dims)==len(encoder_layers)==len(eff_self_att_reduction_ratio)==len(eff_self_att_heads)== \ - len(overlap_patch_size)==len(overlap_patch_stride)==len(overlap_patch_pad), \ - f"All backbone hyper-parameters should be lists of the same length" + assert ( + len(embed_dims) + == len(encoder_layers) + == len(eff_self_att_reduction_ratio) + == len(eff_self_att_heads) + == len(overlap_patch_size) + == len(overlap_patch_stride) + == len(overlap_patch_pad) + ), "All backbone hyper-parameters should be lists of the same length" # Patch embeddings self.patch_embed = [] for stage_num in range(len(embed_dims)): self.patch_embed.append( PatchEmbedding( - in_channels=in_channels if stage_num==0 else embed_dims[stage_num-1], + in_channels=in_channels if stage_num == 0 else embed_dims[stage_num - 1], out_channels=embed_dims[stage_num], patch_size=overlap_patch_size[stage_num], stride=overlap_patch_stride[stage_num], - padding=overlap_patch_pad[stage_num] + padding=overlap_patch_pad[stage_num], ) ) self.add_module(f"patch_embed{stage_num+1}", self.patch_embed[stage_num]) drop_path_rate = 0.1 dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(encoder_layers))] - + self.blocks = [] self.norms = [] - + layer_idx = 0 for stage_num in range(len(embed_dims)): self.blocks.append( - nn.ModuleList([ - EncoderBlock( - dim=embed_dims[stage_num], - head=eff_self_att_heads[stage_num], - sr_ratio=eff_self_att_reduction_ratio[stage_num], - dpr=dpr[layer_idx + i]) - for i in range(encoder_layers[stage_num]) - ]) + nn.ModuleList( + [ + EncoderBlock( + dim=embed_dims[stage_num], + head=eff_self_att_heads[stage_num], + sr_ratio=eff_self_att_reduction_ratio[stage_num], + dpr=dpr[layer_idx + i], + ) + for i in range(encoder_layers[stage_num]) + ] + ) ) self.norms.append(nn.LayerNorm(embed_dims[stage_num])) @@ -304,7 +303,6 @@ def __init__( layer_idx += encoder_layers[stage_num] - def forward(self, x: torch.Tensor) -> list[torch.Tensor]: b_size = x.shape[0] @@ -340,6 +338,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x + class LinearFuse(nn.Module): def __init__(self, in_channels: int, out_channels: int): """ @@ -350,12 +349,7 @@ def __init__(self, in_channels: int, out_channels: int): super().__init__() - self.conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - bias=False - ) + self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False) 
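+        # bias is omitted in the 1x1 fusion conv since the BatchNorm that follows applies its own learned shift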
self.bn = nn.BatchNorm2d(out_channels) self.relu = nn.ReLU(inplace=True) @@ -378,10 +372,8 @@ def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): self.linear_layers.append(MLP(dim, embed_dim)) self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx]) - self.linear_fuse = LinearFuse(in_channels=embed_dim*len(encoder_dims), out_channels=embed_dim) - self.linear_pred = nn.Conv2d(in_channels=embed_dim, - out_channels=num_classes, - kernel_size=1) + self.linear_fuse = LinearFuse(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim) + self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1) self.dropout = nn.Dropout2d(0.1) @@ -391,8 +383,8 @@ def forward(self, features: list[torch.Tensor]) -> torch.Tensor: out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])] for i, feature in enumerate(features[1:]): - out = self.linear_layers[i+1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:]) - out = F.interpolate(out, size=(h, w), mode='bilinear', align_corners=False) + out = self.linear_layers[i + 1](feature).permute(0, 2, 1).reshape(b, -1, *feature.shape[-2:]) + out = F.interpolate(out, size=(h, w), mode="bilinear", align_corners=False) out_lst.append(out) out = self.linear_fuse(torch.cat(out_lst[::-1], dim=1)) @@ -414,7 +406,7 @@ def __init__( overlap_patch_size: list, overlap_patch_stride: list, overlap_patch_pad: list, - in_channels: int = 3 + in_channels: int = 3, ): """ :param num_classes: number of classes @@ -440,14 +432,10 @@ def __init__( overlap_patch_size=overlap_patch_size, overlap_patch_stride=overlap_patch_stride, overlap_patch_pad=overlap_patch_pad, - in_channels=in_channels + in_channels=in_channels, ) - self.decode_head = SegFormerHead( - encoder_dims=encoder_embed_dims, - embed_dim=decoder_embed_dim, - num_classes=num_classes - ) + self.decode_head = SegFormerHead(encoder_dims=encoder_embed_dims, embed_dim=decoder_embed_dim, num_classes=num_classes) self.init_params() @@ -455,7 +443,7 @@ def init_params(self): for m in self.modules(): if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) + trunc_normal_(m.weight, std=0.02) if m.bias is not None: nn.init.zeros_(m.bias) elif isinstance(m, nn.Conv2d): @@ -474,16 +462,12 @@ def _remove_auxiliary_heads(self): pass def replace_head(self, new_num_classes: int, new_decoder_embed_dim: int): - self.decode_head = SegFormerHead( - encoder_dims=self.encoder_embed_dims, - embed_dim=new_decoder_embed_dim, - num_classes=new_num_classes - ) + self.decode_head = SegFormerHead(encoder_dims=self.encoder_embed_dims, embed_dim=new_decoder_embed_dim, num_classes=new_num_classes) def forward(self, x: torch.Tensor) -> torch.Tensor: features = self._backbone(x) out = self.decode_head(features) - out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False) + out = F.interpolate(out, size=x.shape[2:], mode="bilinear", align_corners=False) return out def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: @@ -535,7 +519,7 @@ def __init__(self, arch_params: HpmStruct): overlap_patch_size=arch_params.overlap_patch_size, overlap_patch_stride=arch_params.overlap_patch_stride, overlap_patch_pad=arch_params.overlap_patch_pad, - in_channels=arch_params.in_channels + in_channels=arch_params.in_channels, ) @@ -548,23 +532,14 @@ def __init__(self, arch_params: HpmStruct): "eff_self_att_heads": [1, 2, 5, 8], } -DEFAULT_SEGFORMER_B0_PARAMS = { - **DEFAULT_SEGFORMER_PARAMS, - 
"encoder_embed_dims": [32, 64, 160, 256], - "encoder_layers": [2, 2, 2, 2], - "decoder_embed_dim": 256 -} +DEFAULT_SEGFORMER_B0_PARAMS = {**DEFAULT_SEGFORMER_PARAMS, "encoder_embed_dims": [32, 64, 160, 256], "encoder_layers": [2, 2, 2, 2], "decoder_embed_dim": 256} DEFAULT_SEGFORMER_B1_PARAMS = { **DEFAULT_SEGFORMER_B0_PARAMS, "encoder_embed_dims": [64, 128, 320, 512], } -DEFAULT_SEGFORMER_B2_PARAMS = { - **DEFAULT_SEGFORMER_B1_PARAMS, - "encoder_layers": [3, 4, 6, 3], - "decoder_embed_dim": 768 -} +DEFAULT_SEGFORMER_B2_PARAMS = {**DEFAULT_SEGFORMER_B1_PARAMS, "encoder_layers": [3, 4, 6, 3], "decoder_embed_dim": 768} DEFAULT_SEGFORMER_B3_PARAMS = { **DEFAULT_SEGFORMER_B2_PARAMS, @@ -588,30 +563,35 @@ def __init__(self, arch_params: HpmStruct): _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB1(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB2(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB3(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB4(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) + class SegFormerB5(SegFormerCustom): def __init__(self, arch_params: HpmStruct): _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS) From abb99b84fbd1e37aeb9ed54320babc211c53e6c7 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 1 Mar 2023 16:07:47 +0200 Subject: [PATCH 05/12] update segformer.py, cityscapes_segformer_dataset_params.yaml --- .../cityscapes_segformer_dataset_params.yaml | 6 +- .../models/segmentation_models/segformer.py | 95 +------------------ 2 files changed, 6 insertions(+), 95 deletions(-) diff --git a/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml index 7d90027a7f..3e706b97f4 100644 --- a/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml +++ b/src/super_gradients/recipes/dataset_params/cityscapes_segformer_dataset_params.yaml @@ -26,16 +26,12 @@ train_dataset_params: val_dataset_params: transforms: - SegRescale: - short_size: 1024 + long_size: 1024 - SegPadShortToCropSize: crop_size: [ 1024, 1024 ] fill_mask: 19 - - SegCropImageAndMask: - crop_size: [ 1024, 1024 ] - mode: center - train_dataloader_params: batch_size: 2 shuffle: True diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 5684cfcff2..fb35ae94b3 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -1,5 +1,3 @@ -import math - import torch import torch.nn as nn import torch.nn.functional as F @@ -7,7 +5,7 @@ from super_gradients.training.models import HpmStruct from super_gradients.training.utils import get_param from 
super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule -from super_gradients.common.abstractions.abstract_logger import get_logger +from super_gradients.training.utils.regularization_utils import DropPath """ paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers @@ -18,65 +16,6 @@ """ -logger = get_logger(__name__) - - -# TODO: this function (and trunc_normal_) are copy-pasted from BEIT model code. We need to consider implementing -# it in a more general location -def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 - - if (mean < a - 2 * std) or (mean > b + 2 * std): - logger.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " "The distribution of values may be incorrect.", stacklevel=2) - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - lower = norm_cdf((a - mean) / std) - upper = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - tensor.uniform_(2 * lower - 1, 2 * upper - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.0)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): - # type: (torch.Tensor, float, float, float, float) -> torch.Tensor - r"""Fills the input Tensor with values drawn from a truncated - normal distribution. The values are effectively drawn from the - normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. - Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - return _no_grad_trunc_normal_(tensor, mean, std, a, b) - class PatchEmbedding(nn.Module): def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int): @@ -145,31 +84,6 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: return x -class DropPath(nn.Module): - def __init__(self, drop_p: float = None): - """ - Drop path (stochastic depth). 
- Taken from: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/layers/common.py - :param drop_p: drop probability - """ - - super().__init__() - - self.drop_p = drop_p - - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.drop_p == 0.0 or not self.training: - return x - - kp = 1 - self.drop_p - shape = (x.shape[0],) + (1,) * (x.ndim - 1) - - random_tensor = kp + torch.rand(shape, dtype=x.dtype, device=x.device) - random_tensor.floor_() # binarize - - return x.div(kp) * random_tensor - - class MixFFN(nn.Module): def __init__(self, in_dim: int, inter_dim: int): """ @@ -251,7 +165,7 @@ def __init__( super().__init__() - assert ( + if not ( len(embed_dims) == len(encoder_layers) == len(eff_self_att_reduction_ratio) @@ -259,7 +173,8 @@ def __init__( == len(overlap_patch_size) == len(overlap_patch_stride) == len(overlap_patch_pad) - ), "All backbone hyper-parameters should be lists of the same length" + ): + raise ValueError("All backbone hyper-parameters should be lists of the same length") # Patch embeddings self.patch_embed = [] @@ -443,7 +358,7 @@ def init_params(self): for m in self.modules(): if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) + torch.nn.init.trunc_normal_(m.weight, std=0.02) if m.bias is not None: nn.init.zeros_(m.bias) elif isinstance(m, nn.Conv2d): From e30b6bff26ded006bc9f0bd34459114010c67f99 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Fri, 10 Mar 2023 12:33:45 +0200 Subject: [PATCH 06/12] Update all_architectures.py --- src/super_gradients/training/models/all_architectures.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/super_gradients/training/models/all_architectures.py b/src/super_gradients/training/models/all_architectures.py index 3bda4b2418..50d5884101 100755 --- a/src/super_gradients/training/models/all_architectures.py +++ b/src/super_gradients/training/models/all_architectures.py @@ -36,7 +36,7 @@ SegFormerB2, SegFormerB3, SegFormerB4, - SegFormerB5 + SegFormerB5, ) from super_gradients.training.models.kd_modules.kd_module import KDModule @@ -158,7 +158,6 @@ Models.SEGFORMER_B3: SegFormerB3, Models.SEGFORMER_B4: SegFormerB4, Models.SEGFORMER_B5: SegFormerB5, - # Models.DEKR_CUSTOM: DEKRPoseEstimationModel, Models.DEKR_W32_NO_DC: DEKRW32, Models.POSE_PP_YOLO_L: PosePPYoloL, From 6bc26a2cc8ad56ed779a5cb6aa9ca249dee8312e Mon Sep 17 00:00:00 2001 From: eran-deci Date: Fri, 10 Mar 2023 12:46:02 +0200 Subject: [PATCH 07/12] Update segformer.py --- .../training/models/segmentation_models/segformer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index fb35ae94b3..773820dfa9 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -7,6 +7,8 @@ from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath +from typing import List, Tuple + """ paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers ( https://arxiv.org/pdf/2105.15203.pdf ) @@ -33,7 +35,7 @@ def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: self.proj = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=patch_size, stride=stride, padding=padding) self.norm = 
nn.LayerNorm(out_channels) - def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]: + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, int, int]: x = self.proj(x) _, _, h, w = x.shape @@ -218,7 +220,7 @@ def __init__( layer_idx += encoder_layers[stage_num] - def forward(self, x: torch.Tensor) -> list[torch.Tensor]: + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: b_size = x.shape[0] features = [] @@ -292,7 +294,7 @@ def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): self.dropout = nn.Dropout2d(0.1) - def forward(self, features: list[torch.Tensor]) -> torch.Tensor: + def forward(self, features: List[torch.Tensor]) -> torch.Tensor: b, _, h, w = features[0].shape out_lst = [self.linear_layers[0](features[0]).permute(0, 2, 1).reshape(b, -1, *features[0].shape[-2:])] From 73c40c1cac41a36df65b38fbbb41be5ed0a16a0a Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 15 Mar 2023 15:00:13 +0200 Subject: [PATCH 08/12] update segformer.py, unite all segformer recipes --- ...rmer_b0.yaml => cityscapes_segformer.yaml} | 36 +++--- .../recipes/cityscapes_segformer_b1.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b2.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b3.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b4.yaml | 109 ------------------ .../recipes/cityscapes_segformer_b5.yaml | 109 ------------------ .../models/segmentation_models/segformer.py | 101 +++++++++------- 7 files changed, 85 insertions(+), 597 deletions(-) rename src/super_gradients/recipes/{cityscapes_segformer_b0.yaml => cityscapes_segformer.yaml} (62%) delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b1.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b2.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b3.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b4.yaml delete mode 100644 src/super_gradients/recipes/cityscapes_segformer_b5.yaml diff --git a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml b/src/super_gradients/recipes/cityscapes_segformer.yaml similarity index 62% rename from src/super_gradients/recipes/cityscapes_segformer_b0.yaml rename to src/super_gradients/recipes/cityscapes_segformer.yaml index 19bbc4810e..b78fb932df 100644 --- a/src/super_gradients/recipes/cityscapes_segformer_b0.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer.yaml @@ -1,4 +1,4 @@ -# SegFormer-B0 segmentation training example with Cityscapes dataset. +# SegFormer segmentation training example with Cityscapes dataset. # Reproduction of paper: # Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo # "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" @@ -7,27 +7,37 @@ # Official git repo: # https://github.com/NVlabs/SegFormer # -# Imagenet-1k pre-trained backbone weights taken and adapted from: +# Code and Imagenet-1k pre-trained backbone weights taken and adapted from: # https://github.com/sithu31296/semantic-segmentation # # Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: +# 1. Choose SegFormer architecture (b0 - b5) by changing the value of the "architecture" field below +# 2. We recommend preparing the data according to SG's CityScapes readme file: # https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. 
Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and +# 3. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and # "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b0 +# 4. Edit the "data_root_dir" field below to point to the absolute path of the data root directory +# 5. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs +# 6. Move to the project root (where you will find the ReadMe and src folder) +# 7. Run the command (change: +# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer # # # Imagenet-1K pre-trained backbone: # MiT (Mix Transformer) B0: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b0.pth +# B1: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b1.pth +# B2: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b2.pth +# B3: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b3.pth +# B4: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b4.pth +# B5: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b5.pth # # 1. Download the weights from the above link and put them in a directory of your choice # 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path # 3. Ensure checkpoint_params.load_backbone: True +# +# Performance and training details: +# SegFormer-B0: mIoU (sliding-window inference) on validation set: 76.25 +# training time: 17 hours with 3 A10G GPUs with DDP, ~3 minuets / epoch defaults: @@ -36,7 +46,7 @@ defaults: - checkpoint_params: default_checkpoint_params - _self_ -architecture: segformer_b0 +architecture: segformer_b0 # segformer_b1, segformer_b2, segformer_b3, segformer_b4, segformer_b5 data_root_dir: /data/cityscapes dataset_params: @@ -46,7 +56,7 @@ dataset_params: root_dir: ${data_root_dir} experiment_name: ${architecture}_cityscapes -ckpt_root_dir: +ckpt_root_dir: /home/eran.shachar/PycharmProjects/super-gradients/checkpoints train_dataloader: cityscapes_train val_dataloader: cityscapes_val @@ -57,7 +67,7 @@ arch_params: num_classes: 19 checkpoint_params: - checkpoint_path: + checkpoint_path: /home/eran.shachar/data/segformer_pretrained_weights/mit_b0.pth load_backbone: True load_weights_only: True strict_load: no_key_matching @@ -97,7 +107,7 @@ training_hyperparams: greater_metric_to_watch_is_better: True multi_gpu: DDP -num_gpus: 4 +num_gpus: 3 # THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA diff --git a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml b/src/super_gradients/recipes/cityscapes_segformer_b1.yaml deleted file mode 100644 index 6fcf073ff3..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b1.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B1 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. 
Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b1 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B1: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b1.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b1 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml b/src/super_gradients/recipes/cityscapes_segformer_b2.yaml deleted file mode 100644 index 23e5956ac5..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b2.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B2 segmentation training example with Cityscapes dataset. 
-# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b2 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B2: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b2.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b2 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. 
where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml b/src/super_gradients/recipes/cityscapes_segformer_b3.yaml deleted file mode 100644 index 4d957502bf..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b3.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B3 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b3 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B3: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b3.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. 
Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b3 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml b/src/super_gradients/recipes/cityscapes_segformer_b4.yaml deleted file mode 100644 index 4d34fd1c95..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b4.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B4 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b4 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B4: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b4.pth -# -# 1. Download the weights from the above link and put them in a directory of your choice -# 2. 
Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b4 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml b/src/super_gradients/recipes/cityscapes_segformer_b5.yaml deleted file mode 100644 index ba8dd776de..0000000000 --- a/src/super_gradients/recipes/cityscapes_segformer_b5.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# SegFormer-B5 segmentation training example with Cityscapes dataset. -# Reproduction of paper: -# Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo -# "SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers" -# ( https://arxiv.org/pdf/2105.15203.pdf ) -# -# Official git repo: -# https://github.com/NVlabs/SegFormer -# -# -# Imagenet-1k pre-trained backbone weights taken and adapted from: -# https://github.com/sithu31296/semantic-segmentation -# -# -# Instructions: -# 1. We recommend preparing the data according to SG's CityScapes readme file: -# https://github.com/Deci-AI/super-gradients/blob/master/src/super_gradients/training/datasets/Dataset_Setup_Instructions.md -# 2. Note: if you change the dataset's internal directory structure, make changes to the fields "list_file" and -# "labels_csv_path" of both "train_dataset_params" and "val_dataset_params" accordingly -# 3. Edit the "data_root_dir" field below to point to the absolute path of the data root directory -# 4. Edit the "ckpt_root_dir" field to the path where you want to save checkpoints and logs -# 5. Move to the project root (where you will find the ReadMe and src folder) -# 6. Run the command: -# python src/super_gradients/examples/train_from_recipe_example/train_from_recipe.py --config-name=cityscapes_segformer_b5 -# -# -# Imagenet-1K pre-trained backbone: -# MiT (Mix Transformer) B5: https://deci-pretrained-models.s3.amazonaws.com/mit_backbones/mit_b5.pth -# -# 1. 
Download the weights from the above link and put them in a directory of your choice -# 2. Below, insert the weights file's full path to checkpoint_params.checkpoint_path -# 3. Ensure checkpoint_params.load_backbone: True - - -defaults: - - training_hyperparams: default_train_params - - dataset_params: cityscapes_segformer_dataset_params - - checkpoint_params: default_checkpoint_params - - _self_ - -architecture: segformer_b5 - -data_root_dir: /data/cityscapes -dataset_params: - train_dataset_params: - root_dir: ${data_root_dir} - val_dataset_params: - root_dir: ${data_root_dir} - -experiment_name: ${architecture}_cityscapes -ckpt_root_dir: - -train_dataloader: cityscapes_train -val_dataloader: cityscapes_val - -cityscapes_ignored_label: 19 # convenience parameter since it is used in many places in the YAML - -arch_params: - num_classes: 19 - -checkpoint_params: - checkpoint_path: - load_backbone: True - load_weights_only: True - strict_load: no_key_matching - -load_checkpoint: False - -resume: False -training_hyperparams: - - resume: ${resume} - - max_epochs: 400 - - lr_mode: poly - initial_lr: 0.0002 # for effective batch_size=8 - - optimizer: AdamW - zero_weight_decay_on_bias_and_bn: True - - sync_bn: True - - loss: cross_entropy - criterion_params: - ignore_index: ${cityscapes_ignored_label} - - train_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - valid_metrics_list: - - IoU: - num_classes: 20 - ignore_index: ${cityscapes_ignored_label} - - metric_to_watch: IoU - greater_metric_to_watch_is_better: True - -multi_gpu: DDP -num_gpus: 4 - - -# THE FOLLOWING PARAMS ARE DIRECTLY USED BY HYDRA -hydra: - run: - # Set the output directory (i.e. where .hydra folder that logs all the input params will be generated) - dir: ${hydra_output_dir:${ckpt_root_dir}, ${experiment_name}} diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 773820dfa9..c4422b1610 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -6,19 +6,22 @@ from super_gradients.training.utils import get_param from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath +from super_gradients.modules.conv_bn_relu_block import ConvBNReLU + from typing import List, Tuple """ paper: SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers ( https://arxiv.org/pdf/2105.15203.pdf ) -code adopted from git repo: https://github.com/sithu31296/semantic-segmentation - -Imagenet-1k pre-trained backbone weights taken and adapted from: https://github.com/sithu31296/semantic-segmentation +Code and Imagenet-1k pre-trained backbone weights adopted from GitHub repo: +https://github.com/sithu31296/semantic-segmentation """ +# TODO: extract this block to src/super_gradients/modules/transformer_modules and reuse the same module of Beit and +# other ViTs class PatchEmbedding(nn.Module): def __init__(self, in_channels: int, out_channels: int, patch_size: int, stride: int, padding: int): """ @@ -45,6 +48,8 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, int, int]: return x, h, w +# TODO: extract this block to src/super_gradients/modules/transformer_modules and reuse the same module of Beit and +# other ViTs class EfficientSelfAttention(nn.Module): def 
__init__(self, dim: int, head: int, sr_ratio: int): """ @@ -144,13 +149,13 @@ def forward(self, x: torch.Tensor, h: int, w: int) -> torch.Tensor: class MiTBackBone(nn.Module): def __init__( self, - embed_dims: list, - encoder_layers: list, - eff_self_att_reduction_ratio: list, - eff_self_att_heads: list, - overlap_patch_size: list, - overlap_patch_stride: list, - overlap_patch_pad: list, + embed_dims: List[int], + encoder_layers: List[int], + eff_self_att_reduction_ratio: List[int], + eff_self_att_heads: List[int], + overlap_patch_size: List[int], + overlap_patch_stride: List[int], + overlap_patch_pad: List[int], in_channels: int, ): """ @@ -237,8 +242,10 @@ def forward(self, x: torch.Tensor) -> List[torch.Tensor]: return features +# TODO: extract this block to src/super_gradients/modules/transformer_modules and reuse the same module of Beit and +# other ViTs class MLP(nn.Module): - def __init__(self, dim, embed_dim): + def __init__(self, dim: int, embed_dim: int): """ A single Linear layer, with shape pre-processing :param dim: input dimension @@ -256,26 +263,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class LinearFuse(nn.Module): - def __init__(self, in_channels: int, out_channels: int): - """ - A linear fusion block (conv + bn + relu) (https://arxiv.org/pdf/2105.15203.pdf) - :param in_channels: number of input channels - :param out_channels: number of output channels - """ - - super().__init__() - - self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=False) - self.bn = nn.BatchNorm2d(out_channels) - self.relu = nn.ReLU(inplace=True) - - def forward(self, x): - return self.relu(self.bn(self.conv(x))) - - class SegFormerHead(nn.Module): - def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): + def __init__(self, encoder_dims: List[int], embed_dim: int, num_classes: int): """ SegFormer decoder head (https://arxiv.org/pdf/2105.15203.pdf) :param encoder_dims: list of encoder embedding dimensions @@ -289,7 +278,7 @@ def __init__(self, encoder_dims: list, embed_dim: int, num_classes: int): self.linear_layers.append(MLP(dim, embed_dim)) self.add_module(f"linear_c{idx + 1}", self.linear_layers[idx]) - self.linear_fuse = LinearFuse(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim) + self.linear_fuse = ConvBNReLU(in_channels=embed_dim * len(encoder_dims), out_channels=embed_dim, kernel_size=1, bias=False, inplace=True) self.linear_pred = nn.Conv2d(in_channels=embed_dim, out_channels=num_classes, kernel_size=1) self.dropout = nn.Dropout2d(0.1) @@ -315,14 +304,14 @@ class SegFormer(SegmentationModule): def __init__( self, num_classes: int, - encoder_embed_dims: list, - encoder_layers: list, - eff_self_att_reduction_ratio: list, - eff_self_att_heads: list, + encoder_embed_dims: List[int], + encoder_layers: List[int], + eff_self_att_reduction_ratio: List[int], + eff_self_att_heads: List[int], decoder_embed_dim: int, - overlap_patch_size: list, - overlap_patch_stride: list, - overlap_patch_pad: list, + overlap_patch_size: List[int], + overlap_patch_stride: List[int], + overlap_patch_pad: List[int], in_channels: int = 3, ): """ @@ -425,7 +414,11 @@ def _separate_lr_multiply_params(self): class SegFormerCustom(SegFormer): def __init__(self, arch_params: HpmStruct): - """Parse arch_params and translate the parameters to build the SegFormer architecture""" + """ + Parse arch_params and translate the parameters to build the SegFormer architecture + :param arch_params: architecture parameters + """ + 
super().__init__( num_classes=arch_params.num_classes, encoder_embed_dims=arch_params.encoder_embed_dims, @@ -476,6 +469,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB0(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B0 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B0_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -483,6 +481,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB1(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B1 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B1_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -490,6 +493,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB2(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B2 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B2_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -497,6 +505,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB3(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B3 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B3_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -504,6 +517,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB4(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B4 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B4_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) @@ -511,6 +529,11 @@ def __init__(self, arch_params: HpmStruct): class SegFormerB5(SegFormerCustom): def __init__(self, arch_params: HpmStruct): + """ + SegFormer B5 architecture + :param arch_params: architecture parameters + """ + _arch_params = HpmStruct(**DEFAULT_SEGFORMER_B5_PARAMS) _arch_params.override(**arch_params.to_dict()) super().__init__(_arch_params) From 75ca2ad73d8d8d236ce89669d0bcfdbc60136dc9 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 15 Mar 2023 15:03:46 +0200 Subject: [PATCH 09/12] Update cityscapes_segformer.yaml --- src/super_gradients/recipes/cityscapes_segformer.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/super_gradients/recipes/cityscapes_segformer.yaml b/src/super_gradients/recipes/cityscapes_segformer.yaml index b78fb932df..5f895a736c 100644 --- a/src/super_gradients/recipes/cityscapes_segformer.yaml +++ b/src/super_gradients/recipes/cityscapes_segformer.yaml @@ -56,7 +56,7 @@ dataset_params: root_dir: ${data_root_dir} experiment_name: ${architecture}_cityscapes -ckpt_root_dir: /home/eran.shachar/PycharmProjects/super-gradients/checkpoints +ckpt_root_dir: train_dataloader: cityscapes_train val_dataloader: cityscapes_val @@ -67,7 +67,7 @@ arch_params: num_classes: 19 checkpoint_params: - checkpoint_path: /home/eran.shachar/data/segformer_pretrained_weights/mit_b0.pth + checkpoint_path: load_backbone: True load_weights_only: True strict_load: no_key_matching @@ -107,7 +107,7 @@ training_hyperparams: greater_metric_to_watch_is_better: True multi_gpu: DDP -num_gpus: 3 +num_gpus: 4 # THE FOLLOWING PARAMS ARE DIRECTLY 
USED BY HYDRA From 35908195f7a04ae9b2e5ddcb5e59be0349f19c24 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Wed, 15 Mar 2023 16:16:24 +0200 Subject: [PATCH 10/12] Update segformer.py --- .../training/models/segmentation_models/segformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index c4422b1610..6d4ae3c632 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F -from super_gradients.training.models import HpmStruct +from super_gradients.training.utils.utils import HpmStruct from super_gradients.training.utils import get_param from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath From 9c5f1e2b3f9effe0af77339c8be6eaf340db6796 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 17 Apr 2023 15:36:58 +0300 Subject: [PATCH 11/12] Update segformer.py --- .../training/models/segmentation_models/segformer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/super_gradients/training/models/segmentation_models/segformer.py b/src/super_gradients/training/models/segmentation_models/segformer.py index 6d4ae3c632..9ebadb30dc 100644 --- a/src/super_gradients/training/models/segmentation_models/segformer.py +++ b/src/super_gradients/training/models/segmentation_models/segformer.py @@ -7,6 +7,8 @@ from super_gradients.training.models.segmentation_models.segmentation_module import SegmentationModule from super_gradients.training.utils.regularization_utils import DropPath from super_gradients.modules.conv_bn_relu_block import ConvBNReLU +from super_gradients.common.object_names import Models +from super_gradients.common.registry.registry import register_model from typing import List, Tuple @@ -467,6 +469,7 @@ def __init__(self, arch_params: HpmStruct): } +@register_model(Models.SEGFORMER_B0) class SegFormerB0(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -479,6 +482,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B1) class SegFormerB1(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -491,6 +495,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B2) class SegFormerB2(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -503,6 +508,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B3) class SegFormerB3(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -515,6 +521,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B4) class SegFormerB4(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ @@ -527,6 +534,7 @@ def __init__(self, arch_params: HpmStruct): super().__init__(_arch_params) +@register_model(Models.SEGFORMER_B5) class SegFormerB5(SegFormerCustom): def __init__(self, arch_params: HpmStruct): """ From 0e853ede104bdb443041699be9bcd1ca569a1e53 Mon Sep 17 00:00:00 2001 From: eran-deci Date: Mon, 17 Apr 2023 15:56:25 +0300 Subject: [PATCH 12/12] Update __init__.py --- 
src/super_gradients/training/models/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/super_gradients/training/models/__init__.py b/src/super_gradients/training/models/__init__.py index c6582e30aa..280de77b61 100755 --- a/src/super_gradients/training/models/__init__.py +++ b/src/super_gradients/training/models/__init__.py @@ -96,6 +96,7 @@ STDCSegmentationBase, CustomSTDCSegmentation, ) +from super_gradients.training.models.segmentation_models.segformer import SegFormerB0, SegFormerB1, SegFormerB2, SegFormerB3, SegFormerB4, SegFormerB5 # Pose estimation from super_gradients.training.models.pose_estimation_models.pose_ppyolo import PosePPYoloL @@ -258,4 +259,10 @@ "ARCHITECTURES", "Models", "user_models", + "SegFormerB0", + "SegFormerB1", + "SegFormerB2", + "SegFormerB3", + "SegFormerB4", + "SegFormerB5", ]
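
For reference, a minimal usage sketch of the models these patches add. This is not part of the patch series itself: it assumes the DEFAULT_SEGFORMER_B0_PARAMS dict supplies every backbone/decoder hyper-parameter except num_classes, and the 512x512 dummy input size is purely illustrative (the Cityscapes recipe above trains on 1024x1024 crops).

    import torch
    from super_gradients.training.models import SegFormerB0      # exported by the final patch
    from super_gradients.training.utils.utils import HpmStruct   # same import the patch uses internally

    # Build SegFormer-B0 from its default MiT-B0 hyper-parameters, overriding only
    # num_classes (19 matches the Cityscapes recipe above).
    model = SegFormerB0(HpmStruct(num_classes=19)).eval()

    x = torch.randn(1, 3, 512, 512)  # dummy batch
    with torch.no_grad():
        out = model(x)

    # The decode-head output is bilinearly upsampled to the input resolution,
    # so the logits come back as [1, 19, 512, 512].
    print(out.shape)

Since each variant is also decorated with @register_model under the names added to object_names.py (e.g. "segformer_b0"), the same model should be reachable through SG's model factory (models.get with Models.SEGFORMER_B0), which is how the "architecture" field of the cityscapes_segformer recipe is resolved at training time.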