diff --git a/Jenkinsfile b/Jenkinsfile index 8b88ba9fb1ac..d1c0c103509d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -126,6 +126,256 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat' } } +// +// stage('L2: Multimodal Imagen Train') { +// when { +// anyOf { +// branch 'main' +// changeRequest target: 'main' +// } +// } +// failFast true +// steps { +// sh "rm -rf /home/TestData/multimodal/imagen_train" +// sh "pip install webdataset==0.2.48" +// sh "python examples/multimodal/text_to_image/imagen/imagen_training.py \ +// trainer.precision=16 \ +// trainer.num_nodes=1 \ +// trainer.devices=1 \ +// ++exp_manager.max_time_per_run=00:00:03:00 \ +// trainer.max_steps=20 \ +// model.micro_batch_size=1 \ +// model.global_batch_size=1 \ +// model.data.synthetic_data=True \ +// exp_manager.exp_dir=/home/TestData/multimodal/imagen_train \ +// model.inductor=False \ +// model.unet.flash_attention=False \ +// " +// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" +// sh "rm -rf /home/TestData/multimodal/imagen_train" +// } +// } +// +// stage('L2: Multimodal Stable Diffusion Train') { +// when { +// anyOf { +// branch 'main' +// changeRequest target: 'main' +// } +// } +// failFast true +// steps { +// sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" +// sh "pip install webdataset==0.2.48" +// sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ +// trainer.precision=16 \ +// trainer.num_nodes=1 \ +// trainer.devices=1 \ +// ++exp_manager.max_time_per_run=00:00:03:00 \ +// trainer.max_steps=20 \ +// model.micro_batch_size=1 \ +// model.global_batch_size=1 \ +// model.data.synthetic_data=True \ +// exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train \ +// model.inductor=False \ +// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ +// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ +// ++model.cond_stage_config.max_length=77 \ +// ~model.cond_stage_config.restore_from_path \ +// ~model.cond_stage_config.freeze \ +// ~model.cond_stage_config.layer \ +// model.unet_config.from_pretrained=null \ +// model.first_stage_config.from_pretrained=null \ +// model.unet_config.use_flash_attention=False \ +// " +// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" +// sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" +// } +// } +// stage('L2: Multimodal ControlNet Train') { +// when { +// anyOf { +// branch 'main' +// changeRequest target: 'main' +// } +// } +// failFast true +// steps { +// sh "rm -rf /home/TestData/multimodal/controlnet_train" +// sh "pip install webdataset==0.2.48" +// sh "python examples/multimodal/text_to_image/controlnet/controlnet_train.py \ +// trainer.precision=16 \ +// trainer.num_nodes=1 \ +// trainer.devices=1 \ +// ++exp_manager.max_time_per_run=00:00:03:00 \ +// trainer.max_steps=20 \ +// model.micro_batch_size=1 \ +// model.global_batch_size=1 \ +// model.data.synthetic_data=True \ +// exp_manager.exp_dir=/home/TestData/multimodal/controlnet_train \ +// model.inductor=False \ +// model.image_logger.max_images=0 \ +// model.control_stage_config.params.from_pretrained_unet=null \ +// model.unet_config.from_pretrained=null \ +// model.first_stage_config.from_pretrained=null \ +// model.unet_config.use_flash_attention=False \ +// " +// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" +// sh "rm -rf /home/TestData/multimodal/controlnet_train" +// } +// } 
+// stage('L2: Multimodal DreamBooth Train') { +// when { +// anyOf { +// branch 'main' +// changeRequest target: 'main' +// } +// } +// failFast true +// steps { +// sh "rm -rf /home/TestData/multimodal/dreambooth_train" +// sh "pip install webdataset==0.2.48" +// sh "python examples/multimodal/text_to_image/dreambooth/dreambooth.py \ +// trainer.precision=16 \ +// trainer.num_nodes=1 \ +// trainer.devices=1 \ +// ++exp_manager.max_time_per_run=00:00:03:00 \ +// trainer.max_steps=20 \ +// model.micro_batch_size=1 \ +// model.global_batch_size=1 \ +// exp_manager.exp_dir=/home/TestData/multimodal/dreambooth_train \ +// model.inductor=False \ +// model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ +// ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ +// ++model.cond_stage_config.max_length=77 \ +// ~model.cond_stage_config.restore_from_path \ +// ~model.cond_stage_config.freeze \ +// ~model.cond_stage_config.layer \ +// model.unet_config.from_pretrained=null \ +// model.first_stage_config.from_pretrained=null \ +// model.data.instance_dir=/home/TestData/multimodal/tiny-dreambooth \ +// model.unet_config.use_flash_attention=False \ +// " +// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" +// sh "rm -rf /home/TestData/multimodal/dreambooth_train" +// } +// } +// stage('L2: Vision ViT Pretrain TP=1') { +// when { +// anyOf { +// branch 'main' +// changeRequest target: 'main' +// } +// } +// failFast true +// steps { +// sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" +// sh "pip install webdataset==0.2.48" +// sh "python examples/vision/vision_transformer/megatron_vit_classification_pretrain.py \ +// trainer.precision=16 \ +// model.megatron_amp_O2=False \ +// trainer.num_nodes=1 \ +// trainer.devices=1 \ +// trainer.val_check_interval=5 \ +// ++exp_manager.max_time_per_run=00:00:03:00 \ +// trainer.max_steps=20 \ +// model.micro_batch_size=2 \ +// model.global_batch_size=4 \ +// model.tensor_model_parallel_size=1 \ +// model.pipeline_model_parallel_size=1 \ +// model.data.num_workers=0 \ +// exp_manager.create_checkpoint_callback=False \ +// model.data.data_path=[/home/TestData/multimodal/tiny-imagenet/train,/home/TestData/multimodal/tiny-imagenet/val] \ +// exp_manager.exp_dir=/home/TestData/vision/vit_pretrain_tp1 " +// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" +// sh "rm -rf /home/TestData/vision/vit_pretrain_tp1" +// } +// } +// +// stage('L2: Multimodal CLIP Pretrain TP=1') { +// when { +// anyOf { +// branch 'main' +// changeRequest target: 'main' +// } +// } +// failFast true +// steps { +// sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" +// sh "pip install webdataset==0.2.48" +// sh "python examples/multimodal/vision_language_foundation/clip/megatron_clip_pretrain.py \ +// trainer.precision=16 \ +// model.megatron_amp_O2=False \ +// trainer.num_nodes=1 \ +// trainer.devices=1 \ +// trainer.val_check_interval=10 \ +// ++exp_manager.max_time_per_run=00:00:03:00 \ +// trainer.max_steps=20 \ +// model.micro_batch_size=1 \ +// model.global_batch_size=1 \ +// model.tensor_model_parallel_size=1 \ +// model.pipeline_model_parallel_size=1 \ +// exp_manager.create_checkpoint_callback=False \ +// model.data.num_workers=0 \ +// model.vision.num_layers=2 \ +// model.text.num_layers=2 \ +// model.vision.patch_dim=32 \ +// model.vision.encoder_seq_length=49 \ +// model.vision.class_token_length=7 \ +// model.data.train.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ +// 
model.data.validation.dataset_path=[/home/TestData/multimodal/tiny-clip/00000.tar] \ +// model.data.webdataset.local_root_path=/ \ +// exp_manager.exp_dir=/home/TestData/multimodal/clip_pretrain_tp1 " +// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" +// sh "rm -rf /home/TestData/multimodal/clip_pretrain_tp1" +// } +// } +// +// stage('L2: Multimodal NeVA Pretrain TP=1') { +// when { +// anyOf { +// branch 'main' +// changeRequest target: 'main' +// } +// } +// failFast true +// steps { +// sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" +// sh "pip install webdataset==0.2.48" +// sh "python examples/multimodal/multimodal_llm/neva/neva_pretrain.py \ +// trainer.precision=bf16 \ +// model.megatron_amp_O2=False \ +// trainer.num_nodes=1 \ +// trainer.devices=1 \ +// trainer.val_check_interval=10 \ +// trainer.limit_val_batches=5 \ +// trainer.log_every_n_steps=1 \ +// ++exp_manager.max_time_per_run=00:00:03:00 \ +// trainer.max_steps=20 \ +// model.micro_batch_size=2 \ +// model.global_batch_size=4 \ +// model.tensor_model_parallel_size=1 \ +// model.pipeline_model_parallel_size=1 \ +// exp_manager.create_checkpoint_callback=False \ +// model.data.data_path=/home/TestData/multimodal/tiny-neva/dummy.json \ +// model.data.image_folder=/home/TestData/multimodal/tiny-neva/images \ +// model.tokenizer.library=sentencepiece \ +// model.tokenizer.model=/home/TestData/multimodal/tiny-neva/tokenizer_add_special.model \ +// model.num_layers=2 \ +// model.hidden_size=5120 \ +// model.ffn_hidden_size=13824 \ +// model.num_attention_heads=40 \ +// model.normalization=rmsnorm \ +// model.data.num_workers=0 \ +// model.data.conv_template=llama_2 \ +// model.mm_cfg.vision_encoder.from_pretrained='openai/clip-vit-large-patch14' \ +// model.mm_cfg.llm.from_pretrained=null \ +// model.use_flash_attention=false \ +// exp_manager.exp_dir=/home/TestData/multimodal/neva_pretrain_tp1 " +// sh "pip install 'webdataset>=0.1.48,<=0.1.62'" +// sh "rm -rf /home/TestData/multimodal/neva_pretrain_tp1" +// } +// } // TODO: this requires TE >= v0.11 which is not available in 23.06. // please uncomment this test once mcore CI is ready. 
@@ -4826,6 +5076,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } } + stage('L2: TTS Fast dev runs 1') { when { anyOf { @@ -4971,7 +5222,27 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' } } } - + stage('L2: NeRF') { + when { + anyOf { + branch 'r1.21.0' + changeRequest target: 'r1.21.0' + } + } + parallel { + stage('DreamFusion') { + steps { + sh 'python examples/multimodal/text_to_image/nerf/main.py \ + trainer.num_nodes=1 \ + trainer.devices="[0]" \ + trainer.max_steps=1000 \ + model.prompt="a DSLR photo of a delicious hamburger" \ + exp_manager.exp_dir=examples/multimodal/text_to_image/nerf/dreamfusion_results' + sh 'rm -rf examples/multimodal/text_to_image/nerf/dreamfusion_results' + } + } + } + } stage('L??: Speech Checkpoints tests') { when { anyOf { diff --git a/docs/source/conf.py b/docs/source/conf.py index 952e25332ca4..586f6cf47675 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -61,6 +61,9 @@ 'ipadic', 'psutil', 'regex', + 'PIL', + 'boto3', + 'taming', ] _skipped_autodoc_mock_imports = ['wrapt', 'numpy'] @@ -125,6 +128,8 @@ 'tts/tts_all.bib', 'text_processing/text_processing_all.bib', 'core/adapters/adapter_bib.bib', + 'multimodal/mm_all.bib', + 'vision/vision_all.bib', ] intersphinx_mapping = { diff --git a/docs/source/index.rst b/docs/source/index.rst index 86ad55d1709b..7407886eefc8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -47,7 +47,7 @@ NVIDIA NeMo User Guide nlp/api nlp/megatron_onnx_export nlp/models - + .. toctree:: :maxdepth: 1 @@ -71,6 +71,23 @@ NVIDIA NeMo User Guide text_processing/g2p/g2p common/intro +.. toctree:: + :maxdepth: 3 + :caption: Multimodal (MM) + :name: Multimodal + + multimodal/mllm/intro + multimodal/vlm/intro + multimodal/text2img/intro + multimodal/nerf/intro + multimodal/api + +.. toctree:: + :maxdepth: 2 + :caption: Vision + :name: vision + + vision/intro .. toctree:: :maxdepth: 3 diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst new file mode 100644 index 000000000000..ef517d6bdd5a --- /dev/null +++ b/docs/source/multimodal/api.rst @@ -0,0 +1,81 @@ +NeMo Megatron API +======================= + +Model Classes +------------- + +.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_base_model.MegatronBaseModel + :show-inheritance: + :no-members: + :members: __init__, configure_optimizers + + +.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion + :show-inheritance: + :no-members: + :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets + + +.. autoclass:: nemo.collections.multimodal.models.dreambooth.dreambooth.MegatronDreamBooth + :show-inheritance: + :no-members: + :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets + + +.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.MegatronControlNet + :show-inheritance: + :no-members: + :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets + +.. autoclass:: nemo.collections.multimodal.models.imagen.imagen.MegatronImagen + :show-inheritance: + :no-members: + :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets + + + +Modules +------- + +.. autoclass:: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel + :show-inheritance: + :no-members: + +.. 
autoclass:: nemo.collections.multimodal.modules.imagen.diffusionmodules.nets.UNetModel
+    :show-inheritance:
+    :no-members:
+
+.. autoclass:: nemo.collections.multimodal.modules.imagen.diffusionmodules.nets.EfficientUNetModel
+    :show-inheritance:
+    :no-members:
+
+.. autoclass:: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
+    :show-inheritance:
+    :no-members:
+    :members: __init__, encode, decode
+
+.. autoclass:: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenMegatronCLIPEmbedder
+    :show-inheritance:
+    :no-members:
+    :members: __init__, forward
+
+.. autoclass:: nemo.collections.multimodal.modules.imagen.encoder.t5encoder.T5Encoder
+    :show-inheritance:
+    :no-members:
+    :members: __init__, encode
+
+
+.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel
+    :show-inheritance:
+    :no-members:
+    :members: forward
+
+Datasets
+---------
+
+.. autoclass:: nemo.collections.multimodal.data.common.webdataset.WebDatasetCommon
+    :show-inheritance:
+
+.. autoclass:: nemo.collections.multimodal.data.dreambooth.dreambooth_dataset.DreamBoothDataset
+    :show-inheritance:
+
diff --git a/docs/source/multimodal/mllm/checkpoint.rst b/docs/source/multimodal/mllm/checkpoint.rst
new file mode 100644
index 000000000000..8ccb520bda4b
--- /dev/null
+++ b/docs/source/multimodal/mllm/checkpoint.rst
@@ -0,0 +1,114 @@
+Checkpoints
+===========
+
+This section presents four key checkpoint-management functionalities of NVIDIA NeMo:
+
+1. **Checkpoint Loading**: Load local ``.nemo`` checkpoint files with the :code:`restore_from()` method.
+2. **Partial Checkpoint Conversion**: Convert partially-trained ``.ckpt`` checkpoints to the ``.nemo`` format.
+3. **Community Checkpoint Conversion**: Convert checkpoints from community sources, such as HuggingFace, into the ``.nemo`` format.
+4. **Model Parallelism Adjustment**: Modify model parallelism to efficiently train models that exceed the memory of a single GPU. NeMo employs both tensor (intra-layer) and pipeline (inter-layer) model parallelism; for background, see "Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM". This tool helps adjust model parallelism for users who need to deploy across larger GPU arrays because of memory constraints.
+
+Understanding Checkpoint Formats
+--------------------------------
+
+A ``.nemo`` checkpoint is fundamentally a tar file that bundles the model configuration (given as a YAML file), model weights, and other pertinent artifacts such as tokenizer models or vocabulary files. This consolidated design streamlines sharing, loading, tuning, evaluating, and inference.
+
+A ``.ckpt`` file, on the other hand, is produced by PyTorch Lightning training. It stores model weights and optimizer states and is generally used to resume training.
+
+The subsequent sections cover each of the functionalities listed above, emphasizing the loading of fully trained checkpoints for evaluation or additional fine-tuning.
+
+
+Loading Local Checkpoints
+-------------------------
+
+NeMo saves any model's checkpoints in the ``.nemo`` format. To manually save a model at any stage:
+
+.. code-block:: python
+
+  model.save_to("<checkpoint_path>.nemo")
+
+To load a local ``.nemo`` checkpoint:
+
+.. code-block:: python
+
+  import nemo.collections.multimodal as nemo_multimodal
+  model = nemo_multimodal.models.<MODEL_BASE_CLASS>.restore_from(restore_path="<path/to/checkpoint/file.nemo>")
+
+Replace ``<MODEL_BASE_CLASS>`` with the appropriate MM model class.
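+For example, to load a converted NeVA checkpoint for evaluation, a minimal sketch could look like the following. The checkpoint path is illustrative, the class path follows the ``MegatronNevaModel`` reference used in the Model Parallelism Adjustment section below, and Megatron-based models typically also need a PyTorch Lightning trainer passed through the ``trainer`` argument of :code:`restore_from()`.
+
+.. code-block:: python
+
+  from nemo.collections.multimodal.models.neva.neva_model import MegatronNevaModel
+
+  # Illustrative path; point this at your own converted .nemo file.
+  model = MegatronNevaModel.restore_from(
+      restore_path="/path/to/neva-llava-llama-2-13b-chat.nemo",
+      # trainer=trainer,  # usually required for Megatron-based models
+  )
+  model.eval()  # switch to inference mode before generating responses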
+
+Converting Local Checkpoints
+----------------------------
+
+The training script auto-converts only the final checkpoint into the ``.nemo`` format. To evaluate intermediate training checkpoints, convert them to ``.nemo`` as follows:
+
+.. code-block:: bash
+
+  python -m torch.distributed.launch --nproc_per_node=<tensor_model_parallel_size> * <pipeline_model_parallel_size> \
+    examples/multimodal/convert_ckpt_to_nemo.py \
+    --checkpoint_folder <path_to_checkpoint_folder> \
+    --checkpoint_name <checkpoint_name> \
+    --nemo_file_path <path_to_output_nemo_file> \
+    --tensor_model_parallel_size <tensor_model_parallel_size> \
+    --pipeline_model_parallel_size <pipeline_model_parallel_size>
+
+Converting Community Checkpoints
+--------------------------------
+
+NeVA Checkpoints
+^^^^^^^^^^^^^^^^
+
+Currently, the conversion mainly supports LLaVA checkpoints based on "llama-2 chat" checkpoints. As a reference, we'll consider the HuggingFace checkpoint ``llava-llama-2-13b-chat-lightning-preview``.
+
+After downloading this checkpoint and saving it at `/path/to/llava-llama-2-13b-chat-lightning-preview`, follow these steps:
+
+Modifying the Tokenizer
+"""""""""""""""""""""""
+
+NeMo requires adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in `/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer`, execute the following in the NeMo container:
+
+.. code-block:: bash
+
+  cd /opt/sentencepiece/src/
+  protoc --python_out=/opt/NeMo/scripts/tokenizers/ sentencepiece_model.proto
+  python /opt/NeMo/scripts/tokenizers/add_special_tokens_to_sentencepiece.py \
+  --input_file /path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer.model \
+  --output_file /path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer_neva.model \
+  --is_userdefined \
+  --tokens "<extra_id_0>" "<extra_id_1>" "<extra_id_2>" "<extra_id_3>" \
+           "<extra_id_4>" "<extra_id_5>" "<extra_id_6>" "<extra_id_7>"
+
+Checkpoint Conversion
+"""""""""""""""""""""
+
+For conversion:
+
+.. code-block:: bash
+
+  python examples/multimodal/mllm/neva/convert_hf_llava_to_neva.py \
+    --in-file /path/to/llava-llama-2-13b-chat-lightning-preview \
+    --out-file /path/to/neva-llava-llama-2-13b-chat-lightning-preview.nemo \
+    --tokenizer-model /path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer_add_special.model \
+    --conv-template llama_2
+
+
+Model Parallelism Adjustment
+----------------------------
+
+NeVA Checkpoints
+^^^^^^^^^^^^^^^^
+
+Adjust model parallelism with:
+
+.. code-block:: bash
+
+  python examples/nlp/language_modeling/megatron_change_num_partitions.py \
+    --model_file=/path/to/source.nemo \
+    --target_file=/path/to/target.nemo \
+    --tensor_model_parallel_size=??? \
+    --target_tensor_model_parallel_size=??? \
+    --pipeline_model_parallel_size=??? \
+    --target_pipeline_model_parallel_size=??? \
+    --model_class="nemo.collections.multimodal.models.neva.neva_model.MegatronNevaModel" \
+    --precision=32 \
+    --tokenizer_model_path=/path/to/tokenizer.model \
+    --tp_conversion_only
diff --git a/docs/source/multimodal/mllm/configs.rst b/docs/source/multimodal/mllm/configs.rst
new file mode 100644
index 000000000000..38ee65da9dd3
--- /dev/null
+++ b/docs/source/multimodal/mllm/configs.rst
@@ -0,0 +1,143 @@
+Common Configuration Files
+==========================
+
+This section provides a detailed overview of the NeMo configuration file setup specific to models within the NeMo Multimodal Language Model collection. For foundational knowledge about setting up and executing experiments common to all NeMo models, such as the Experiment Manager and PyTorch Lightning trainer parameters, refer to the :doc:`../core/core` section.
+ +Within the configuration files of the NeMo Multimodal Language Model, details concerning dataset(s), augmentation, optimization parameters, and model architectural specifications are central. This page explores each of these aspects. + +Discover exemplary configuration files for all NeMo Multimodal Language Model scripts in the `config directory of the examples `_. + +Dataset Configuration +--------------------- + +The NeMo multimodal language model currently supports a conversation data format, inspired by and designed from https://github.com/haotian-liu/LLaVA/tree/main. To explore a sample dataset, visit https://github.com/haotian-liu/LLaVA/blob/main/docs/Data.md. + +The configuration file allows setting any initialization parameter accepted by the Dataset class used in the experiment. For a comprehensive list of Datasets and their parameters, visit the `Datasets <./api.html#Datasets>`__ section of the API. + +A typical training configuration is as follows: + +.. code-block:: yaml + + data: + num_workers: 8 + dataloader_type: cyclic + data_path: path/to/conversations.json + lazy_preprocess: True + is_multimodal: True + conv_template: llama_2 + image_token_len: 256 + image_folder: path/to/images + image_aspect_ratio: 'square' + +Key parameters include: + +- ``data_path``: The path to the dataset in JSON format. +- ``is_multimodal``: Indicates if the dataset has multiple modalities (e.g., text and images). +- ``conv_template``: The template used for conversation format. Supports values like 'nvgpt' and 'llama_2'. +- ``image_token_len``: Specifies how many tokens in the language model word embedding each image will occupy. +- ``image_folder``: The path to the folder containing images related to the dataset. +- ``image_aspect_ratio``: Specifies whether to pad or crop the image to maintain the aspect ratio, such as 'square'. + +Trainer Configuration +--------------------- + +This section outlines arguments for the Pytorch Lightning Trainer Object. + +.. code-block:: yaml + + trainer: + devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] + num_nodes: 1 + max_epochs: -1 + max_steps: 2500000 # precedence over max_epochs + logger: False # Provided by exp_manager + precision: bf16 # Should be set to 16 for O1 and O2 to enable the AMP. + accelerator: gpu + log_every_n_steps: 5 # Interval of logging. + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 10 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + enable_checkpointing: False # Provided by exp_manager + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: True + +For a detailed list of arguments, refer to the `Pytorch Lightning Trainer `__ API section. + +Experiment Manager Configurations +--------------------------------- + +The NeMo Experiment Manager provides a streamlined approach to manage various tasks such as logging, saving, and resuming. + +.. 
code-block:: yaml + + exp_manager: + exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: # Whether you want exp_manger to create a Wandb logger + name: training-session + project: text2img + group: nemo + resume: True + create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger + create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback + checkpoint_callback_params: + monitor: reduced_train_loss + save_top_k: 5 + every_n_epochs: 0 # Save checkpoint frequency. + every_n_train_steps: 1000 # Mutually exclusive with every_n_epochs. It is recommended to set this if training on large-scale dataset. + filename: '${name}--{reduced_train_loss:.2f}-{step}-{consumed_samples}' + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + ema: + enable: True + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +Optimizer Configurations +------------------------- + +.. code-block:: yaml + + optim: + name: fused_adam + lr: 0.0001 + eps: 1e-8 + betas: [ 0.9, 0.999 ] + weight_decay: 0.01 + sched: + name: WarmupPolicy + warmup_steps: 10000 + warmup_ratio: null + +The default optimizer used is ``fused_adam``. For details on all supported optimizers, refer to the NeMo user guide. The learning rate scheduler can be specified in the ``optim.sched`` section. + +Model Configurations +-------------------- + +Each configuration file should detail the model architecture used for the experiment. + +The parameters commonly shared across most multimodal language models include: + ++---------------------------+--------------+---------------------------------------------------------------------------------------+ +| **Parameter** | **Datatype** | **Description** | ++===========================+==============+=======================================================================================+ +| :code:`micro_batch_size` | int | micro batch size that fits on each GPU | ++---------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`global_batch_size` | int | global batch size that takes consideration of gradient accumulation, data parallelism | ++---------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`tensor_model_parallel_size` | int | intra-layer model parallelism | ++---------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`pipeline_model_parallel_size` | int | inter-layer model parallelism | ++---------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`seed` | int | seed used in training | ++---------------------------+--------------+---------------------------------------------------------------------------------------+ + +NeVA +~~~~~~~~ + +For model-specific configurations, refer to `Neva <./neva.html#neva>`_. 
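+As a quick illustration of the common parameters listed in the table earlier, the sketch below shows how they typically appear under the ``model`` key of a configuration file. It is an illustrative example only; the values are placeholders rather than recommendations.
+
+.. code-block:: yaml
+
+  model:
+    micro_batch_size: 1              # batch size per GPU per step
+    global_batch_size: 8             # accounts for data parallelism and gradient accumulation
+    tensor_model_parallel_size: 1    # intra-layer model parallelism
+    pipeline_model_parallel_size: 1  # inter-layer model parallelism
+    seed: 1234                       # seed used in training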
diff --git a/docs/source/multimodal/mllm/datasets.rst b/docs/source/multimodal/mllm/datasets.rst new file mode 100644 index 000000000000..1c64c4d317d2 --- /dev/null +++ b/docs/source/multimodal/mllm/datasets.rst @@ -0,0 +1,99 @@ +Multimodal Language Model Datasets +================================== + +The NeMo multimodal language model supports the conversation data format, drawing inspiration from and designed based on `LLaVA `_. Sample datasets can be explored at `LLaVA's data documentation `_. + +Preparing the Training Dataset +------------------------------ + +The NeVA model training encompasses two phases: pretraining and finetuning. Each phase mandates a unique dataset. + +For **pretraining**, utilize the *LAION/CC/SBU BLIP-Caption Concept-balanced 558K* dataset. Access this dataset via `LLaVA's GitHub `_. After procuring the dataset, extract it to: + +.. code-block:: bash + + /path/to/neva/datasets/LLaVA-Pretrain-LCS-558K/blip_laion_cc_sbu_558k.json + +Acquire the image data from `HuggingFace `_ and extract to: + +.. code-block:: bash + + /path/to/neva/datasets/LLaVA-Pretrain-LCS-558K/images + +For **fine-tuning**, deploy the *LLaVA-Instruct-150K* dataset. This is also available on `LLaVA's GitHub `_. You can download the prompts from `HuggingFace `_: + +.. code-block:: bash + + /path/to/neva/datasets/LLaVA-Instruct-150K/ + +Image data for this phase can be obtained from the `COCO Dataset `_. Once downloaded, extract the images to: + +.. code-block:: bash + + /path/to/neva/datasets/LLaVA-Instruct-150K/images + +Additional Preparation for NeVA Model +------------------------------------- + +The following instructions are specific to the NeVA model within the NeMo Multimodal Language Models. + +Setting Up LLaMA-2 Chat Checkpoints +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Support is available for both the 7B and 13B chat models. Both can be downloaded from `LLaVA's Model Zoo `_. After downloading the desired HuggingFace checkpoint, extract and store it on your local system to prep for pretraining. + +To convert the LLaMA-2 checkpoints to NeMo's format, follow these steps: + +1. Adjust the default yaml file at `megatron_llama_config.yaml `_. Ensure ``model.mcore_gpt`` and ``model.transformer_engine`` are set to `False` before the checkpoint conversion. + +2. For the 7B chat model, use this conversion command: + +.. code-block:: bash + + python /opt/NeMo/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py \ + --in-file \ + --out-file /path/to/neva/checkpoints/llama-2-7b-chat.nemo + +For the 13B model, adjust the paths in the `--in-file` and `--out-file` parameters accordingly. + +3. Execute the subsequent command to divide the checkpoint for tensor model parallel sizes of 4 or 8. It's advisable to use TP=4 for the 7B model and TP=8 for the 13B model to ensure both pretraining and finetuning operate without memory complications. + +.. code-block:: bash + + # Instructions for the 7B model partitioning provided here. + # Adjust parameters for the 13B model as needed. 
+    python /opt/NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py \
+    --model_file=/path/to/neva/checkpoints/llama-2-7b-chat.nemo \
+    --target_file=/path/to/neva/checkpoints/llama-2-7b-chat-tp4.nemo \
+    --tensor_model_parallel_size=1 \
+    --target_tensor_model_parallel_size=4 \
+    --pipeline_model_parallel_size=1 \
+    --target_pipeline_model_parallel_size=1 \
+    --tp_conversion_only \
+    --model_class="nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel" \
+    --tokenizer_model_path=/tokenizer.model
+
+Tokenizer Configuration
+^^^^^^^^^^^^^^^^^^^^^^^
+
+NeVA training requires special tokens in the tokenizer. After obtaining the 7B/13B model from HuggingFace, also download the corresponding tokenizer model. Using the 7B-chat model as an example:
+
+1. Download the corresponding ``tokenizer.model`` from the model's HuggingFace repository to:
+
+.. code-block:: bash
+
+  /path/to/neva/tokenizers/tokenizer.model
+
+2. The script in the next step requires NeMo as a dependency; it is most convenient to run it inside the NeMo container.
+
+3. Use the command below to add the special tokens to the tokenizer:
+
+.. code-block:: bash
+
+  cd /opt/sentencepiece/src/; protoc --python_out=/opt/NeMo/scripts/tokenizers/ sentencepiece_model.proto
+  python /opt/NeMo/scripts/tokenizers/add_special_tokens_to_sentencepiece.py \
+  --input_file /path/to/neva/tokenizers/tokenizer.model \
+  --output_file /path/to/neva/tokenizers/tokenizer_neva.model \
+  --is_userdefined \
+  --tokens "<extra_id_0>" "<extra_id_1>" "<extra_id_2>" "<extra_id_3>" \
+           "<extra_id_4>" "<extra_id_5>" "<extra_id_6>" "<extra_id_7>"
diff --git a/docs/source/multimodal/mllm/images/llava_arch.jpg b/docs/source/multimodal/mllm/images/llava_arch.jpg
new file mode 100644
index 000000000000..7488935ff06c
Binary files /dev/null and b/docs/source/multimodal/mllm/images/llava_arch.jpg differ
diff --git a/docs/source/multimodal/mllm/intro.rst b/docs/source/multimodal/mllm/intro.rst
new file mode 100644
index 000000000000..2029210a7a8a
--- /dev/null
+++ b/docs/source/multimodal/mllm/intro.rst
@@ -0,0 +1,111 @@
+Multimodal Language Models
+==========================
+
+Extending Large Language Models (LLMs) into multimodal domains by integrating additional structures, such as visual encoders, has become a focal point of recent research, especially given its potential to significantly lower the cost compared to training multimodal universal models from scratch.
+
+The advent of GPT-4 has spurred a plethora of developments, including notable models such as LLaVA, Mini-GPT4, and Flamingo. These models, despite minor differences, share similar structures and training strategies.
+ +Supported Models +----------------- +NeMo Multimodal currently supports the following models: + ++-----------------------------------+----------+-------------+------+-------------------------+------------------+ +| Model | Training | Fine-Tuning | PEFT | Evaluation | Inference | ++===================================+==========+=============+======+=========================+==================+ +| `NeVA (LLaVA) <./neva.html>`_ | ✓ | ✓ | - | - | ✓ | ++-----------------------------------+----------+-------------+------+-------------------------+------------------+ +| Kosmos-2 | WIP | WIP | - | - | WIP | ++-----------------------------------+----------+-------------+------+-------------------------+------------------+ + +Spotlight Models +----------------- + +LLaVA: Visual Instruction Tuning +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +LLaVA :cite:`mm-models-llava` focuses on creating a dataset for visual instruction tuning to enhance LLMs' ability to comprehend diverse instructions and provide detailed responses. NeMo's implementation of LLaVA is called NeVA. + +- Model Structure: + - Visual Encoder: Utilizes CLIP’s ViT-L/14. + - Text Decoder: Employs LLaMA. + - Connection: A simple linear mapping layer connects the visual encoder's output to the text decoder's word embedding space (v1.0 version). + +- Training: + 1. Cross-modal Pre-training: Utilizes 595k image-text data from CC3M, training only the linear mapping layer while keeping the visual encoder and text decoder frozen. + 2. Instruction Fine-tuning: Custom-built 158k multimodal instruction dataset employed for fine-tuning targeting multimodal chatbot scenarios, with a variant targeting the Science QA dataset. + +Flamingo: A Visual Language Model for Few-Shot Learning +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Flamingo :cite:`mm-models-flamingo` addresses inconsistent visual feature map sizes by generating fixed-length feature sequences, enhancing visual relevance generation. + +- Model Structure: + - Resampler: Utilizes a Perceiver Resampler for generating fixed-length feature sequences. + - Attention: Adds cross-attention layers before each LLM layer to enhance visual relevance generation. + +- Training: + - Dataset: Utilizes data from various datasets like M3W, ALIGN, LTIP, and VTP emphasizing multimodal in-context learning. + +Kosmos-1: Language Is Not All You Need: Aligning Perception with Language Models +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Kosmos-1 :cite:`mm-models-kosmos1` by Microsoft is a Multimodal Large Language Model (MLLM) aimed at melding language, perception, action, and world modeling. + +- Model Structure: + - Core Backbone: Transformer-Based Causal Language Model. + - Architecture: Utilizes MAGNETO, a nuanced Transformer variant. + - Position Encoding: Employs XPOS relative position encoding for long-context modeling. + - Resampler: Employs Flamingo's Perceiver Resampler + +- Training: + - Dataset: Encompasses web-scale multimodal corpora including monomodal, cross-modal paired, and interleaved multimodal data. + - Objective: Focused on next-token prediction to maximize log-likelihood of tokens within examples. + +BLIP-2: Bootstrapping Language-Image Pre-training +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +BLIP-2 :cite:`mm-models-blip2` adopts a two-phase training strategy focusing on learning key visual information and adapting visual encoding structure to LLMs. 
+ +- Model Structure: + - Visual Encoder: Combines a pre-trained image encoder with a Querying Transformer (Q-Former). + - Bridging: The Q-Former acts as the bridge between the image encoder and the Large Language Model (LLM). + +- Training: + 1. Phase 1: Focuses on tasks like Image-Text Contrastive Learning, Image-grounded Text Generation, and Image-Text Matching. + 2. Phase 2: Aims at adapting the visual encoding structure's output to LLMs with language modeling as the training task. + +Mini-GPT4: Enhancing Vision-Language Understanding +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Mini-GPT4 :cite:`mm-models-minigpt4` emphasizes the importance of multimodal instruction data for model performance in multimodal open-ended scenarios. + +- Model Structure: + - Visual Encoder: Employs BLIP2’s ViT and Q-Former. + - Text Decoder: Uses Vicuna (a fine-tuned version of LLaMA). + - Connection: A linear mapping layer projects visual features into text representation space. + +- Training: + 1. Cross-modal Learning: Focuses on learning the relationship between vision and language using data from CC+SBU+LAION datasets. + 2. Fine-tuning: Utilizes a multimodal fine-tuning dataset built using ChatGPT to enhance text descriptions generated in phase 1. + +.. note:: + NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . + +For more information, see additional sections in the NeMo multimodal language model docs on the left-hand-side menu or in the list below: + +.. toctree:: + :maxdepth: 1 + + datasets + configs + checkpoint + neva + +References +---------- + +.. bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/mllm/neva.rst b/docs/source/multimodal/mllm/neva.rst new file mode 100644 index 000000000000..9e98bb97ed1d --- /dev/null +++ b/docs/source/multimodal/mllm/neva.rst @@ -0,0 +1,160 @@ +NeVA +==== + +Model Introduction +------------------ + +Emerging from the roots of LLaVA (Large Language and Vision Assistant) :cite:`mm-models-llava`, NeVA stands as a pioneering model in the NeMo Multimodal ecosystem. It adeptly fuses large language-centric models, such as NVGPT or LLaMA, with a vision encoder. The training utilizes machine-generated multimodal language-image instruction-following data. Remarkably, even with a limited dataset, NeVA exhibits profound capabilities in deciphering images and adeptly answering queries about them. Its prowess is especially evident in tasks requiring intricate visual comprehension and instruction-following. Intriguingly, NeVA mirrors the capabilities of advanced multimodal models like GPT-4, even when faced with novel images and instructions. + +Building upon LLaVA's foundational principles, NeVA amplifies its training efficiency by harnessing the NeMo LLM framework's features, including model parallelism, activation checkpointing, AMP O2, Flash Attention, and more. + + .. image:: images/llava_arch.jpg + :align: center + :alt: LLaVA model + :scale: 30% + + +Main Language Model +^^^^^^^^^^^^^^^ + +The original LLaVA model incorporates the LLaMA architecture, renowned for its prowess in open-source, language-only instruction-tuning endeavors. LLaMA refines textual input through a process of tokenization and embedding. 
To these token embeddings, positional embeddings are integrated, and the combined representation is channeled through multiple transformer layers. The output from the concluding transformer layer, associated with the primary token, is designated as the text representation. + +In NeMo, the text encoder is anchored in the :class:`~nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel` class. This class is versatile, supporting not only NVGPT models but also LLaMA, LLaMA-2 and other community models, complete with a checkpoint conversion script. Concurrently, the vision model and projection layers enhance the primary language model's word embedding component. For a comprehensive understanding of the implementation, one can refer to the :class:`~nemo.collections.multimodal.models.multimodal_llm.neva.neva_model.MegatronNevaModel` class. + + +Vision Model +^^^^^^^^^^ + +For visual interpretation, NeVA harnesses the power of the pre-trained CLIP visual encoder, ViT-L/14, recognized for its visual comprehension acumen. Images are first partitioned into standardized patches, for instance, 16x16 pixels. These patches are linearly embedded, forming a flattened vector that subsequently feeds into the transformer. The culmination of the transformer's processing is a unified image representation. In the NeMo framework, the NeVA vision model, anchored on the CLIP visual encoder ViT-L/14, can either be instantiated via the :class:`~nemo.collections.multimodal.models.multimodal_llm.clip.megatron_clip_models.CLIPVisionTransformer` class or initiated through the `transformers` package from Hugging Face. + +Projection and Integration +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The encoder retrieves visual features from images and intertwines them with language embeddings using a modifiable projection matrix. This intricate projection translates visual cues into language embedding tokens, seamlessly merging text and imagery. LLaVA-1.5 :cite:`mm-models-liu2023improvedllava` introduces two pivotal enhancements. The integration of an MLP vision-language connector amplifies the system's prowess. Building on the triumphs of MLPs in self-supervised learning, LLaVA-1.5 undergoes a transformative design shift. Transitioning from a linear to a dual-layer MLP projection markedly bolsters LLaVA-1.5's multimodal faculties, empowering the model to adeptly navigate and synergize language and visual elements. + +Architecture Table +------------------ + ++------------------+---------------+------------+--------------------+-----------------+------------+----------------+--------------------------+ +| Base LLM | Vision Encoder| Projection | Encoder Seq Length | Number of Layers| Hidden Size| FFN Hidden Size| Number of Attention Heads| ++==================+===============+============+====================+=================+============+================+==========================+ +| LLaMA-2-13B-Chat | CLIP-L | Linear | 4096 | 40 | 5120 | 13824 | 40 | ++------------------+---------------+------------+--------------------+-----------------+------------+----------------+--------------------------+ +| LLaMA-2-7B-Chat | CLIP-L | Linear | 4096 | 32 | 4096 | 11008 | 32 | ++------------------+---------------+------------+--------------------+-----------------+------------+----------------+--------------------------+ + +Model Configuration +------------------ + +Multimodal Configuration +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: yaml + + mm_cfg: + use_im_start_end: False + +- ``use_im_start_end``: If set to `True`, image start and end tokens will be used before and after image embeddings. + +Language Model Configuration within Multimodal +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + mm_cfg: + llm: + from_pretrained: ${data_dir}/neva/checkpoints/llama-2-13b-chat-tp8.nemo + freeze: False + model_type: llama_2 + +- ``from_pretrained``: Path to the pretrained NeMo language model checkpoint. +- ``freeze``: If set to `True`, the model parameters will not be updated during training. +- ``model_type``: Specifies the type of model, either `nvgpt` or `llama_2`. + +Vision Encoder Configuration within Multimodal +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + mm_cfg: + vision_encoder: + from_pretrained: "openai/clip-vit-large-patch14" + from_hf: True + patch_dim: 14 + hidden_size: 1024 + vision_select_layer: -2 + class_token_length: 1 + freeze: True + +- ``from_pretrained``: Path or name of the pretrained vision encoder. +- ``from_hf``: If set to `True`, the model will be loaded from the Hugging Face model hub. +- ``patch_dim``: Size of the patches the image is divided into. +- ``hidden_size``: Dimensionality of the hidden layers. +- ``vision_select_layer``: Specifies which layer to select from the vision model. +- ``class_token_length``: Length of the classification token. + +Main Language Model Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + mcore_gpt: False + encoder_seq_length: 4096 + position_embedding_type: rope + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + normalization: rmsnorm + bias: False + activation: 'fast-swiglu' + +- ``mcore_gpt``: If set to `True`, the GPTModel from `megatron.core` will be used. +- ``encoder_seq_length``: Sequence length for the main language model encoder. +- ``position_embedding_type``: Type of position embedding used. +- ``num_layers``, ``hidden_size``, ``ffn_hidden_size``, ``num_attention_heads``: Parameters defining the architecture of the main language model. The ``ffn_hidden_size`` is typically 4 times the ``hidden_size``. +- ``hidden_dropout``, ``attention_dropout``, ``ffn_dropout``: Dropout probabilities for the hidden state, attention, and feed-forward layers in the transformer respectively. +- ``normalization``: Type of normalization layers used. +- ``bias``: If set to `True`, bias terms will be used in all weight matrices. +- ``activation``: Activation function used in the model. 
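+To make the "Projection and Integration" design described earlier concrete, the following is a simplified PyTorch sketch, not the NeMo implementation, contrasting the original linear connector with the LLaVA-1.5-style two-layer MLP connector. The dimensions are taken from the 13B configuration above (vision hidden size 1024, LLM hidden size 5120) and the 256-token image length from the dataset configuration; variable names are illustrative.
+
+.. code-block:: python
+
+  import torch
+  import torch.nn as nn
+
+  vision_hidden = 1024   # CLIP ViT-L/14 feature size (mm_cfg.vision_encoder.hidden_size)
+  llm_hidden = 5120      # LLaMA-2-13B hidden size (model.hidden_size)
+
+  # LLaVA v1.0-style connector: a single trainable linear map.
+  linear_projector = nn.Linear(vision_hidden, llm_hidden)
+
+  # LLaVA-1.5-style connector: a two-layer MLP with a GELU in between.
+  mlp_projector = nn.Sequential(
+      nn.Linear(vision_hidden, llm_hidden),
+      nn.GELU(),
+      nn.Linear(llm_hidden, llm_hidden),
+  )
+
+  image_features = torch.randn(1, 256, vision_hidden)  # 256 patch tokens from the vision encoder
+  image_tokens = mlp_projector(image_features)          # projected into the LLM embedding space
+  assert image_tokens.shape == (1, 256, llm_hidden)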
+ +Optimizations +^^^^^^^^^^^^^^ + ++------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Feature | Description | To Enable | ++====================================+=========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================+==================================================================================================================================================================================================================+ +| Data parallelism | Dataset is read concurrently across multiple GPUs or nodes, allowing for faster data loading and processing. | Automatically when training on multi GPUs/nodes | ++------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Tensor parallelism | Each tensor is split up into multiple chunks, allowing for horizontal parallelism across GPUs. This technique, known as TensorParallel (TP), distributes the model's tensors across multiple GPUs. During processing, each shard gets processed separately and in parallel on different GPUs, and the results are synced at the end of the step. This approach is inspired by NVIDIA's Megatron implementation. 
[Reference](https://github.com/NVIDIA/Megatron-LM#distributed-pretraining) | ``model.tensor_model_parallel_size={parallel_size}`` | ++------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Activation Checkpointing | To reduce memory usage, activations of certain layers are cleared and recomputed during a backward pass. This technique is particularly useful for training large models that wouldn't fit in GPU memory using traditional methods. | ``model.activations_checkpoint_granularity=full``, ``model.activations_checkpoint_method=block``, ``model.activations_checkpoint_num_layers={num_layers_to_check}`` | ++------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Selective Activation Checkpointing | Selective granularity version of activation checkpointing. See our paper for details. [Reference](https://arxiv.org/pdf/2205.05198.pdf) | ``model.activations_checkpoint_granularity=selective``, ``model.activations_checkpoint_method=uniform`` | ++------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Bfloat16 Training | Training is conducted in Bfloat16 precision, which offers a balance between the higher precision of FP32 and the memory savings and speed of FP16. 
| ``trainer.precision=bf16`` | ++------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| BF16 O2 | Enables O2-level automatic mixed precision, optimizing Bfloat16 precision for better performance. | ``model.megatron_amp_O2=True`` | ++------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Flash Attention V2 | FlashAttention is a fast and memory-efficient algorithm to compute exact attention. It speeds up model training and reduces memory requirement by being IO-aware. This approach is particularly useful for large-scale models and is detailed further in the repository linked. [Reference](https://github.com/Dao-AILab/flash-attention) | ``model.use_flash_attention=True`` | ++----------------------------------- +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +NeVA Training +-------------- + +NeVA's training encompasses two essential stages that enhance its capacity to comprehend user instructions, understand both language and visual content, and generate accurate responses: + +1. Pre-training for Feature Alignment: In this initial stage, NeVA aligns visual and language features to ensure compatibility. +2. Fine-tuning End-to-End: The second training stage focuses on fine-tuning the entire model, end-to-end. While the visual encoder's weights remain unchanged, both the projection layer's pre-trained weights and the LLM's parameters become subjects of adaptation. This fine-tuning can be tailored to different application scenarios, yielding versatile capabilities. + +References +---------- + +.. 
bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/mm_all.bib b/docs/source/multimodal/mm_all.bib new file mode 100644 index 000000000000..3930484d71e5 --- /dev/null +++ b/docs/source/multimodal/mm_all.bib @@ -0,0 +1,206 @@ +# Imagen +@inproceedings{saharia2022photorealistic, + title={Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding}, + author={Chitwan Saharia and William Chan and Saurabh Saxena and Lala Li and Jay Whang and Emily Denton and Seyed Kamyar Seyed Ghasemipour and Burcu Karagol Ayan and S. Sara Mahdavi and Rapha Gontijo Lopes and Tim Salimans and Jonathan Ho and David J Fleet and Mohammad Norouzi}, + booktitle={Conference on Neural Information Processing Systems (NeurIPS)}, + year={2022}, + doi={10.48550/arXiv.2205.11487} +} + +# DDPM +@misc{ho2020denoising, + title={Denoising Diffusion Probabilistic Models}, + author={Jonathan Ho and Ajay Jain and Pieter Abbeel}, + year={2020}, + eprint={2006.11239}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +# EDM +@misc{karras2022elucidating, + title={Elucidating the Design Space of Diffusion-Based Generative Models}, + author={Tero Karras and Miika Aittala and Timo Aila and Samuli Laine}, + year={2022}, + eprint={2206.00364}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# Make-A-Scene +@misc{gafni2022makeascene, + title={Make-A-Scene: Scene-Based Text-to-Image Generation with Human Priors}, + author={Oran Gafni and Adam Polyak and Oron Ashual and Shelly Sheynin and Devi Parikh and Yaniv Taigman}, + year={2022}, + eprint={2203.13131}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# Stable Diffusion +@misc{rombach2022highresolution, + title={High-Resolution Image Synthesis with Latent Diffusion Models}, + author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer}, + year={2022}, + eprint={2112.10752}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# Parti +@misc{yu2022scaling, + title={Scaling Autoregressive Models for Content-Rich Text-to-Image Generation}, + author={Jiahui Yu and Yuanzhong Xu and Jing Yu Koh and Thang Luong and Gunjan Baid and Zirui Wang and Vijay Vasudevan and Alexander Ku and Yinfei Yang and Burcu Karagol Ayan and Ben Hutchinson and Wei Han and Zarana Parekh and Xin Li and Han Zhang and Jason Baldridge and Yonghui Wu}, + year={2022}, + eprint={2206.10789}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# MUSE +@misc{chang2023muse, + title={Muse: Text-To-Image Generation via Masked Generative Transformers}, + author={Huiwen Chang and Han Zhang and Jarred Barber and AJ Maschinot and Jose Lezama and Lu Jiang and Ming-Hsuan Yang and Kevin Murphy and William T. Freeman and Michael Rubinstein and Yuanzhen Li and Dilip Krishnan}, + year={2023}, + eprint={2301.00704}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# Ins P2P +@misc{insp2p, + Author = {Tim Brooks and Aleksander Holynski and Alexei A. 
Efros}, + Title = {InstructPix2Pix: Learning to Follow Image Editing Instructions}, + Year = {2022}, + Eprint = {arXiv:2211.09800}, +} + +# Dream Booth +@misc{ruiz2023dreambooth, + title={DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation}, + author={Nataniel Ruiz and Yuanzhen Li and Varun Jampani and Yael Pritch and Michael Rubinstein and Kfir Aberman}, + year={2023}, + eprint={2208.12242}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# Control Net +@misc{zhang2023adding, + title={Adding Conditional Control to Text-to-Image Diffusion Models}, + author={Lvmin Zhang and Anyi Rao and Maneesh Agrawala}, + year={2023}, + eprint={2302.05543}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# LLAva +@misc{llava, + Author = {Haotian Liu and Chunyuan Li and Qingyang Wu and Yong Jae Lee}, + Title = {Visual Instruction Tuning}, + Year = {2023}, + Eprint = {arXiv:2304.08485}, +} + +@misc{liu2023improvedllava, + title={Improved Baselines with Visual Instruction Tuning}, + author={Liu, Haotian and Li, Chunyuan and Li, Yuheng and Lee, Yong Jae}, + publisher={arXiv:2310.03744}, + year={2023}, +} + +@misc{minigpt4, + Author = {Deyao Zhu and Jun Chen and Xiaoqian Shen and Xiang Li and Mohamed Elhoseiny}, + Title = {MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models}, + Year = {2023}, + Eprint = {arXiv:2304.10592}, +} + +@misc{flamingo, + Author = {Jean-Baptiste Alayrac and Jeff Donahue and Pauline Luc and Antoine Miech and Iain Barr and Yana Hasson and Karel Lenc and Arthur Mensch and Katie Millican and Malcolm Reynolds and Roman Ring and Eliza Rutherford and Serkan Cabi and Tengda Han and Zhitao Gong and Sina Samangooei and Marianne Monteiro and Jacob Menick and Sebastian Borgeaud and Andrew Brock and Aida Nematzadeh and Sahand Sharifzadeh and Mikolaj Binkowski and Ricardo Barreira and Oriol Vinyals and Andrew Zisserman and Karen Simonyan}, + Title = {Flamingo: a Visual Language Model for Few-Shot Learning}, + Year = {2022}, + Eprint = {arXiv:2204.14198}, +} + +@misc{blip2, + Author = {Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi}, + Title = {BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models}, + Year = {2023}, + Eprint = {arXiv:2301.12597}, +} + +@misc{kosmos1, + Author = {Shaohan Huang and Li Dong and Wenhui Wang and Yaru Hao and Saksham Singhal and Shuming Ma and Tengchao Lv and Lei Cui and Owais Khan Mohammed and Barun Patra and Qiang Liu and Kriti Aggarwal and Zewen Chi and Johan Bjorck and Vishrav Chaudhary and Subhojit Som and Xia Song and Furu Wei}, + Title = {Language Is Not All You Need: Aligning Perception with Language Models}, + Year = {2023}, + Eprint = {arXiv:2302.14045}, +} + +# DECLIP +@misc{li2021declip, + title={Supervision Exists Everywhere: A Data Efficient Contrastive Language-Image Pre-training Paradigm}, + author={Yangguang Li and Feng Liang and Lichen Zhao and Yufeng Cui and Wanli Ouyang and Jing Shao and Fengwei Yu and Junjie Yan}, + year={2021}, + eprint={2110.05208}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://ar5iv.org/abs/2110.05208} +} + +# CLIP +@misc{radford2021learning, + title={Learning Transferable Visual Models From Natural Language Supervision}, + author={Alec Radford and Jong Wook Kim and Chris Hallacy and Aditya Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever}, + year={2021}, + 
eprint={2103.00020}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +# FLAVA +@inproceedings{singh2022flava, + title={FLAVA: A Foundational Language And Vision Alignment Model}, + author={Amanpreet Singh and Ronghang Hu and Vedanuj Goswami and Guillaume Couairon and Wojciech Galuba and Marcus Rohrbach and Douwe Kiela}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + pages={15638--15650}, + year={2022} +} + +# ControlNet GITHUB +@misc{controlnetgithub, + title={Lllyasviel/controlnet}, + url={https://github.com/lllyasviel/ControlNet}, + journal={GitHub}, + author={Lllyasviel, Zhang}, + year={2023} +} + +#DreamBooth Github +@misc{dreamboothdataset, + title={DreamBooth}, + url={https://github.com/google/dreambooth/tree/main/dataset}, + journal={GitHub}, + author={Google}, + year={2023} +} + +#DreamBooth Paper +@misc{dreamboothpaper, + title={DreamBooth: Fine Tuning Text-to-Image Diffusion Models +for Subject-Driven Generation}, + url={https://arxiv.org/abs/2208.12242}, + author={Nataniel Ruiz and Yuanzhen Li and Varun Jampani and Yael Pritch and Michael Rubinstein and Kfir Aberman}, + year={2022}, + archivePrefix={arXiv} +} + +# DreamFusion paper +@misc{poole2022dreamfusion, + title={DreamFusion: Text-to-3D using 2D Diffusion}, + url={https://arxiv.org/abs/2209.14988}, + author={Poole, Ben and Jain, Ajay and Barron, Jonathan T. and Mildenhall, Ben}, + year={2022}, + archivePrefix={arXiv}, +} \ No newline at end of file diff --git a/docs/source/multimodal/nerf/configs.rst b/docs/source/multimodal/nerf/configs.rst new file mode 100644 index 000000000000..96dac7694f6d --- /dev/null +++ b/docs/source/multimodal/nerf/configs.rst @@ -0,0 +1,142 @@ +Common Configuration Files +============================ + +This section describes the NeMo configuration file setup that is specific to models in the MM NeRF collection. For general information +about how to set up and run experiments that is common to all NeMo models (e.g. Experiment Manager and PyTorch Lightning trainer +parameters), see the `Core Documentation <../../core/core.html>`_ section. + +The model section of the NeMo Multimodal NeRF configuration files generally requires information about the dataset, +the background and/or foreground NeRF networks, renderer and the guidance model being used. The sections on +this page cover each of these in more detail. + +Example configuration files for all of the NeMo Multimodal NeRF scripts can be found in the +config directory of the examples ``{NEMO_ROOT/examples/multimodal/generative/nerf/conf}``. + + +Trainer Configuration +--------------------- + +Trainer configuration specifies the arguments for Pytorch Lightning Trainer Object. + +.. code-block:: yaml + + trainer: + devices: 1 # Number of GPUs for distributed, or the list of the GPUs to use e.g. [0, 1] + num_nodes: 1 # Number of nodes for distributed training + precision: 16 # Use 16 to enable or 32 for FP32 precision + max_steps: 10000 # Number of training steps to perform + accelerator: gpu # accelerator to use, only "gpu" is officially supported + enable_checkpointing: False # Provided by exp_manager + logger: False # Provided by exp_manager + log_every_n_steps: 1 # Interval of logging + val_check_interval: 100 # Interval of validation + accumulate_grad_batches: 1 # Accumulates gradients over k batches before stepping the optimizer. + benchmark: False # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware. 
+ enable_model_summary: True # Enable or disable the model summarization + + +Refer to the `Pytorch Lightning Trainer `__ API section +for all possible arguments + + +Experiment Manager Configurations +--------------------------------- + +NeMo Experiment Manager provides convenient way to configure logging, saving, resuming options and more. + +.. code-block:: yaml + + exp_manager: + name: ${name} # The name of the experiment. + exp_dir: /results # Directory of the experiment, if None, defaults to "./nemo_experiments" + create_tensorboard_logger: False # Whether you want exp_manger to create a TensorBoard logger + create_wandb_logger: False # Whether you want exp_manger to create a Wandb logger + wandb_logger_kwargs: # Wandb logger arguments + project: dreamfusion + group: nemo-df + name: ${name} + resume: True + create_checkpoint_callback: True # Whether you want Experiment manager to create a model checkpoint callback + checkpoint_callback_params: # Model checkpoint callback arguments + every_n_epochs: 0 + every_n_train_steps: + monitor: loss + filename: '${name}-{step}' + save_top_k: -1 + always_save_nemo: False + resume_if_exists: True # Whether this experiment is resuming from a previous run + resume_ignore_no_checkpoint: True # Experiment manager errors out if resume_if_exists is True and no checkpoint could be found. This behavior can be disabled, in which case exp_manager will print a message and continue without restoring, by setting resume_ignore_no_checkpoint to True + +Model Configuration +------------------- + +Dataset Configuration +^^^^^^^^^^^^^^^^^^^^^ + +Training, validation, and test parameters are specified using the ``data`` sections in the model +configuration file. Depending on the task, there may be arguments specifying the augmentations +for the dataset, the image resolution, camera parameters and so on. + +Any initialization parameter that is accepted for the Dataset class used in the experiment can be set in the config file. +Refer to the `Datasets <./datasets.html#Datasets>`__ section of the API for a list of Datasets and their respective parameters. + +An example NeRF dataset configuration should look similar to the following: + +.. code-block:: yaml + + model: + data: + train_batch_size: 1 + train_shuffle: false + train_dataset: + _target_: a pytorch Dataset or IterableDataset class + + val_batch_size: 1 + val_shuffle: false + val_dataset: + _target_: a pytorch Dataset or IterableDataset class + + test_batch_size: 1 + test_shuffle: false + test_dataset: + _target_: a pytorch Dataset or IterableDataset class + + +Model Architecture Configurations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Each configuration file should describe the model pipeline and architecture being used for the experiment. 
Here is a list of the modules a NeRF pipeline might use:

+--------------------+-----------------------------------------------------+
| **Module**         | **Description**                                     |
+====================+=====================================================+
| :code:`guidance`   | guidance model                                      |
+--------------------+-----------------------------------------------------+
| :code:`nerf`       | the main network for foreground density and color   |
+--------------------+-----------------------------------------------------+
| :code:`background` | a complementary layer for background color          |
+--------------------+-----------------------------------------------------+
| :code:`material`   | materials network for lighting and shading effects  |
+--------------------+-----------------------------------------------------+
| :code:`renderer`   | rendering layer                                     |
+--------------------+-----------------------------------------------------+

Refer to `DreamFusion <./dreamfusion.html#dreamfusion>`_ for model-specific configurations.


Optimizer Configurations
^^^^^^^^^^^^^^^^^^^^^^^^

.. code-block:: yaml

  optim:
    name: adan
    lr: 5e-3
    eps: 1e-8
    weight_decay: 2e-5
    max_grad_norm: 5.0
    foreach: False


By default we use ``adan`` as the optimizer; refer to the NeMo user guide for all supported optimizers.
diff --git a/docs/source/multimodal/nerf/datasets.rst b/docs/source/multimodal/nerf/datasets.rst new file mode 100644 index 000000000000..3981cef34c9f --- /dev/null +++ b/docs/source/multimodal/nerf/datasets.rst @@ -0,0 +1,81 @@
Datasets
========

.. note:: It is the responsibility of each user to check the content of the dataset, review the applicable licenses, and determine if it is suitable for their intended use. Users should review any applicable links associated with the dataset before placing the data on their machine.


Rays dataset
------------
Ray datasets are specialized data structures designed for applications in computer graphics, notably in 3D reconstruction, neural rendering, and ray tracing.

Ray datasets are characterized by their detailed representation of rays, each defined by an origin point (rays_o) and a direction vector (rays_d).
These datasets are closely tied to specific image dimensions, including height and width, which dictate the resolution and aspect ratio of the target images.
Alongside the core ray data, these datasets typically include additional metadata such as camera parameters, depth values, and color information.
The diversity and complexity of the dataset, encompassing a range of viewpoints and lighting conditions, play a crucial role in capturing the nuances of real-world light behavior.


Random Poses Dataset
^^^^^^^^^^^^^^^^^^^^
The Random Poses Dataset randomly generates camera poses, each translating to a unique set of rays characterized by their origins and directions.
This randomization is key to covering a wide range of potential viewpoints and angles, mimicking a comprehensive exploration of a 3D scene.
This diverse sampling is essential for training robust NeRF models capable of accurately reconstructing and rendering 3D environments from previously unseen angles.

The dataset inherently accounts for the necessary parameters of ray generation, such as the height and width of the target images,
ensuring that the rays are compatible with the specific requirements of the rendering or reconstruction algorithms.
+In addition to the ray origins and directions, the dataset may also include other relevant metadata like camera intrinsic and extrinsic parameters, +contributing to a more detailed and versatile training process. + +An example of RandomPosesDataset usage as a training dataset is shown below: + +.. code-block:: yaml + + model: + data: + train_batch_size: 1 + train_shuffle: false + train_dataset: + _target_: nemo.collections.multimodal.data.nerf.random_poses.RandomPosesDataset + internal_batch_size: 100 + width: 512 + height: 512 + radius_range: [3.0, 3.5] + theta_range: [45, 105] + phi_range: [-180, 180] + fovx_range: [10, 30] + fovy_range: [10, 30] + jitter: False + jitter_center: 0.2 + jitter_target: 0.2 + jitter_up: 0.02 + uniform_sphere_rate: 0 + angle_overhead: 30 + angle_front: 60 + + +Circle Poses Dataset +^^^^^^^^^^^^^^^^^^^^ +Circle Poses Dataset is a specialized ray dataset designed for generating samples of rays in a circular pattern. +The key feature of this dataset is its ability to simulate camera positions arranged along a circular path, focusing on a central point. +This arrangement is particularly useful for capturing scenes from multiple, evenly spaced angles, ensuring a comprehensive view around a central axis. + +The defining parameter of the Circle Poses Dataset is its size, which dictates the number of samples or camera poses around the circle. +A larger size results in more camera positions being generated, offering finer granularity and coverage of the circle. +Each camera pose corresponds to a unique set of rays, with origins and directions calculated based on the position around the circle and the focus on the central point. + +The Circle Poses Dataset is particularly valuable during validation and testing to generate a holistic view of the reconstructed scene. + +An example of CirclePosesDataset usage as a validation dataset is shown below: + +.. code-block:: yaml + + model: + data: + val_batch_size: 1 + val_shuffle: false + val_dataset: + _target_: nemo.collections.multimodal.data.nerf.circle_poses.CirclePosesDataset + size: 5 + width: 512 + height: 512 + angle_overhead: 30 + angle_front: 60 diff --git a/docs/source/multimodal/nerf/dreamfusion.rst b/docs/source/multimodal/nerf/dreamfusion.rst new file mode 100644 index 000000000000..a9f2f630bcdd --- /dev/null +++ b/docs/source/multimodal/nerf/dreamfusion.rst @@ -0,0 +1,310 @@ +DreamFusion +=========== + +Model Introduction +------------------- +DreamFusion :cite:`mm-models-poole2022dreamfusion` uses a pretrained text-to-image diffusion model to perform +text-to-3D synthesis. The model uses a loss based on probability density distillation that enables the use of a 2D +diffusion model as a prior for optimization of a parametric image generator. + +Using this loss in a DeepDream-like procedure, the model optimizes a randomly-initialized 3D model +(a Neural Radiance Field, or NeRF) via gradient descent such that its 2D renderings from random angles achieve a low +loss. The resulting 3D model of the given text can be viewed from any angle, relit by arbitrary illumination, or composited +into any 3D environment. This approach requires no 3D training data and no modifications to the image diffusion model, +demonstrating the effectiveness of pretrained image diffusion models as priors. + +Dreamfusion models can be instantiated using the :class:`~nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion` class. + +.. 
image:: images/dreamfusion_model_overview.png + :align: center + :width: 800px + :alt: DreamFsuion, overview of the model + + +Image guidance +^^^^^^^^^^^^^^ +This section of DreamFusion pertains to the initial phase where the model interprets and translates text inputs into visual concepts. +Utilizing a diffusion based text-to-image model, DreamFusion processes the text input, extracts key visual elements, and translates these into initial 2D images. +The process ensures that the generated 3D models are not only accurate in terms of the text description but also visually coherent and detailed by conditioning +the 2D image based on the view angle. + + +NeRF (foreground) network +^^^^^^^^^^^^^^^^^^^^^^^^^ +The Neural Radiance Fields (NeRF) network is at the heart of DreamFusion's 3D rendering capabilities. +In DreamFusion, the NeRF network takes the 2D images generated from the textual description and constructs a 3D model. +This model is represented as a continuous volumetric scene function, which encodes the color and density of points in space, +allowing for highly detailed and photorealistic renderings. + +Background layer +^^^^^^^^^^^^^^^^ +DreamFusion can leverage a background layer dedicated to background modeling. + +In scenarios where a dynamic background is needed, DreamFusion can be configured to use a secondary NeRF network to generate a background. +This network functions in parallel to the primary NeRF network, focusing on creating a coherent and contextually appropriate backdrop for the main scene. +It dynamically adjusts to lighting and perspective changes, maintaining consistency with the foreground model. + +Alternatively, DreamFusion allows for the integration of a static background color, which is particularly useful in scenarios where the focus is predominantly on the object being generated, and a non-distracting backdrop is desirable. +Implementing a static color background involves setting a uniform chromatic value that encompasses the periphery of the 3D model. +This approach simplifies the rendering process and can be beneficial in reducing computational load while maintaining focus on the primary object. + +Materials network +^^^^^^^^^^^^^^^^^ +The material network in DreamFusion is responsible for adding realism to the 3D models by accurately simulating the physical properties of different materials. +This network takes into account various aspects like texture, reflectivity, and transparency. +By doing so, it adds another layer of detail, making the objects generated by DreamFusion not just structurally accurate but also visually and tactilely realistic. + + +Renderer layer +^^^^^^^^^^^^^^ +The renderer layer functions as the culminating stage in DreamFusion's processing pipeline. +It translates the synthesized volumetric data from the NeRF and material networks into perceptible imagery. +Employing ray-tracing algorithms, this layer computes the interaction of light with the 3D scene, +producing images that exhibit sophisticated attributes like accurate shadow casting, +dynamic lighting, and perspective-correct renderings. + + + +Model Configuration +------------------- + +DreamFusion models can be instantiated using the :class:`~nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion` class. +The model configuration file is organized into the following sections: + +.. 
code-block:: yaml + + _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion + defaults: + - nerf: torchngp + - background: static + - material: basic_shading + - renderer: torchngp_raymarching + - guidance: sd_huggingface + - optim: adan + - loss: dreamfusion + - data: data + - _self_ + + ### model options + resume_from_checkpoint: + prompt: 'a hamburger' + negative_prompt: '' + front_prompt: ', front view' + side_prompt: ', side view' + back_prompt: ', back view' + update_extra_interval: 16 + guidance_scale: 100 + export_video: False + + iters: ${trainer.max_steps} + latent_iter_ratio: 0.2 + albedo_iter_ratio: 0.0 + min_ambient_ratio: 0.1 + textureless_ratio: 0.2 + + data: + train_dataset: + width: 64 + height: 64 + val_dataset: + width: 800 + height: 800 + test_dataset: + width: 800 + height: 800 + +- ``defaults``: Defines default modules for different components like nerf, background, material, etc. +- ``resume_from_checkpoint``: Path to a checkpoint file to initialize the model with. +- ``prompt``: Main textual input for the model describing the object to generate. +- ``negative_prompt``: Textual input describing what to avoid in the generated object. +- ``front_prompt``, ``side_prompt``, ``back_prompt``: Textual inputs that are appended to the prompts for more detailed orientation guidance. +- ``update_extra_interval``: Interval for updating internal module parameters. +- ``guidance_scale``: The guidance scaled used with the diffusion model. +- ``export_video``: Boolean to determine whether to export a 360 video of the generated object. +- ``iters``, ``latent_iter_ratio``, ``albedo_iter_ratio``, ``min_ambient_ratio``, ``textureless_ratio``: Various ratios and parameters defining iteration behavior and visual characteristics of the output. +- ``data``: Defines dataset dimensions for training, validation, and testing. + +The behavior of the pipeline can be precisely adjusted by fine-tuning the parameters of various components in the default section. +Some components support different backends and implementations, the full components catalog can be viewed in the config directory ``{NEMO_ROOT/examples/multimodal/generative/nerf/conf/model}``. + +Image guidance +^^^^^^^^^^^^^^ + +.. code-block:: yaml + + _target_: nemo.collections.multimodal.modules.nerf.guidance.stablediffusion_huggingface_pipeline.StableDiffusion + precision: ${trainer.precision} + model_key: stabilityai/stable-diffusion-2-1-base + t_range: [0.02, 0.98] + +- ``precision``: Sets the precision of computations (e.g., FP32 or FP16). +- ``model_key``: Specifies the pre-trained model to use for image guidance. +- ``t_range``: Range of threshold values for guidance stability. + + +NeRF (foreground) network +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + _target_: nemo.collections.multimodal.modules.nerf.geometry.torchngp_nerf.TorchNGPNerf + num_input_dims: 3 + bound: 1 + density_activation: exp + blob_radius: 0.2 + blob_density: 5 + normal_type: central_finite_difference + + encoder_cfg: + encoder_type: 'hashgrid' + encoder_max_level: + log2_hashmap_size: 19 + desired_resolution: 2048 + interpolation: smoothstep + + sigma_net_num_output_dims: 1 + sigma_net_cfg: + num_hidden_dims: 64 + num_layers: 3 + bias: True + + features_net_num_output_dims: 3 + features_net_cfg: + num_hidden_dims: 64 + num_layers: 3 + bias: True + +Describes the NeRF network's architecture, including the density activation function, network configuration, and the specification of the sigma and features networks. 
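The ``_target_`` entries above follow the Hydra instantiation convention: ``_target_`` names the class to build, and the remaining keys are passed to its constructor. The sketch below illustrates that convention in isolation, using the static background module shown in the next section because it has the smallest configuration. It assumes NeMo, Hydra, and OmegaConf are installed; in practice the NeMo training script performs this instantiation from the full model config, so the snippet is only meant to make the mechanism concrete.

.. code-block:: python

    # Sketch of the ``_target_`` convention used throughout this config
    # (the training script normally does this for you).
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "_target_": "nemo.collections.multimodal.modules.nerf.background.static_background.StaticBackground",
            "background": [0, 0, 1],
        }
    )
    background = instantiate(cfg)  # equivalent to StaticBackground(background=[0, 0, 1])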
+ +Background layer +^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + _target_: nemo.collections.multimodal.modules.nerf.background.static_background.StaticBackground + background: [0, 0, 1] + +Static background, where the background key is the RGB color. + +.. code-block:: yaml + + _target_: nemo.collections.multimodal.modules.nerf.background.torchngp_background.TorchNGPBackground + + encoder_type: "frequency" + encoder_input_dims: 3 + encoder_multi_res: 6 + + num_output_dims: 3 + net_cfg: + num_hidden_dims: 32 + num_layers: 2 + bias: True + +Dynamic background, where the background is generated by a NeRF network. + + +Materials network +^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + _target_: nemo.collections.multimodal.modules.nerf.materials.basic_shading.BasicShading + +Defines the basic shading model for the material network. The basic shading model supports textureless, lambertian and phong shading. + + +Renderer layer +^^^^^^^^^^^^^^ + +.. code-block:: yaml + + _target_: nemo.collections.multimodal.modules.nerf.renderers.torchngp_volume_renderer.TorchNGPVolumeRenderer + bound: ${model.nerf.bound} + update_interval: 16 + grid_resolution: 128 + density_thresh: 10 + max_steps: 1024 + dt_gamma: 0 + +Configures the renderer, specifying parameters like update interval, grid resolution, and rendering thresholds. + + +DreamFusion-DMTet +----------------- +NeRF models integrate geometry and appearance through volume rendering. As a result, +using NeRF for 3D modeling can be less effective when it comes to capturing both the intricate details of a surface as well as +its material and texture. + +DMTet finetunning disentangles the learning of geometry and appearance models, such that both a fine surface and a rich +material/texture can be generated. To enable such a disentangled learning, a hybrid scene representation of +[DMTet](https://nv-tlabs.github.io/DMTet/) is used. + +The DMTet model maintains a deformable tetrahedral grid that encodes a discretized signed distance function and a +differentiable marching tetrahedra layer that converts the implicit signed distance representation to the explicit +surface mesh representation. + + +Model Configuration +^^^^^^^^^^^^^^^^^^^ + +DreamFusion models can be instantiated using the same class as DreamFusion :class:`~nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion`. +However, the following changes to the training pipeline are necessary: + +.. code-block:: yaml + + _target_: nemo.collections.multimodal.models.nerf.dreamfusion.DreamFusion + defaults: + - nerf: torchngp + - background: torchngp + - material: basic_shading + - renderer: nvdiffrast # (1) + - guidance: sd_huggingface + - optim: adan + - loss: dmtet # (2) + - data: data + - _self_ + + ### model options + resume_from_checkpoint: "/results/DreamFusion/checkpoints/DreamFusion-step\=10000-last.ckpt" # (3) + prompt: 'a hamburger' + negative_prompt: '' + front_prompt: ', front view' + side_prompt: ', side view' + back_prompt: ', back view' + update_extra_interval: 16 + guidance_scale: 100 + export_video: False + + iters: ${trainer.max_steps} + latent_iter_ratio: 0.0 + albedo_iter_ratio: 0 + min_ambient_ratio: 0.1 + textureless_ratio: 0.2 + + data: + train_dataset: + width: 512 # (4) + height: 512 # (4) + val_dataset: + width: 800 + height: 800 + test_dataset: + width: 800 + height: 800 + + +We note the following changes: +1. The rendering module was changed from a volumetric based one to a rasterization based one (nvdiffrast). +2. 
The model loss is changed to account for the changes in the geometry representation. +3. DreamFusion-DMTet finetunes a pretrained DreamFusion model, the pretrained checkpoint is provided using ``resume_from_checkpoint``. +4. The training shape is incrased to 512x512 + + +References +---------- + +.. bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- diff --git a/docs/source/multimodal/nerf/images/dreamfusion_model_overview.png b/docs/source/multimodal/nerf/images/dreamfusion_model_overview.png new file mode 100644 index 000000000000..f70b84e932af Binary files /dev/null and b/docs/source/multimodal/nerf/images/dreamfusion_model_overview.png differ diff --git a/docs/source/multimodal/nerf/intro.rst b/docs/source/multimodal/nerf/intro.rst new file mode 100644 index 000000000000..eca057215a75 --- /dev/null +++ b/docs/source/multimodal/nerf/intro.rst @@ -0,0 +1,55 @@ +NeRF +==== +NeMO NeRF is a collection of models and tools for training 3D and 4D models. + +The library is designed with a modular approach, enabling developers to explore and find the most suitable solutions for their requirements, +and allowing researchers to accelerate their experimentation process. + + +Supported Models +----------------- +NeMo NeRF currently supports the following models: + ++----------------------------------------+------------+ +| Model | Categories | ++========================================+============+ +| `DreamFusion <./dreamfusion.html>`_ | text to 3D | ++----------------------------------------+------------+ + + +Spotlight Models +----------------- + +DreamFusion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The `DreamFusion `_ model utilizing pre-trained 2D text-to-image diffusion models to create detailed 3D objects from textual descriptions. +This approach overcomes the limitations of traditional 3D synthesis, which typically requires extensive labeled 3D data and sophisticated denoising architectures. +At the core of DreamFusion is the optimization of a Neural Radiance Field (NeRF), a parametric model for rendering 3D scenes. +The optimization process is driven by a loss function based on probability density distillation, which enables the 2D diffusion model to act as an effective prior. +DreamFusion is capable of producing 3D models that are not only accurate representations of the input text but also offer versatility in terms of rendering from any viewpoint, +relighting under diverse lighting conditions, and integration into various 3D environments. Importantly, this method achieves these results without the need for +specific 3D training data or modifications to the existing image diffusion model. + +- Model Structure: + - Text-to-image model: a pretrained text-to-image diffusion model is used to generate a 2D image from a given text. + - NeRF: a neural radiance field (NeRF) that can generate novel views of complex 3D scenes, based on a partial set of 2D images. + - Renderer: A volume rendering layer is used to render the NeRF model from a given viewpoint. + + +For more information, see additional sections in the NeRF docs on the left-hand-side menu or in the list below: + +.. toctree:: + :maxdepth: 1 + + datasets + configs + dreamfusion + +References +---------- + +.. 
bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- diff --git a/docs/source/multimodal/text2img/checkpoint.rst b/docs/source/multimodal/text2img/checkpoint.rst new file mode 100644 index 000000000000..7e8f7149896d --- /dev/null +++ b/docs/source/multimodal/text2img/checkpoint.rst @@ -0,0 +1,80 @@ +Checkpoints +=========== + +There are three main ways to load pretrained checkpoints in NeMo: + +* Using the :code:`restore_from()` method to load a local checkpoint file (``.nemo``), or +* Converting a partially trained ``.ckpt`` (intermediate) checkpoint to ``.nemo`` format. +* Converting HuggingFace public checkpoints to ``.nemo`` format. + +Refer to the following sections for instructions and examples for each. + +Note that these instructions are for loading fully trained checkpoints for evaluation or fine-tuning. + +Loading ``.nemo`` Checkpoints +------------------------- + +NeMo automatically saves checkpoints of a model that is trained in a ``.nemo`` format. Alternatively, to manually save the model at any +point, issue :code:`model.save_to(.nemo)`. + +If there is a local ``.nemo`` checkpoint that you'd like to load, use the :code:`restore_from()` method: + +.. code-block:: python + + import nemo.collections.multimodal as nemo_multimodal + model = nemo_multimodal.models..restore_from(restore_path="") + +Where the model base class is the MM model class of the original checkpoint. + +Converting Intermediate Checkpoints +--------------------------- +To evaluate a partially trained checkpoint, you may need to convert it to ``.nemo`` format. +`script to convert the checkpoint `. + +.. code-block:: python + + python -m torch.distributed.launch --nproc_per_node= * \ + convert_ckpt_to_nemo.py \ + --checkpoint_folder \ + --checkpoint_name \ + --nemo_file_path \ + --tensor_model_parallel_size \ + --pipeline_model_parallel_size + + +Converting HuggingFace Checkpoints +--------------------------------- + +To fully utilize the optimized training pipeline and framework/TRT inference pipeline +of NeMo, we provide scripts to convert popular checkpoints on HuggingFace into NeMo format. +Once converted, you can perform fine-tuning or inference on such checkpoints. + +Stable Diffusion & ControlNet +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We provide `script to convert the Huggingface checkpoint ` to ``.nemo`` format, which can then be used within our inference pipeline. + + +.. code-block:: python + + python convert_hf_ckpt_to_nemo.py \ + --ckpt_path \ + --hparams_file \ + --nemo_file_path \ + --model_type \ + --nemo_clip_path + + +- ``hparams_file``: Config file to be combined with model weights to generate ``.nemo`` checkpoint. It can be generated from a dummy run and can be found at, for example, ``nemo_experiments/stable-diffusion-train/version_0/hparams.yaml``. + +- ``model_type``: We support converting `stable_diffusion` and `controlnet` checkpoint in this script. + +- ``nemo_clip_path``: It's required only when the ``cond_stage_config`` in ``hparams_file`` refer to a NeMo CLIP model. It will be ignored when ``cond_stage_config`` refer to Hugginface CLIP. See :ref:`sd-config-section` for more details. + + +Imagen +^^^^^^^^^^^^^^ + +We will provide conversion script if Imagen research team releases their checkpoint +in the future. Conversion script for DeepFloyd IF models will be provided in the +next release. 
\ No newline at end of file diff --git a/docs/source/multimodal/text2img/configs.rst b/docs/source/multimodal/text2img/configs.rst new file mode 100644 index 000000000000..3c534044ba40 --- /dev/null +++ b/docs/source/multimodal/text2img/configs.rst @@ -0,0 +1,166 @@ +Common Configuration Files +============================ + +This section describes the NeMo configuration file setup that is specific to models in the MM Text2Img collection. For general information +about how to set up and run experiments that is common to all NeMo models (e.g. Experiment Manager and PyTorch Lightning trainer +parameters), see the `Core Documentation <../../core/core.html>`_ section. + +The model section of the NeMo Multimodal Text2Img configuration files generally requires information about the dataset(s) being used, +the text and image encoder, parameters for any augmentation being performed, as well as the model architecture specification. The sections on +this page cover each of these in more detail. + +Example configuration files for all of the NeMo Multimodal Text2Img scripts can be found in the +`config directory of the examples `_. + + +Dataset Configuration +--------------------- + +Training, validation, and test parameters are specified using the ``train``, ``validation``, and +``test`` sections in the configuration file, respectively. Depending on the task, there may be arguments specifying the augmentations +for the dataset, the resolution filter for filtering out images, and so on. + +Any initialization parameter that is accepted for the Dataset class used in the experiment can be set in the config file. +Refer to the `Datasets <../api.html#Datasets>`__ section of the API for a list of Datasets and their respective parameters. + +An example Text2Img train configuration should look similar to the following: + +.. code-block:: yaml + + model: + data: + num_workers: 16 # The number of workers for dataloader process + train: + dataset_path: # List of wdinfo files for the datasets to train on + - dataset1.pkl + - dataset2.pkl + augmentations: + resize_samllest_side: 64 # Resize the smallest side of the image to the specified resolution + center_crop_h_w: 64, 64 # Center cropping + horizontal_flip: False # Whether to perform horizontal flip + filterings: + resolution: + method: larger + value: 64 + webdataset: + use_webdataset: True + infinite_sampler: false + local_root_path: ??? # Path that stores the dataset + verbose: False # Whether to print detail debugging information + +Currently, our diffusion-based Text2Img models do not require validation steps for faster convergence. +As discussed in `Datasets <./datasets.html>`_, storing training dataset in webdataset format is the requirement for all +text2img training pipeline. Using ``webdataset.infinite_sampler=True`` is the preferred way for training especially if the dataset +is large as suggested by `Webdataset Multinode Training Guideline `_ . + +Enabling ``train.filterings`` allows one to filter out images (and corresponding text pairs) based on some common use cases (e.g., minimum resolution) +without having to create a redundant subset of the webdataset on the disk prior to training. The example above showcases how to filter the dataset so that only images with a resolution +larger than 64x64 will be used for training. Concatenating multiple webdataset is as easy as listing all wdinfo files in +``train.dataset_path``. 
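The filtering example above can be read as follows: with ``method: larger`` and ``value: 64``, an image-text pair is kept only when the image is larger than 64x64. The snippet below is an illustrative stand-in for that check, not NeMo's implementation; the helper name and the strict ``>`` comparison are assumptions made for clarity.

.. code-block:: python

    # Illustrative sketch of the ``filterings.resolution`` semantics above.
    def passes_resolution_filter(width: int, height: int, method: str = "larger", value: int = 64) -> bool:
        if method == "larger":
            # Keep the image-text pair only if both sides exceed ``value`` pixels.
            return width > value and height > value
        raise ValueError(f"Unsupported filtering method: {method}")

    assert passes_resolution_filter(256, 192)       # kept
    assert not passes_resolution_filter(64, 512)    # dropped: one side is not larger than 64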
+ + + + +Trainer Configuration +-------------------------- + +Trainer configuration specifies the arguments for Pytorch Lightning Trainer Object. + +.. code-block:: yaml + + trainer: + devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] + num_nodes: 1 + max_epochs: -1 + max_steps: 2500000 # precedence over max_epochs + logger: False # Provided by exp_manager + precision: bf16 # Should be set to 16 for O1 and O2 to enable the AMP. + accelerator: gpu + log_every_n_steps: 5 # Interval of logging. + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 10 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + enable_checkpointing: False # Provided by exp_manager + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: True + +Refer to the `Pytorch Lightning Trainer `__ API section +for all possible arguments + + +Experiment Manager Configurations +--------------------------- + +NeMo Experiment Manager provides convenient way to configure logging, saving, resuming options and more. + +.. code-block:: yaml + + exp_manager: + exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: # Whether you want exp_manger to create a Wandb logger + name: training-session + project: text2img + group: nemo + resume: True + create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger + create_checkpoint_callback: True # Whether you want exp_manager to create a model checkpoint callback + checkpoint_callback_params: + monitor: reduced_train_loss + save_top_k: 5 + every_n_epochs: 0 # Save checkpoint frequency. + every_n_train_steps: 1000 # Mutually exclusive with every_n_epochs. It is recommended to set this if training on large-scale dataset. + filename: '${name}--{reduced_train_loss:.2f}-{step}-{consumed_samples}' + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + ema: + enable: True + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +EMA feature can be enabled by setting ``exp_manager.ema.enable=True``. + +Optimizer Configurations +------------------------- + +.. code-block:: yaml + + optim: + name: fused_adam + lr: 0.0001 + eps: 1e-8 + betas: [ 0.9, 0.999 ] + weight_decay: 0.01 + sched: + name: WarmupPolicy + warmup_steps: 10000 + warmup_ratio: null + +By default we use ``fused_adam`` as the optimizer, refer to NeMo user guide for all supported optimizers. +Learning rate scheduler can be specified in ``optim.sched`` section. + +Model Architecture Configurations +------------------------ + +Each configuration file should describe the model architecture being used for the experiment. 
Here is a list of the parameters in the ``model`` section that are shared among most of the MM Text2Img models:

+---------------------------+--------------+--------------------------------------------------------------------------------+
| **Parameter**             | **Datatype** | **Description**                                                                |
+===========================+==============+================================================================================+
| :code:`micro_batch_size`  | int          | micro batch size that fits on each GPU                                         |
+---------------------------+--------------+--------------------------------------------------------------------------------+
| :code:`global_batch_size` | int          | global batch size that accounts for gradient accumulation and data parallelism |
+---------------------------+--------------+--------------------------------------------------------------------------------+
| :code:`inductor`          | bool         | enable TorchInductor optimization                                              |
+---------------------------+--------------+--------------------------------------------------------------------------------+
| :code:`channels_last`     | bool         | enable NHWC training format                                                    |
+---------------------------+--------------+--------------------------------------------------------------------------------+
| :code:`seed`              | int          | seed used in training                                                          |
+---------------------------+--------------+--------------------------------------------------------------------------------+
diff --git a/docs/source/multimodal/text2img/controlnet.rst b/docs/source/multimodal/text2img/controlnet.rst new file mode 100644 index 000000000000..8f3155770f82 --- /dev/null +++ b/docs/source/multimodal/text2img/controlnet.rst @@ -0,0 +1,106 @@
ControlNet
==========

Model Introduction
------------------

ControlNet :cite:`mm-models-controlnetgithub` is a neural network structure to control diffusion models by adding extra conditions.
It copies the weights of neural network blocks into a "locked" copy and a "trainable" copy. The "trainable" one learns your condition. The "locked" one preserves your model. In this way, ControlNet can reuse the SD encoder as a deep, strong, robust, and powerful backbone to learn diverse controls.
NeMo Multimodal provides a training pipeline and an example implementation for generating images based on segmentation maps. Users have the flexibility to explore other implementations using their own control input dataset and recipe.

.. image:: ./images/controlnet-structure.png
   :alt: ControlNet structure on stable diffusion (See :cite:`mm-models-controlnetgithub`)


ControlNet Dataset
__________________

ControlNet employs the WebDataset format for data ingestion (see :doc:`Datasets <./datasets>`). Beyond the essential image-text pairs saved in tar files with matching names but distinct extensions (such as 000001.jpg and 000001.txt), ControlNet also requires a control input within the tar files, identifiable by its own extension. By default, the control input should be stored as 000001.png for correct loading and identification in NeMo's implementation.

Model Configuration
-------------------

Even though the original copy of the Stable Diffusion weights is locked, proper configuration settings together with a compatible pre-trained checkpoint are required for initialization. See :ref:`sd-config-section` for more details about ``unet_config``, ``first_stage_config`` and ``cond_stage_config``.

Control Stage Config
^^^^^^^^^^^^^^^^^^^^

..
code-block:: yaml + + control_stage_config: + _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlNet + params: + from_pretrained_unet: /ckpts/v1-5-pruned.ckpt + from_NeMo: False + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + use_linear_in_transformer: False + transformer_depth: 1 + context_dim: 768 + use_checkpoint: False + legacy: False + use_flash_attention: True + +- ``from_pretrained_unet``: Same logic as ``unet_config.from_pretrained``, adjust the from_NeMo based on the checkpoint's source, whether it's from Huggingface or NeMo. + + +- ``control_stage_config``: Outlines the architecture for the trainable copy of U-Net. It's essential that all parameters align with the U-Net checkpoint specified in this section. + +- ``hint_channels``: Represents the channels of input controls, which is 3 in the mentioned example due to the RGB image input having a shape of (H, W, 3). + +ControlNet Training Options +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + model: + control_key: hint + only_mid_control: False + sd_locked: True + ... + + +- ``contorl_key``: Identifier of the control input, ``.png`` files will be converted to dictionary for dataloaders with their keys being ``hint``. + +- ``only_mid_control``: When set to True, during training, only the output from the middle block of the trainable copy will be incorporated into the locked copy. + +- ``sd_locked``: Whether to lock the original stable diffusion weights during training. + + +Optimization related configurations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++--------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+ +| Feature | Description | To Enable | ++==========================+===========================================================================================================+============================================================================================================+ +| Data parallelism | Dataset read concurrently | Automatically when training on multi GPUs/nodes | ++--------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+ +| Activation Checkpointing | Reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass | ``model.unet_config.use_checkpoint=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+ +| Bfloat16 Training | Training in Bfloat16 precision | ``trainer.precision=bf16`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+ +| Flash Attention | Fast and Memory-Efficient Exact Attention with IO-Awareness | ``model.unet_config.use_flash_attention=True`` && 
``model.control_stage_config.use_flash_attention=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+ +| Channels Last | Ordering NCHW tensors in memory preserving dimensions ordering. | ``model.channels_last=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+ +| Inductor | TorchInductor compiler | ``model.inductor=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------+ + + + + +Reference +----------- + +.. bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/text2img/datasets.rst b/docs/source/multimodal/text2img/datasets.rst new file mode 100644 index 000000000000..d1efa0322863 --- /dev/null +++ b/docs/source/multimodal/text2img/datasets.rst @@ -0,0 +1,40 @@ +Datasets +======== + +Data pipeline overview +----------------- + +.. note:: It is the responsibility of each user to check the content of the dataset, review the applicable licenses, and determine if it is suitable for their intended use. Users should review any applicable links associated with the dataset before placing the data on their machine. + +For all text2img multimodal models, we provide a generic pipeline as detailed below to download and prepare the dataset. +The pipeline is suitable for any multimodal datasets hosted on the HuggingFace data repository +where the data is stored as one or more parquet files. The pipeline processes the dataset into the +WebDataset format, consisting of tar files of equal sizes for efficient training. + +The 6 sub-stages are as follows. + + #. download_parquet: Parquet files consisting of text (captions) and image URLs are downloaded from a HuggingFace repository. + + #. download_images: The images are downloaded from their respective URLs and, along with the captions, are packed into tar files following the Webdataset format. + + #. reorganize_tar: (Optional) Due to a variety of reasons (such as unstable network or removal of images), some images may fail to download, resulting in uneven tar files with varying number of examples each. If you are using a training sampler that does not support uneven tar files, you need to re-organize the contents of the tar files so that each one contains an equal number of image-text pairs. + + #. precache_encodings: (Optional) If you are training a model with frozen encoders (e.g. Stable Diffusion), you have the option to precache (precompute) image and/or text encodings (embeddings) in this sub-stage. Precaching these encodings can significantly enhance training throughput. + + #. generate_wdinfo: (Optional) The wdinfo.pkl file, which stores information on dataset shards, is generated. + +Depending on your specific circumstance, not all sub-stages need to be run all at once. 
+For example, for parquet datasets not hosted on HuggingFace or those whose format is not parquet, +sub-stages 2-5 can be used to process locally downloaded datasets. +For webdatasets already downloaded locally, sub-stages 4-5 can be used to precache the encoding to reduce training time. +For models that encode image and text on-the-fly, only sub-stages 1-3 need to be run. + +Instruction for configuring each sub-stage is provided as a comment next to each field in +`download_multimodal.yaml `_ + + +Examples of Preparing a Dataset for Training Text2Img Model +----------------------- + +Refer to the `Dataset Tutorial `_` for details on how to prepare the training dataset for Training Text2Img models. + diff --git a/docs/source/multimodal/text2img/dreambooth.rst b/docs/source/multimodal/text2img/dreambooth.rst new file mode 100644 index 000000000000..438615676c62 --- /dev/null +++ b/docs/source/multimodal/text2img/dreambooth.rst @@ -0,0 +1,132 @@ +DreamBooth +=================== + + +Model Introduction +-------------------- + +DreamBooth :cite:`mm-models-dreamboothpaper` is a fine-tuning technique and a solution to personalize large diffusion models like Stable Diffusion, which are powerful but lack the +ability to mimic subjects of a given reference set. With DreamBooth, you only need a few images of a specific subject to +fine-tune a pretrained text-to-image model, so that it learns to bind a unique identifier with a special subject. This +unique identifier can then be used to synthesize fully-novel photorealistic images of the subject contextualized in +different scenes. + +NeMo's Dreambooth is built upon the Stable Diffusion framework. While its architecture mirrors Stable Diffusion (refer to :ref:`sd-config-section`), the distinction lies in its training process, specifically when utilizing a different dataset and incorporating the prior preservation loss when necessary. + +- Prior Preservation Loss +When finetuning large pretrained language models on specific tasks or text-to-image diffusion models on a small dataset, problems like language drift and decreased output variety often arise. The concept of the prior preservation loss is straightforward: it guides the model using its self-generated samples and incorporates the discrepancy between the model-predicted noise on these samples. The influence of this loss component can be adjusted using model.prior_loss_weight. + +.. code-block:: python + + model_pred, model_pred_prior = torch.chunk(model_output, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean") + prior_loss = torch.nn.functional.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + loss = loss + prior_loss * self.prior_loss_weight + + +- Training Dataset +NeMo's Dreambooth model dataset is different from other NeMo multimodal models in that it doesn't necessitate data stored in the webdataset format. You can find a sample dataset at :cite:`mm-models-dreamboothdataset`. For each object you aim to integrate into the model, just place its images (typically 3-5) in a folder and specify its path in ``model.data.instance_dir``. When training with the prior preservation loss, store images produced by the original model in a distinct folder and reference its path in ``model.data.regularization_dir``. This process is automated in NeMo's DreamBooth implementation. 
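To make the folder convention above concrete, the following sketch pairs every image found under ``model.data.instance_dir`` with the single ``model.data.instance_prompt``. It is a hypothetical stand-alone loader written for illustration, not NeMo's DreamBooth dataset class.

.. code-block:: python

    # Hypothetical sketch of the instance-image convention described above:
    # a handful of subject images, all sharing one instance prompt.
    from pathlib import Path

    from PIL import Image
    from torch.utils.data import Dataset


    class InstanceImageDataset(Dataset):
        def __init__(self, instance_dir: str, instance_prompt: str):
            self.image_paths = sorted(
                p for p in Path(instance_dir).iterdir() if p.suffix.lower() in {".jpg", ".jpeg", ".png"}
            )
            self.instance_prompt = instance_prompt

        def __len__(self):
            return len(self.image_paths)

        def __getitem__(self, idx):
            image = Image.open(self.image_paths[idx]).convert("RGB")
            return {"image": image, "caption": self.instance_prompt}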
+ +Model Configuration +-------------------- + +Pleaser refer to :ref:`sd-config-section` for how to configure Stable Diffusion. Here we show DreamBooth-specific configurations. + +Prior Preservation Loss +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + model: + with_prior_preservation: False + prior_loss_weight: 0.5 + train_text_encoder: False + restore_from_path: /ckpts/nemo-v1-5-188000-ema.nemo #This ckpt is only used to generate regularization images, thus .nemo ckpt is needed + + data: + instance_dir: /datasets/instance_dir + instance_prompt: a photo of a sks dog + regularization_dir: /datasets/nemo_dogs + regularization_prompt: a photo of a dog + num_reg_images: 10 + num_images_per_prompt: 4 + resolution: 512 + center_crop: True + + +- ``train_text_encoder``: Dictates if the text encoder should be finetuned alongside the U-Net. + +- ``with_prior_preservation``: Depending on its setting, this influences how the model behaves with respect to the regularization data. If set to ``False``, both ``model.prior_loss_weight`` and ``model.restore_from_path`` will be disregarded. If set to ``True``, the actions will differ based on the number of images present in ``model.data.regularization_dir``: + + #. If the count is fewer than ``model.data.num_reg_images``: + + + ``model.restore_from_path`` should be provided with a `.nemo` checkpoint, allowing the inference pipeline to produce regularization images. + + ``model.data.num_images_per_prompt`` is analogous to the inference batch size and indicates the number of images generated in one pass, restricted by GPU capabilities. + + ``model.regularization_prompt`` determines the text prompt for the inference pipeline to generate images. It's generally a variant of ``model.data.instance_prompt`` minus the unique token. + + Once all above parameters are satisfied, the inference pipeline will run until the required image count is achieved in the regularization directory. + + #. If the count matches or exceeds ``model.data.num_reg_images`` + + + Training will proceed without calling inference pipeline, and the parameters mentioned above will be ignored. 
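The decision logic in the list above can be summarized with the following sketch. The helper and the ``generate_images`` callable are hypothetical placeholders for the inference pipeline restored from ``model.restore_from_path``; the sketch only mirrors the counting behaviour described in this section.

.. code-block:: python

    # Hypothetical sketch of the prior-preservation bookkeeping described above.
    from pathlib import Path


    def ensure_regularization_images(regularization_dir, num_reg_images, regularization_prompt,
                                     num_images_per_prompt, generate_images):
        """``generate_images(prompt, n)`` stands in for the inference pipeline and is
        expected to write ``n`` images into ``regularization_dir``."""
        reg_dir = Path(regularization_dir)
        reg_dir.mkdir(parents=True, exist_ok=True)
        existing = [p for p in reg_dir.iterdir() if p.suffix.lower() in {".jpg", ".jpeg", ".png"}]
        missing = num_reg_images - len(existing)
        while missing > 0:
            batch = min(missing, num_images_per_prompt)  # bounded by GPU memory, like the inference batch size
            generate_images(regularization_prompt, batch)
            missing -= batch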
+ +Optimization related configurations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Feature | Description | To Enable | ++==========================+===========================================================================================================+=================================================+ +| Data parallelism | Dataset read concurrently | Automatically when training on multi GPUs/nodes | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Activation Checkpointing | Reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass | ``model.unet_config.use_checkpoint=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Bfloat16 Training | Training in Bfloat16 precision | ``trainer.precision=bf16`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Flash Attention | Fast and Memory-Efficient Exact Attention with IO-Awareness | ``model.unet_config.use_flash_attention=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Channels Last | Ordering NCHW tensors in memory preserving dimensions ordering. | ``model.channels_last=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Inductor | TorchInductor compiler | ``model.inductor=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ + + +Training with Cached Latents +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + model: + use_cached_latents: True + + data: + num_workers: 4 + instance_dir: /datasets/instance_dir + instance_prompt: a photo of a sks dog + regularization_dir: /datasets/nemo_dogs + regularization_prompt: a photo of a dog + cached_instance_dir: #/datasets/instance_dir_cached + cached_reg_dir: #/datasets/nemo_dogs_cached + + +- ``use_cached_latents``: Determines whether to train using online encoding or pre-cached latents. + +- ``cached_instance_dir``: + + + If ``use_cached_latents`` is enabled and these directories with latents in `.pt` format are specified, training will utilize the latents rather than the original images. + + If a cached directory isn't provided or the number of latent files doesn't match the original image count, the Variational Auto Encoder will compute the image latents before training, and the results will be saved on the disk. + +- ``cached_reg_dir``: + + The logic is consistent with above, contingent on the model.with_prior_preservation setting. + + + + + +Reference +----------- + +.. 
bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- diff --git a/docs/source/multimodal/text2img/imagen.rst b/docs/source/multimodal/text2img/imagen.rst new file mode 100644 index 000000000000..1e065c738bb4 --- /dev/null +++ b/docs/source/multimodal/text2img/imagen.rst @@ -0,0 +1,287 @@ +Imagen +======== + +Model Introduction +------------------- + +Imagen :cite:`mm-models-saharia2022photorealistic` is a multi-stage text-to-image diffusion model with an unprecedented +degree of photorealism and a deep level of language understanding. Given a text prompt, +Imagen first generates an image at a 64x64 resolution and then upsamples the generated image to 256x256 and 1024x1024 +resolutions, all using diffusion models. + + .. image:: images/imagen_arch.png + :align: center + :alt: imagen model + :scale: 50% + +Imagen models can be instantiated using the :class:`~nemo.collections.multimodal.models.text_to_image.imagen.imagen.MegatronImagen` class. + +Text Encoder +^^^^^^^^^^^^^^^ + +Imagen employs a text encoder, typically T5, to encode textual features. +To enhance efficiency, we strongly recommend preprocessing the training dataset with pre-cached embeddings, +given the substantial size of T5 encoders. Loading encoders during training can lead to a notable reduction in training time. + +UNet +^^^^^^^^^^ + +Imagen has two types of UNet: Regular UNet and EfficientUNet. + +Regular UNet +~~~~~~~~~~~~ +Regular UNet is used for Imagen base64 model. You can also use regular UNet for SR models +(see example config file `sr256-400m-edm.yaml `_), but this typically +results in a larger memory footprint during training for the same model size. + +Recommended UNet size for base64 and SR256 models are listed below: + ++--------------+------------+-----------------------------+------------------------------------+---------------+ +| Model | Resolution | Hidden Size (``embed_dim``) | Text Condition Size (``cond_dim``) | UNet Size (M) | ++==============+============+=============================+====================================+===============+ +| 500m_res_64 | 64x64 | 256 | 512 | 524 | ++--------------+------------+-----------------------------+------------------------------------+---------------+ +| 2b_res_64 | 64x64 | 512 | 2048 | 2100 | ++--------------+------------+-----------------------------+------------------------------------+---------------+ +| 400m_res_256 | 256x256 | 128 | 512 | 429 | ++--------------+------------+-----------------------------+------------------------------------+---------------+ + + +Efficient UNet +~~~~~~~~~~~~~ + +Efficient UNet is based on Regular UNet with the following modifications: + + #. Shift the model parameters from the high resolution blocks to the low resolution blocks, via adding more residual blocks for the lower resolutions + #. Scaling skip connection by 1/sqrt(2) + #. perform downsampling operation **before** convolution and perform upsampling operation **after** convolution. + +With the aforementioned modifications, Efficient UNet can converge more rapidly and with greater memory efficiency. +The Imagen paper states that such a modification has no discernible impact on convergence. +However, our empirical findings reveal that the Regular UNet yields slightly better visual quality. +Metric-wise, they exhibit similar quality based on FID-CLIP evaluation. 
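+
+As a minimal illustration of the second and third modifications, the following PyTorch-style sketch (not the NeMo implementation) downsamples **before** the convolution and scales the incoming skip connection by ``1/sqrt(2)``; the block and its names are hypothetical.
+
+.. code-block:: python
+
+    import math
+    from typing import Optional
+
+    import torch
+    import torch.nn as nn
+
+    class EfficientDownBlock(nn.Module):
+        """Illustrative sketch of two Efficient UNet tricks; not NeMo's UNet code."""
+
+        def __init__(self, in_ch: int, out_ch: int):
+            super().__init__()
+            self.down = nn.AvgPool2d(kernel_size=2)              # downsample first ...
+            self.conv = nn.Conv2d(in_ch, out_ch, 3, padding=1)   # ... then convolve
+
+        def forward(self, x: torch.Tensor, skip: Optional[torch.Tensor] = None) -> torch.Tensor:
+            if skip is not None:
+                # Scale skip connections by 1/sqrt(2) to keep activation variance stable.
+                x = (x + skip) / math.sqrt(2)
+            return self.conv(self.down(x))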
+
+Recommended Efficient UNet sizes for the SR256 and SR1024 models are listed below:
+
++---------------+------------+-----------------------------+------------------------------------+-----------------+---------------+
+| Model         | Resolution | Hidden Size (``embed_dim``) | Text Condition Size (``cond_dim``) | Attention Block | UNet Size (M) |
++===============+============+=============================+====================================+=================+===============+
+| 600m_res_256  | 256x256    | 128                         | 512                                | Fuse Attention  | 646           |
++---------------+------------+-----------------------------+------------------------------------+-----------------+---------------+
+| 400m_res_1024 | 1024x1024  | 128                         | 512                                | Cross Attention | 427           |
++---------------+------------+-----------------------------+------------------------------------+-----------------+---------------+
+
+
+Noise Scheduling / Sampler
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+NeMo Imagen supports two types of noise scheduling: Continuous DDPM :cite:`mm-models-nichol2021improved` and EDM :cite:`mm-models-karras2022elucidating`.
+
+Denoising diffusion probabilistic models (DDPM) :cite:`mm-models-ho2020denoising`
+represent the most widely adopted noise-scheduling approach among diffusion models.
+Continuous DDPM introduces several modifications to the standard DDPM framework,
+with the most significant change being the transition from a discrete noise space to a continuous one.
+
+"Elucidating the Design Space of Diffusion-Based Generative Models" (EDM) proposes an enhanced noise-level distribution
+strategy during training. It also identifies the optimal time discretization for sampling and
+incorporates a higher-order Runge-Kutta method into the sampling process.
+
+Model Configuration
+-------------------
+
+Text Encoder
+^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    model:
+      conditioning:
+        embed_dim: 1024
+        token_length: 128
+        drop_rate: 0.1
+        precached_key: embeddings_t5_xxl
+        out_key: t5_text
+
+``embed_dim`` represents the text feature dimension after encoding. For T5, the dimension is either 1024 or 4096.
+``token_length`` specifies the maximum context length. All precached text features will be either trimmed or padded to match this specified length.
+``drop_rate`` defines the rate at which random text segments are dropped during training.
+``precached_key`` specifies the key name (here ``embeddings_t5_xxl``) associated with the precached features in the dataset.
+
+When using online encoding:
+
+.. code-block:: yaml
+
+    model:
+      conditioning:
+        online_encoding: True
+        encoder_path: ???
+        embed_dim: 1024
+        token_length: 128
+        drop_rate: 0.1
+
+Set ``online_encoding=True`` and provide the text encoder path in ``encoder_path``. The text encoder will then be loaded
+during training to generate text embeddings for the raw text in the dataset.
+
+Regular UNet
+^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    unet_type: base
+    unet:
+      embed_dim: 256
+      image_size: 64
+      channels: 3
+      num_res_blocks: 3
+      channel_mult: [ 1, 2, 3, 4 ]
+      num_attn_heads: 4
+      per_head_channels: 64
+      cond_dim: 512
+      attention_type: fused
+      feature_pooling_type: attention
+      learned_sinu_pos_emb_dim: 0
+      attention_resolutions: [ 8, 16, 32 ]
+      dropout: False
+      use_null_token: False
+      init_conv_kernel_size: 3
+      gradient_checkpointing: False
+      scale_shift_norm: True
+      stable_attention: True
+      flash_attention: False
+      resblock_updown: False
+      resample_with_conv: True
+
+
+To configure the UNet model, set ``unet_type`` to ``base`` for the regular UNet base model or ``sr-unet`` for
+the super-resolution (SR) model.
The ``embed_dim`` parameter denotes the base number of channels in each ResBlock. + +At each level in the UNet architecture, ``num_res_blocks`` defines the number of ResBlocks for that level, +while ``channel_mult`` is employed in combination with ``embed_dim`` to determine the number of channels at different levels. +``cond_dim`` specifies the size of the conditioning projection. + +Imagen supports two methods of time embedding: either learned time positional embedding or unlearned (fixed). +To use unlearned embedding, set ``learned_sinu_pos_emb_dim`` to 0; for learned embedding, use a positive number. + +The ``feature_pooling_type`` parameter specifies the pooling method, which can be either ``attention`` or ``mean``. + +If you wish to enable model dropout (note that this is different from the text dropout in conditioning), +set the ``dropout`` parameter. When ``resblock_updown`` is set to False, it indicates the use of ResBlocks for +downsampling and upsampling, as opposed to Torch's upsample and downsample functions without learnable weights. +If ``resblock_updown`` is ``False``, you can use ``resample_with_conv`` to determine whether an additional convolutional layer +is needed in addition to pooling and convolution transpose operations. + + +Efficient UNet +^^^^^^^^^^^^^^ + +.. code-block:: yaml + + unet_type: sr + unet: + embed_dim: 128 + image_size: 256 + channels: 3 + channel_mult: [ 1, 2, 4, 8, 8 ] + num_attn_heads: 8 + per_head_channels: 64 + attention_type: stacked + atnn_enabled_at: [ 0, 0, 0, 1, 1 ] + feature_pooling_type: attention + stride: 2 + num_resblocks: [ 2, 4, 8, 8, 8 ] + learned_sinu_pos_emb_dim: 0 + use_null_token: False + init_conv_kernel_size: 3 + gradient_checkpointing: False + scale_shift_norm: True + stable_attention: False + flash_attention: False + skip_connection_scaling: True + +Many of the arguments remain consistent with those for the Regular UNet. +To configure the Efficient UNet SR model training, you should set ``unet_type`` to ``sr``. +When using the Efficient UNet SR model, ``num_resblocks`` can be specified as a list to define varying numbers +of ResBlocks for each level. Additionally, you have the option to enable ``skip_connection_scaling``, +which scales the skip connections, as detailed in the Imagen paper. + +Attention Blocks +^^^^^^^^^^^^^ + +Imagen's UNet incorporates multiple attention blocks to effectively handle text embeddings. +The following arguments in the UNet configurations pertain to these attention blocks: + +.. code-block:: yaml + + unet: + attention_type: stacked + attention_resolutions: [8, 16, 32] + stable_attention: False + flash_attention: False + +NeMo Imagen has the following ``attention_type`` implemented: + + #. ``self``: Multi-head self attention block + #. ``cross``: Multi-head cross attention block. Imagen paper uses this implementation for SR1024 model. + #. ``stacked``: Attention blocks that stack one ``self`` attention and ``cross`` attention + #. ``fused``: Attention blocks that fuses one ``self`` attention and ``cross`` attention. Imagen paper uses this implementation for base64 and SR256 model. + +Attention blocks can be integrated at various levels within the UNet by specifying the attention_resolutions. +The option ``stable_attention`` facilitates the computation of attention block backpropagation in a more +numerically stable manner. You can control whether to utilize the optimized FlashAttention by setting the ``flash_attention`` parameter. 
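+
+As a quick sanity check on how these settings interact, the snippet below computes the per-level channel widths from ``embed_dim`` and ``channel_mult`` for the example base-model values shown earlier. Treating ``attention_resolutions`` as feature-map sizes is an assumption made here for illustration; the exact interpretation should be checked against the configuration you use.
+
+.. code-block:: python
+
+    # Example values from the regular UNet configuration above (base64 model).
+    image_size = 64
+    embed_dim = 256
+    channel_mult = [1, 2, 3, 4]
+    attention_resolutions = [8, 16, 32]
+
+    for level, mult in enumerate(channel_mult):
+        channels = embed_dim * mult               # 256, 512, 768, 1024
+        resolution = image_size // (2 ** level)   # 64, 32, 16, 8
+        # Assumption: a level receives attention blocks when its feature-map
+        # resolution appears in attention_resolutions.
+        has_attention = resolution in attention_resolutions
+        print(f"level={level} channels={channels} resolution={resolution} attention={has_attention}")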
+ + +Scheduling +^^^^^^^^^^^^ + +To train NeMo Imagen with EDM, set ``preconditioning_type=EDM`` and use the suggested parameters from EDM paper: + +.. code-block:: yaml + + preconditioning_type: EDM + preconditioning: + loss_type: l2 + sigma_data: 0.5 + p_mean: -1.2 + p_std: 1.2 + +Note for EDM scheduling, UNet is trained to predict the denoise image rather than the noise itself. supported ``loss_type`` are ``l1``, ``l2``, +and ``huber``. + +.. code-block:: yaml + + preconditioning_type: DDPM + preconditioning: + loss_type: l2 + pred_objective: noise + noise_schedule: cosine + timesteps: 1000 + +Setting ``preconditioning_type=DDPM`` allows user to train UNet with continous DDPM scheduling. ``pred_objective`` can +be either ``noise`` or ``x_start``. We currently support ``linear`` and ``cosine`` modes for ``noise_schedule``. + +Training Optimizations +^^^^^^^^^^^^^^ ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Feature | Description | To Enable | ++==========================+==============================================================================================================================================================================================================================================================================================================================================+=================================================+ +| Data parallelism | Dataset is read concurrently across multiple GPUs or nodes, allowing for faster data loading and processing. | Automatically when training on multi GPUs/nodes | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Activation Checkpointing | To reduce memory usage, activations of certain layers are cleared and recomputed during a backward pass. This technique is particularly useful for training large models that wouldn't fit in GPU memory using traditional methods. | ``model.unet.gradient_checkpointing=True`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Bfloat16 Training | Training is conducted in Bfloat16 precision, which offers a balance between the higher precision of FP32 and the memory savings and speed of FP16. 
| ``trainer.precision=bf16`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Flash Attention | FlashAttention is a fast and memory-efficient algorithm to compute exact attention. It speeds up model training and reduces memory requirement by being IO-aware. This approach is particularly useful for large-scale models and is detailed further in the repository linked. [Reference](https://github.com/Dao-AILab/flash-attention) | ``model.unet.flash_attention=True`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Channels Last | ordering NCHW tensors in memory preserving dimensions ordering. | ``model.channels_last=True`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Inductor | TorchInductor compiler | ``model.inductor=True`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ + + +Reference +----------- + +.. bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/text2img/images/controlnet-structure.png b/docs/source/multimodal/text2img/images/controlnet-structure.png new file mode 100644 index 000000000000..2eb3a8567e52 Binary files /dev/null and b/docs/source/multimodal/text2img/images/controlnet-structure.png differ diff --git a/docs/source/multimodal/text2img/images/imagen_arch.png b/docs/source/multimodal/text2img/images/imagen_arch.png new file mode 100644 index 000000000000..530e76946ebf Binary files /dev/null and b/docs/source/multimodal/text2img/images/imagen_arch.png differ diff --git a/docs/source/multimodal/text2img/insp2p.rst b/docs/source/multimodal/text2img/insp2p.rst new file mode 100644 index 000000000000..20e68f5742e3 --- /dev/null +++ b/docs/source/multimodal/text2img/insp2p.rst @@ -0,0 +1,84 @@ +InstructPix2Pix +=================== + +Model Introduction +-------------------- + +InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p` offers a unique approach to image editing using human-written instructions. 
Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal provides a training pipeline for this conditional diffusion model, using a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates quickly, editing images within seconds, and eliminates the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions.
+
+Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2Pix (:class:`nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit`) is derived directly from Stable Diffusion's :class:`nemo.collections.multimodal.models.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion`, with alterations to accommodate the dataset and to support dual guidance.
+
+Training Dataset
+--------------------
+
+The dataset for NeMo's InstructPix2Pix model stands out among NeMo multimodal models, as it does not require data to be stored in the webdataset format. Users are advised to verify the dataset's content, assess the relevant licenses, and ensure its appropriateness for their use. Before downloading, it is essential to review any links associated with the dataset.
+
+For instructions on downloading and preparing the custom dataset for training InstructPix2Pix, refer to the official `Instruct-Pix2Pix Repository `_.
+
+Model Configuration
+-------------------
+
+Data Configuration
+^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    data:
+      data_path: ???
+      num_workers: 2
+
+- ``data_path``: Path to the instruct-pix2pix dataset. Users are required to specify this path. Further details on the dataset are available at the `Instruct-Pix2Pix Repository `_.
+- ``num_workers``: Number of worker subprocesses used for data loading.
+
+Essential Model Configuration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    model:
+      first_stage_key: edited
+      cond_stage_key: edit # txt for cifar, caption for pbss
+
+      unet_config:
+        _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel
+        from_pretrained:
+        in_channels: 8
+
+- ``first_stage_key``: Key for the model's initial processing stage. Set to `edited` for InstructPix2Pix.
+- ``cond_stage_key``: Key for the model's conditional stage. Set to `edit` for InstructPix2Pix.
+- ``unet_config``: Configuration parameters for the UNet model within the NeMo collection.
+
+  - ``_target_``: Designates the target module for the UNet model in the NeMo collection.
+  - ``from_pretrained``: (Value not provided) Generally indicates the path or identifier of a pretrained model.
+  - ``in_channels``: Specifies the number of input channels for the UNet model. Here, the value is set to 8, with the first 4 channels dedicated to image guidance (see the sketch below).
+
+Additional model configurations align with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`).
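+
+As a rough illustration of why ``in_channels`` is 8, the latents of the edited image being denoised and the encoded guidance image (4 latent channels each) are concatenated along the channel dimension before entering the U-Net. The snippet below is only a sketch of that idea, not NeMo's actual forward pass; the tensor names are hypothetical.
+
+.. code-block:: python
+
+    import torch
+
+    batch, h, w = 2, 64, 64
+    noisy_latents = torch.randn(batch, 4, h, w)      # latents of the edited image being denoised
+    image_guidance = torch.randn(batch, 4, h, w)     # VAE-encoded latents of the input (guidance) image
+
+    # Channel-wise concatenation yields the 8-channel U-Net input (4 + 4 = 8).
+    unet_input = torch.cat([noisy_latents, image_guidance], dim=1)
+    assert unet_input.shape == (batch, 8, h, w)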
+ +Optimization related configurations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Feature | Description | To Enable | ++==========================+===========================================================================================================+=================================================+ +| Data parallelism | Dataset read concurrently | Automatically when training on multi GPUs/nodes | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Activation Checkpointing | Reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass | ``model.unet_config.use_checkpoint=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Bfloat16 Training | Training in Bfloat16 precision | ``trainer.precision=bf16`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Flash Attention | ast and Memory-Efficient Exact Attention with IO-Awareness | ``model.unet_config.use_flash_attention=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Channels Last | ordering NCHW tensors in memory preserving dimensions ordering. | ``model.channels_last=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Inductor | TorchInductor compiler | ``model.inductor=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ + + +References +---------- + +.. 
bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- + + diff --git a/docs/source/multimodal/text2img/intro.rst b/docs/source/multimodal/text2img/intro.rst new file mode 100644 index 000000000000..39ce33562d50 --- /dev/null +++ b/docs/source/multimodal/text2img/intro.rst @@ -0,0 +1,99 @@ +Text to Image Models +==================== + + +Supported Models +----------------- +NeMo Multimodal currently supports the following models: + ++----------------------------------------+------------+ +| Model | Categories | ++========================================+============+ +| `Stable Diffusion <./sd.html>`_ | Foundation | ++----------------------------------------+------------+ +| `Imagen <./imagen.html>`_ | Foundation | ++----------------------------------------+------------+ +| `DreamBooth <./dreambooth.html>`_ | Finetune | ++----------------------------------------+------------+ +| `ControlNet <./controlnet.html>`_ | Finetune | ++----------------------------------------+------------+ +| `instructPix2Pix <./insp2p.html>`_ | Finetune | ++----------------------------------------+------------+ + + +Text2Img Foundation Models +-------------------------- +Text-to-image models are a fascinating category of artificial intelligence models that aim to generate realistic images from textual descriptions. The mainstream text-2-image models can be broadly grouped into: + +#. **Diffusion Based Models**: these models leverage diffusion processes to + generate images from text and may operate in the latent space (Stable Diffusion :cite:`mm-models-rombach2022highresolution`) or directly in the pixel space (Imagen :cite:`mm-models-saharia2022photorealistic`). These models typically use probabilistic models to model the generation process. + They consider the sequential diffusion of information, which helps them generate images in a more coherent and controlled manner. + This approach is known for producing high-quality and diverse images while incorporating textual descriptions. + +#. **Autoregressive Based Models**: like Parti :cite:`mm-models-yu2022scaling` + and Make-A-Scene :cite:`mm-models-gafni2022makeascene`, generate images one pixel or region at a time. + These models take in the text description and gradually build the image pixel by pixel or element by element in + an autoregressive manner. While this approach can produce detailed images, it can be computationally expensive + and may not scale well for high-resolution images. + + +#. **Masked Token Prediction Models**: including MUSE :cite:`mm-models-chang2023muse`, employ masked token prediction-based architectures. + These models learn to map text and image inputs into a shared embedding space. + They use a masked token prediction task during pretraining, allowing them to understand the + relationships between text and images. Given a text prompt, they can retrieve or generate images + that align with the content and context of the text description. + + +Each of these approaches has its strengths and weaknesses, making them suitable for different use cases and scenarios. +Diffusion-based models excel in generating diverse and high-quality images, autoregressive models offer fine-grained control, +and masked token prediction-based models are strong at understanding and aligning text and images. +The choice of model depends on the specific requirements of the text-to-image generation task at hand. 
+ + +Approaches to Customize/Extend Text2Img Models +---------------------------------------------- + +Customizing and extending Text2Img models can be essential to tailor these foundation models to +specific applications or creative tasks. Some popular approaches to customize and extend text2img models include: + + +#. **Text-Based Image Editing**: such as instructPix2Pix :cite:`mm-models-insp2p`, involves manipulating or modifying generated images based on + textual descriptions. To customize text2img models for this purpose, one can employ post-processing techniques to + alter the generated images. + +#. **Injecting New Concepts**: including DreamBooth :cite:`mm-models-ruiz2023dreambooth`, can introduce new concepts into text2img models. This is typically done by + adapting foundation models with additional data for finetuning. + +#. **Adding Conditionings to Guide Image Generation**: like ControlNet :cite:`mm-models-zhang2023adding`, allows for greater control and specificity in the generated images. + These conditionings can be based on various factors including specific attributes mentioned in the text (such as colors, sizes, or object properties), + spatial information, style and mood. + +Customizing and extending Text2Img models based on these approaches empowers users to have more control over the generated content, +make images more contextually relevant, and adapt the models to a wide array of creative and practical tasks, +from art creation to content personalization. + +.. note:: + NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . + + +For more information, see additional sections in the MM Text2Img docs on the left-hand-side menu or in the list below: + +.. toctree:: + :maxdepth: 1 + + datasets + configs + checkpoint + sd + imagen + dreambooth + controlnet + +References +---------- + +.. bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/text2img/sd.rst b/docs/source/multimodal/text2img/sd.rst new file mode 100644 index 000000000000..23865058ab9b --- /dev/null +++ b/docs/source/multimodal/text2img/sd.rst @@ -0,0 +1,170 @@ +Stable Diffusion +======== + +This section gives a brief overview of the stable diffusion model in NeMo framework. + +Model Introduction +-------------------- + +Stable Diffusion stands out as an advanced text-to-image diffusion model, trained using a massive dataset of image,text pairs. Its core capability is to refine and enhance images by eliminating noise, resulting in clear output visuals. When presented with an image named z0, the model systematically injects noise. With each iteration marked by "t", the image, now termed zt, becomes increasingly distorted. As the value of "t" climbs, the image edges closer to resembling complete noise. Moreover, when provided with specific details like a text prompt or the time step "t", the model can accurately determine the extent of noise introduced to zt. + +Stable diffusion has three main components: A U-Net, an image encoder(Variational Autoencoder, VAE) and a text-encoder(CLIP). + + +- U-Net: The Unet processes the noisy latents (x) to predict the noise, utilizing a conditional model which also incorporates the timestep (t) and text embedding for guidance. 
+
+- VAE: The VAE model, equipped with both an encoder and a decoder, compresses images during latent diffusion training. In a standard Stable Diffusion training stage, for instance, an input image is compressed from 512x512x3 to 64x64x4. This compression results in lower memory and compute requirements compared to pixel-space diffusion models. During inference, the decoder reverses this process, transforming denoised latent representations back into images.
+
+- Text-encoder: The text-encoder, typically a simple transformer like CLIP, converts input prompts into embeddings, which guide the U-Net's denoising process. These embeddings help train the U-Net to handle noisy latents effectively.
+
+.. _sd-config-section:
+
+Model Configuration
+--------------------
+
+
+In this section, we explain how to configure the size and initialization of the VAE, U-Net, and text encoder components of the Stable Diffusion model.
+
+Variational Auto Encoder
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The VAE configuration is defined under **first_stage_config**.
+
+.. code-block:: yaml
+
+    first_stage_config:
+      _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL
+      from_pretrained: /path/to/vae.bin
+      embed_dim: 4
+      monitor: val/rec_loss
+      ddconfig:
+        double_z: true
+        z_channels: 4
+        resolution: 256 # Never used
+        in_channels: 3
+        out_ch: 3
+        ch: 128
+        ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+        num_res_blocks: 2
+        attn_resolutions: []
+        dropout: 0.0
+      lossconfig:
+        target: torch.nn.Identity
+
+The VAE weights are fixed during training, so it is essential to pass a pretrained checkpoint to ``first_stage_config.from_pretrained`` for initialization. The VAE architecture is shared across the Stable Diffusion v1 and v2 series. The downsampling factor of the VAE is ``2**(len(ch_mult) - 1)``, which is 8 in this case, so the latent shape will be ``(H//8, W//8, 4)``.
+
+
+U-Net
+^^^^^^
+
+The U-Net configuration is defined under **unet_config**.
+
+.. code-block:: yaml
+
+    unet_config:
+      _target_: nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel
+      from_pretrained: /path/to/pretrain.ckpt
+      from_NeMo: True # Must be specified when from_pretrained is not None; False means loading the U-Net from a HF checkpoint
+      image_size: 32 # unused
+      in_channels: 4
+      out_channels: 4
+      model_channels: 320
+      attention_resolutions:
+        - 4
+        - 2
+        - 1
+      num_res_blocks: 2
+      channel_mult:
+        - 1
+        - 2
+        - 4
+        - 4
+      num_head_channels: 64
+      use_spatial_transformer: true
+      use_linear_in_transformer: true
+      transformer_depth: 1
+      context_dim: 1024
+      use_checkpoint: False
+      legacy: False
+      use_flash_attention: True
+
+- If ``from_pretrained`` is not specified, the U-Net initializes with random weights. To fine-tune, you can provide a pretrained U-Net checkpoint, either from an intermediate NeMo checkpoint (set ``from_NeMo=True``) or from other platforms like Huggingface (set ``from_NeMo=False``).
+
+- U-Net size
+
+  + ``num_res_blocks``: Defines the number of ResBlocks at every level.
+  + ``model_channels`` and ``channel_mult``: Set the tensor dimensions at each level.
+
+- Attention blocks
+
+  + ``attention_resolutions``: Inserts attention blocks after the ResBlocks at the specified downsampling rates.
+  + ``use_spatial_transformer``: Specifies the type of attention block employed.
+  + ``use_linear_in_transformer``: Chooses between a linear layer and a convolution layer for the in/out projections.
+ + ``transformer_depth``: Dictates the count of ``basic_transformer_block`` in each ``spatial_transformer_block``. + +- ``context_dim``: Must be adjusted to match the text encoder's output dimension. + +Text Encoder +^^^^^^^^^^^^ +The text encoder configuration is defined under **cond_stage_config**. + +To use the NeMo implementation of the CLIP model in stable diffusion, one can use the following cond_stage_config: + +.. code-block:: yaml + + cond_stage_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenMegatronCLIPEmbedder + restore_from_path: /path/to/nemo_clip.nemo + device: cuda + freeze: True + layer: "penultimate" + +- ``restore_from_path``: Must be provided to use NeMo CLIP models, all CLIP config-related information is already embeded in ``.nemo`` checkpoint file. + +- ``layer``: Specifies which layer's output will be used as text encoder output. + +Alternatively, one can also use the Huggingface implementation of the CLIP model using the config below + +.. code-block:: yaml + + cond_stage_config: + _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder + arch: ViT-H-14 + version: laion2b_s32b_b79k + device: cuda + max_length: 77 + freeze: True + layer: "penultimate" + +- ``arch`` and ``version``: Determines which CLIP model to load. + + +Optimization related configurations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Feature | Description | To Enable | ++==========================+===========================================================================================================+=================================================+ +| Data parallelism | Dataset read concurrently | default when training on multi GPUs/nodes | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Activation Checkpointing | Reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass | ``model.unet_config.use_checkpoint=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Bfloat16 Training | Training in Bfloat16 precision | ``trainer.precision=bf16`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Flash Attention | Fast and Memory-Efficient Exact Attention with IO-Awareness | ``model.unet_config.use_flash_attention=True`` | ++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Channels Last | Ordering NCHW tensors in memory preserving dimensions ordering. 
| ``model.channels_last=True`` |
++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+
+| Inductor | TorchInductor compiler | ``model.inductor=True`` |
++--------------------------+-----------------------------------------------------------------------------------------------------------+-------------------------------------------------+
+
+Training with precached latents
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Since the VAE and text encoder remain frozen during training, you can pre-compute the image and caption latents offline, improving training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. To train with such a dataset, configure the ``model.data`` section accordingly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.
+
+Reference
+-----------
+
diff --git a/docs/source/multimodal/vlm/checkpoint.rst b/docs/source/multimodal/vlm/checkpoint.rst
new file mode 100644
index 000000000000..fcd4bed975db
--- /dev/null
+++ b/docs/source/multimodal/vlm/checkpoint.rst
@@ -0,0 +1,92 @@
+Checkpoints
+===========
+
+In this section, we present four key functionalities of NVIDIA NeMo related to checkpoint management:
+
+1. **Checkpoint Loading**: Use the :code:`restore_from()` method to load local ``.nemo`` checkpoint files.
+2. **Partial Checkpoint Conversion**: Convert partially-trained ``.ckpt`` checkpoints to the ``.nemo`` format.
+3. **Community Checkpoint Conversion**: Convert checkpoints from community sources, such as HuggingFace, into the ``.nemo`` format.
+4. **Model Parallelism Adjustment**: Modify model parallelism to efficiently train models that exceed the memory of a single GPU. NeMo employs both tensor (intra-layer) and pipeline (inter-layer) model parallelism. Dive deeper with `"Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM" `_. This tool aids in adjusting model parallelism, accommodating users who need to deploy on larger GPU arrays due to memory constraints.
+
+Understanding Checkpoint Formats
+--------------------------------
+
+A ``.nemo`` checkpoint is fundamentally a tar file that bundles the model configuration (given as a YAML file), model weights, and other pertinent artifacts like tokenizer models or vocabulary files. This consolidated design streamlines sharing, loading, tuning, evaluating, and inference.
+
+In contrast, the ``.ckpt`` file, created during PyTorch Lightning training, contains both the model weights and the optimizer states, and is usually used to resume training.
+
+The subsequent sections provide instructions for the functionalities above, specifically tailored for deploying fully trained checkpoints for assessment or additional fine-tuning.
+
+Loading Local Checkpoints
+-------------------------
+
+By default, NeMo saves checkpoints of trained models in the ``.nemo`` format. To save a model manually during training, use:
+
+.. code-block:: python
+
+   model.save_to("<checkpoint_path>.nemo")
+
+To load a local ``.nemo`` checkpoint:
+
+.. code-block:: python
+
+   import nemo.collections.multimodal as nemo_multimodal
+   model = nemo_multimodal.models.<model_base_class>.restore_from(restore_path="<path/to/checkpoint.nemo>")
+
+Replace ``<model_base_class>`` with the appropriate MM model class.
+
+Converting Local Checkpoints
+----------------------------
+
+Only the last checkpoint is automatically saved in the ``.nemo`` format.
If intermediate training checkpoints evaluation is required, a ``.nemo`` conversion might be necessary. For this, refer to the script at `script `_: + +.. code-block:: python + + python -m torch.distributed.launch --nproc_per_node= * \ + examples/multimodal/convert_ckpt_to_nemo.py \ + --checkpoint_folder \ + --checkpoint_name \ + --nemo_file_path \ + --tensor_model_parallel_size \ + --pipeline_model_parallel_size + +Converting Community Checkpoints +-------------------------------- + +CLIP Checkpoints +^^^^^^^^^^^^^^^^ + +To migrate community checkpoints: + +.. code-block:: python + + python examples/multimodal/foundation/clip/convert_external_clip_to_nemo.py \ + --arch=ViT-H-14 \ + --version=laion2b_s32b_b79k \ + --hparams_file=path/to/saved.yaml \ + --nemo_file_path=open_clip.nemo + +Ensure the NeMo hparams file has the correct model architectural parameters, placed at `path/to/saved.yaml`. An example can be found in `examples/multimodal/foundation/clip/conf/megatron_clip_config.yaml`. + +For OpenCLIP migrations, provide the architecture (`arch`) and version (`version`) according to the OpenCLIP `model list `_. For Hugging Face conversions, set the version to `huggingface` and the architecture (`arch`) to the specific Hugging Face model identifier, e.g., `yuvalkirstain/PickScore_v1`. + +Model Parallelism Adjustment +--------------------------- + +CLIP Checkpoints +^^^^^^^^^^^^^^^^ + +To adjust model parallelism from original model parallelism size to a new model parallelism size (Note: NeMo CLIP currently only supports `pipeline_model_parallel_size=1`): + +.. code-block:: python + + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file=/path/to/source.nemo \ + --target_file=/path/to/target.nemo \ + --tensor_model_parallel_size=??? \ + --target_tensor_model_parallel_size=??? \ + --pipeline_model_parallel_size=-1 \ + --target_pipeline_model_parallel_size=1 \ + --precision=32 \ + --model_class="nemo.collections.multimodal.models.clip.megatron_clip_models.MegatronCLIPModel" \ + --tp_conversion_only diff --git a/docs/source/multimodal/vlm/clip.rst b/docs/source/multimodal/vlm/clip.rst new file mode 100644 index 000000000000..8b8ed59c8084 --- /dev/null +++ b/docs/source/multimodal/vlm/clip.rst @@ -0,0 +1,156 @@ +CLIP +==== + +Model Introduction +------------------- + +Contrastive Language-Image Pre-training (CLIP) :cite:`mm-models-radford2021learning` offers an efficient method for learning image representations using natural language supervision. The essence of CLIP is to train both an image encoder and a text encoder from scratch. The model aims to predict the correct pairings of a batch of (image, text) training examples by jointly training these encoders. During pre-training, CLIP is designed to predict which images and texts form a semantically coherent pair by maximizing the similarity between the correct (image, text) pairs while minimizing the similarity between incorrect pairs. This contrastive learning approach ensures that CLIP learns meaningful and contextually rich representations of both visual and textual data. + +NeMo's implementation of the CLIP model leverages its parallel transformer implementation, specifically the `nemo.collections.nlp.modules.common.megatron.transformer.ParallelTransformer`, to enable model parallelism support in both the text encoder and vision model. This design choice ensures efficient scaling and utilization of resources during training. 
Additionally, some of the model design and loss implementations in NeMo's CLIP are inspired by the open-source [open_clip](https://github.com/mlfoundations/open_clip) repository. + + .. image:: images/clip_arch.png + :align: center + :alt: CLIP model + :scale: 30% + +CLIP models in NeMo can be instantiated using the :class:`~nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models.MegatronCLIPModel` class. + +Text Encoder +^^^^^^^^^^^^^^^ + +CLIP uses a transformer-based text encoder to encode text features. The text input is tokenized and embedded. Positional embeddings are added to these token embeddings, and this combined representation is then passed through several transformer layers. The output from the last transformer layer corresponding to the first token is used as the text representation. In NeMo, the CLIP text encoder can be instantiated using the :class:`~nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models.CLIPTextTransformer` class. + +Vision Model +^^^^^^^^^^ + +CLIP's vision model is based on the Vision Transformer (ViT) architecture. The image is first divided into fixed-size patches (e.g., 16x16 pixels). These patches are linearly embedded into a flat vector, which is then used as input to the transformer. The output of the transformer is then pooled to produce a single image representation. In NeMo, the CLIP vision model can be instantiated using the :class:`~nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models.CLIPVisionTransformer` class. + + ++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ +| Model | Image size | Image Model size (M) | Hidden size (FFN size) | Attention heads | Number of layers| Patch dim| Model size (M) | Hidden size| Attention heads | Number of layers| Output dim | +| | | (Vision) | (Vision) | (Vision) | (Vision) | (Vision) | (Text) | (Text) | (Text) | (Text) | | ++=======+============+======================+================+=================+=================+==========+================+============+=================+=================+============+ +| B/32 | 224 | 87.85 | 768 | 12 | 12 | 16 | 63.43 | 512 | 8 | 12 | 512 | ++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ +| B/16 | 224 | 86.19 | 768 | 12 | 12 | 32 | 91.16 | 512 | 8 | 12 | 512 | ++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ +| L/14 | 224 | 303.97 | 1024 | 16 | 24 | 14 | 123.65 | 768 | 12 | 12 | 768 | ++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ +| H/14 | 224 | 638.08 | 1280 | 20 | 32 | 14 | 354.03 | 1024 | 16 | 24 | 1024 | ++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ +| g/14 | 224 | 1012.65 | 1408 (6144) | 22 | 40 | 14 | 354.03 | 1024 | 16 | 24 | 1024 | 
++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ +| G/14 | 224 | 1840 | 1664 (8192) | 16 | 48 | 14 | 590 | 1280 | 20 | 32 | 1280 | ++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ +| e/14 | 224 | 2200 | 1792 (15360) | 28 | 56 | 14 | 660 | 1280 | 20 | 36 | 1280 | ++-------+------------+----------------------+----------------+-----------------+-----------------+----------+----------------+------------+-----------------+-----------------+------------+ + + +Model Configuration +------------------ + +General Configuration +^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + model: + output_dim: 512 + local_loss: False + gather_with_grad: True + +- ``output_dim``: Represents the dimensionality of the output embeddings for both the text and vision models. +- ``local_loss``: If set to `True`, the loss is calculated with local features at a global level, avoiding the need to realize the full global matrix. This can be beneficial for memory efficiency, especially when training on multiple devices. +- ``gather_with_grad``: Enables full distributed gradient for feature gathering. Disabling this (setting to `False`) may cause convergence issues. + +Vision Model Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + vision: + patch_dim: 16 + img_h: 224 + img_w: 224 + image_mean: null + image_std: null + num_channels: 3 + drop_patch_rate: 0.0 + drop_path_rate: 0.0 + global_average_pool: False + + output_dim: ${model.output_dim} + class_token_length: 8 + encoder_seq_length: 196 + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 3072 + num_attention_heads: 12 + hidden_dropout: 0. + attention_dropout: 0. + +- ``patch_dim``: Size of the patches the image is divided into. +- ``img_h`` and ``img_w``: Height and width of the input images. +- ``image_mean`` and ``image_std``: Mean and standard deviation values for image normalization. +- ``num_channels``: Number of channels in the input image (e.g., 3 for RGB images). +- ``drop_patch_rate`` and ``drop_path_rate``: Dropout rates for patches and paths respectively. +- ``global_average_pool``: If set to `True`, applies global average pooling to the output. +- ``class_token_length``: Length of the extra classification tokens. +- ``encoder_seq_length``: Sequence length for the vision encoder. +- ``num_layers``, ``hidden_size``, ``ffn_hidden_size``, ``num_attention_heads``: Parameters defining the architecture of the vision transformer. The ``ffn_hidden_size`` is typically 4 times the ``hidden_size``. +- ``hidden_dropout`` and ``attention_dropout``: Dropout probabilities for the hidden state and attention in the transformer respectively. + +Text Model Configuration +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + + text: + output_dim: ${model.output_dim} + encoder_seq_length: 77 + num_layers: 12 + hidden_size: 512 + ffn_hidden_size: 2048 + num_attention_heads: 8 + hidden_dropout: 0. + attention_dropout: 0. + +- ``output_dim``: Dimensionality of the output embeddings for the text model. +- ``encoder_seq_length``: Sequence length for the text encoder. +- ``num_layers``, ``hidden_size``, ``ffn_hidden_size``, ``num_attention_heads``: Parameters defining the architecture of the text transformer. The ``ffn_hidden_size`` is typically 4 times the ``hidden_size``. 
+- ``hidden_dropout`` and ``attention_dropout``: Dropout probabilities for the hidden state and attention in the transformer respectively. + +Optimizations +^^^^^^^^^^^^^^ + ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Feature | Description | To Enable | ++==========================+=========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================+==================================================================================================================================================================================================================+ +| Data parallelism | Dataset is read concurrently across multiple GPUs or nodes, allowing for faster data loading and processing. | Automatically when training on multi GPUs/nodes | ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Tensor parallelism | Each tensor is split up into multiple chunks, allowing for horizontal parallelism across GPUs. This technique, known as TensorParallel (TP), distributes the model's tensors across multiple GPUs. During processing, each shard gets processed separately and in parallel on different GPUs, and the results are synced at the end of the step. This approach is inspired by NVIDIA's Megatron implementation. 
[Reference](https://github.com/NVIDIA/Megatron-LM#distributed-pretraining) | ``model.tensor_model_parallel_size={parallel_size}`` | ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Activation Checkpointing | To reduce memory usage, activations of certain layers are cleared and recomputed during a backward pass. This technique is particularly useful for training large models that wouldn't fit in GPU memory using traditional methods. | ``model.vision.activations_checkpoint_granularity=full``, ``model.vision.activations_checkpoint_method=block``, ``model.vision.activations_checkpoint_num_layers={num_layers_to_check}`` (Same for ``model.llm``)| ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Bfloat16 Training | Training is conducted in Bfloat16 precision, which offers a balance between the higher precision of FP32 and the memory savings and speed of FP16. | ``trainer.precision=bf16`` | ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| BF16 O2 | Enables O2-level automatic mixed precision, optimizing Bfloat16 precision for better performance. 
| ``model.megatron_amp_O2=True`` | ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Distributed Optimizer | The optimization process is distributed across multiple GPUs, reducing memory requirements. This technique distributes the optimizer state across data parallel ranks, rather than replicating it, offering significant memory savings. This approach is inspired by the ZeRO optimization described in the paper "ZeRO: Memory Optimizations Toward Training Trillion Parameter Models" and implemented in NVIDIA's Megatron. [Reference](https://github.com/NVIDIA/Megatron-LM#distributed-optimizer) | ``model.optim.name="distributed_fused_adam"`` | ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Flash Attention V2 | FlashAttention is a fast and memory-efficient algorithm to compute exact attention. It speeds up model training and reduces memory requirement by being IO-aware. This approach is particularly useful for large-scale models and is detailed further in the repository linked. [Reference](https://github.com/Dao-AILab/flash-attention) | ``model.vision.use_flash_attention=True``, ``model.llm.use_flash_attention=True`` | ++--------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + + +Model Training +------------------- +Refer to https://laion.ai/blog/large-openclip/#results for community training recipe. + +References +---------- + +.. 
bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/multimodal/vlm/configs.rst b/docs/source/multimodal/vlm/configs.rst new file mode 100644 index 000000000000..160ba05cd6d0 --- /dev/null +++ b/docs/source/multimodal/vlm/configs.rst @@ -0,0 +1,160 @@ +Common Configuration Files +========================== + +This section provides a detailed overview of the NeMo configuration file setup specific to models within the NeMo Multimodal Language Model collection. For foundational knowledge about setting up and executing experiments common to all NeMo models, such as the Experiment Manager and PyTorch Lightning trainer parameters, refer to the :doc:`../core/core` section. + +Within the configuration files of the NeMo Multimodal Language Model, details concerning dataset(s), augmentation, optimization parameters, and model architectural specifications are central. This page explores each of these aspects. + +Discover exemplary configuration files for all NeMo Multimodal Language Model scripts in the `config directory of the examples `_. + +Dataset Configuration +===================== + +The configuration file delineates parameters for training, validation, and testing datasets under the ``train``, ``validation``, and ``test`` sections, respectively. Depending on the specific task, the configuration might include parameters related to dataset augmentations, image resolution filtering, among others. + +All initialization parameters supported by the Dataset class utilized in the experiment can be defined in the config file. For a comprehensive list of Datasets and their associated parameters, consult the `Datasets <../api.html#Datasets>`__ section of the API. + +A representative training configuration appears as: + +.. code-block:: yaml + + data: + num_workers: 16 # Number of workers for the dataloader + train: + dataset_path: # Paths to wdinfo files specifying training datasets + - dataset1.pkl + - dataset2.pkl + validation: # Paths to validation dataset files (pkl or tar) + dataset_path: + - dataset3.pkl + webdataset: + use_webdataset: True # Enable the use of webdataset + infinite_sampler: False # Set to False for correct training resumption and accurate epoch counting + local_root_path: ??? # Specify the directory where the dataset is located + verbose: False # Enable verbose debugging information if set to True + +As outlined in `Datasets <./datasets.html>`_, the utilization of the webdataset format is imperative for the vision-language contrastive training pipeline. Due to the extensive data requirements, it's beneficial to follow the `Webdataset Multinode Training Guideline `_ for recommended practices. + + +.. code-block:: yaml + + data: + train: + augmentations: + resize_smallest_side: 256 # Adjust the image's smallest side to the specified dimension + center_crop_h_w: 256, 256 # Perform center cropping + horizontal_flip: False # Apply horizontal flip + filterings: + resolution: + method: larger + value: 256 + +This configuration uses the same data class as the text2image setup. Enabling the ``train.filterings`` and ``train.augmentations`` sections offers flexibility to filter specific images (and their text pairs) for particular requirements without the need to recreate the entire webdataset. The provided example demonstrates filtering to retain only images with resolutions exceeding 256x256 for training. 
To concatenate multiple webdatasets, simply list all corresponding wdinfo files under ``train.dataset_path``. + + + + + +Trainer Configuration +--------------------- + +This section outlines arguments for the Pytorch Lightning Trainer Object. + +.. code-block:: yaml + + trainer: + devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] + num_nodes: 1 + max_epochs: -1 + max_steps: 2500000 # precedence over max_epochs + logger: False # Provided by exp_manager + precision: bf16 # Should be set to 16 for O1 and O2 to enable the AMP. + accelerator: gpu + log_every_n_steps: 5 # Interval of logging. + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 10 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + enable_checkpointing: False # Provided by exp_manager + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: True + +For a detailed list of arguments, refer to the `Pytorch Lightning Trainer `__ API section. + +Experiment Manager Configurations +--------------------------------- + +The NeMo Experiment Manager provides a streamlined approach to manage various tasks such as logging, saving, and resuming. + +.. code-block:: yaml + + exp_manager: + exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: # Whether you want exp_manger to create a Wandb logger + name: training-session + project: text2img + group: nemo + resume: True + create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger + create_checkpoint_callback: True # Whether you want exp_manager to create a modelcheckpoint callback + checkpoint_callback_params: + monitor: reduced_train_loss + save_top_k: 5 + every_n_epochs: 0 # Save checkpoint frequency. + every_n_train_steps: 1000 # Mutually exclusive with every_n_epochs. It is recommended to set this if training on large-scale dataset. + filename: '${name}--{reduced_train_loss:.2f}-{step}-{consumed_samples}' + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + ema: + enable: True + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +Optimizer Configurations +------------------------- + +.. code-block:: yaml + + optim: + name: fused_adam + lr: 0.0001 + eps: 1e-8 + betas: [ 0.9, 0.999 ] + weight_decay: 0.01 + sched: + name: WarmupPolicy + warmup_steps: 10000 + warmup_ratio: null + +The default optimizer used is ``fused_adam``. For details on all supported optimizers, refer to the NeMo user guide. The learning rate scheduler can be specified in the ``optim.sched`` section. + +Model Configurations +-------------------- + +Each configuration file should detail the model architecture used for the experiment. 
+ +The parameters commonly shared across most multimodal language models include: + ++--------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| **Parameter** | **Datatype** | **Description** | ++======================================+==============+=======================================================================================+ +| :code:`micro_batch_size` | int | Micro batch size that fits on each GPU | ++--------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`global_batch_size` | int | Global batch size considering gradient accumulation and data parallelism | ++--------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`tensor_model_parallel_size` | int | Intra-layer model parallelism | ++--------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`pipeline_model_parallel_size` | int | Inter-layer model parallelism | ++--------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`seed` | int | Seed used in training | ++--------------------------------------+--------------+---------------------------------------------------------------------------------------+ + +CLIP +~~~~~~~~ + +For model-specific configurations, refer to `clip <./clip.html#clip>`_. diff --git a/docs/source/multimodal/vlm/datasets.rst b/docs/source/multimodal/vlm/datasets.rst new file mode 100644 index 000000000000..678e618b9221 --- /dev/null +++ b/docs/source/multimodal/vlm/datasets.rst @@ -0,0 +1,35 @@ +Datasets +======== + +Data pipeline overview +----------------- + +.. note:: It is the responsibility of each user to check the content of the dataset, review the applicable licenses, and determine if it is suitable for their intended use. Users should review any applicable links associated with the dataset before placing the data on their machine. + +For all vision-language pretraining models, we provide a generic pipeline as detailed below to download and prepare the dataset. +The pipeline is suitable for any multimodal datasets hosted on the HuggingFace data repository +where the data is stored as one or more parquet files. The pipeline processes the dataset into the +WebDataset format, consisting of tar files of equal sizes for efficient training. + +The 6 sub-stages are as follows. + + #. download_parquet: Parquet files consisting of text (captions) and image URLs are downloaded from a HuggingFace repository. + + #. download_images: The images are downloaded from their respective URLs and, along with the captions, are packed into tar files following the Webdataset format. + + #. reorganize_tar: (Optional) Due to a variety of reasons (such as unstable network or removal of images), some images may fail to download, resulting in uneven tar files with varying number of examples each. If you are using a training sampler that does not support uneven tar files, you need to re-organize the contents of the tar files so that each one contains an equal number of image-text pairs. + + #. precache_encodings: (Optional) If you are training a model with frozen encoders (e.g. 
Stable Diffusion), you have the option to precache (precompute) image and/or text encodings (embeddings) in this sub-stage. Precaching these encodings can significantly enhance training throughput. + + #. generate_wdinfo: (Optional) The wdinfo.pkl file, which stores information on dataset shards, is generated. + + #. merge_source_tar: (Optional) After precaching, this sub-stage can copy and append any additional objects (such as original image or metadata files) from the source tar files to the result tar files. + +Depending on your specific circumstance, not all sub-stages need to be run all at once. +For example, for parquet datasets not hosted on HuggingFace or those whose format is not parquet, +sub-stages 2-6 can be used to process locally downloaded datasets. +For webdatasets already downloaded locally, sub-stages 4-6 can be used to precache the encoding to reduce training time. +For models that encode image and text on-the-fly, only sub-stages 1-3 need to be run. + +Instruction for configuring each sub-stage is provided as a comment next to each field in +`download_multimodal.yaml `_ diff --git a/docs/source/multimodal/vlm/images/clip_arch.png b/docs/source/multimodal/vlm/images/clip_arch.png new file mode 100644 index 000000000000..a1b5ec9171fd Binary files /dev/null and b/docs/source/multimodal/vlm/images/clip_arch.png differ diff --git a/docs/source/multimodal/vlm/intro.rst b/docs/source/multimodal/vlm/intro.rst new file mode 100644 index 000000000000..949fb8a11196 --- /dev/null +++ b/docs/source/multimodal/vlm/intro.rst @@ -0,0 +1,82 @@ +Vision-Language Foundation +========================== + +Humans naturally process information using multiple senses like sight and sound. Similarly, multi-modal learning aims to create models that handle different types of data, such as images, text, and audio. There's a growing trend in models that combine vision and language, like OpenAI's CLIP. These models excel in tasks like aligning image and text features, image captioning and visual question-answering. Their ability to generalize without specific training offers many practical uses. + +Supported Models +----------------- +NeMo Multimodal currently supports the following models: + ++-----------------------------------+----------+-------------+------+-------------------------+------------------+ +| Model | Training | Fine-Tuning | PEFT | Evaluation | Inference | ++===================================+==========+=============+======+=========================+==================+ +| `CLIP <./clip.html>`_ | ✓ | - | - | zero-shot imagenet | similarity score | ++-----------------------------------+----------+-------------+------+-------------------------+------------------+ + +Spotlight Models +----------------- + +Vision-Language models are at the forefront of multimodal learning, showcasing impressive abilities in tasks that require a combination of visual and textual comprehension. Let's take a quick look at some key models driving progress in this field: + +#. **Contrastive Learning Based Models**: At the forefront is CLIP :cite:`mm-models-radford2021clip`, which harnesses contrastive learning to jointly fine-tune a text and image encoder, facilitating a gamut of downstream tasks. CLIP's success has spurred further research, leading to models like ALIGN :cite:`mm-models-saharia2022photorealistic` and DeCLIP :cite:`mm-models-li2021declip`. + +#. 
**Holistic Foundation Models**: FLAVA :cite:`mm-models-singh2022flava` aspires to craft a universal model adept at vision, language, and multimodal tasks. Through a unified architecture, it vies to excel across a spectrum of tasks, embodying the essence of a true foundation model. + +#. **Bootstrapping Techniques**: BLIP :cite:`mm-models-blip2` employs a pioneering framework that shines in both understanding-based and generation-based vision-language tasks. By bootstrapping captions from noisy web data, it exhibits remarkable generalization across a plethora of vision-language challenges. + +Anatomy of Vision-Language Models +---------------------------------- + +At their core, vision-language models fundamentally consist of three main parts: + +1. **Image Encoder:** Extracts features from images. +2. **Text Encoder:** Extracts features from textual data. +3. **Fusion Strategy:** Merges the information gleaned from both encoders. + +These models have undergone a significant transformation. Earlier models used manually designed image descriptors and pre-trained word vectors. Nowadays, models primarily utilize transformer architectures for both image and text encoding, learning features together or separately. The pre-training objectives of these models are carefully designed to suit a wide range of tasks. + +Contrastive Learning: Bridging Vision and Language +--------------------------------------------------- + +Contrastive learning has burgeoned as a pivotal pre-training objective, especially for vision-language models. Models like CLIP, CLOOB, ALIGN, and DeCLIP have harnessed contrastive learning to bridge the chasm between vision and language. They accomplish this by jointly learning a text encoder and an image encoder using a contrastive loss, typically on extensive datasets encompassing {image, caption} pairs. + +The quintessence of contrastive learning is to map images and texts to a shared feature realm. Here, the distance between the embeddings of congruent image-text pairs is minimized, while it's maximized for incongruent pairs. For instance, CLIP employs the cosine distance between text and image embeddings, while models like ALIGN and DeCLIP have crafted their own distance metrics to cater to the intricacies of their datasets. + +CLIP and Beyond +--------------- + +The CLIP (Contrastive Language-Image Pre-training) model has notably served as a linchpin for various models and applications within the realms of deep learning and computer vision, and also within the NeMo toolkit. Below is an elucidation on how the CLIP model extends its influence into other models and domains: + +1. **Use Cases in Vision Tasks:** + * **Classification:** CLIP can be harnessed for classification tasks, accepting arbitrary text labels for zero-shot classification on video frames or images. + * **Semantic Image Search:** Constructing a semantic image search engine with CLIP showcases its capability to generate embeddings for semantic content analysis and similarity search. + +2. **Image Similarity and Clustering:** + * In a practical scenario, CLIP's embeddings were leveraged for an image similarity search engine, showcasing its effectiveness in generating useful representations for visual similarity scenarios, even without being specifically trained for such tasks. + +3. **Foundation for Multimodal Language Models:** + * Large language models with visual capabilities, such as LLaVA, Flamingo, Kosmos-1, and Kosmos-2, have leaned on CLIP's architecture. 
In these models, images are encoded using a visual encoder derived from CLIP. + +4. **Foundation Diffusion Models:** + * Models like Stable Diffusion and Imagen have tapped into the prowess of the text encoder from CLIP to condition their processes based on text prompts. This integration exemplifies the adaptability and influence of the CLIP encoder in the broader AI landscape, especially in the domain of diffusion models. + +.. note:: + NeMo Megatron has an Enterprise edition which proffers tools for data preprocessing, hyperparameter tuning, containers, scripts for various clouds, and more. With the Enterprise edition, you also garner deployment tools. Apply for `early access here `_ . + + +.. toctree:: + :maxdepth: 1 + + datasets + configs + checkpoint + clip + +References +---------- + +.. bibliography:: ../mm_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: MM-MODELS + :keyprefix: mm-models- \ No newline at end of file diff --git a/docs/source/vision/checkpoint.rst b/docs/source/vision/checkpoint.rst new file mode 100644 index 000000000000..3d6f1365a76e --- /dev/null +++ b/docs/source/vision/checkpoint.rst @@ -0,0 +1,77 @@ +Checkpoints +=========== + +In this section, we present four key functionalities of NVIDIA NeMo related to checkpoint management: + +1. **Checkpoint Loading**: Use the :code:`restore_from()` method to load local ``.nemo`` checkpoint files. +2. **Partial Checkpoint Conversion**: Convert partially-trained ``.ckpt`` checkpoints to the ``.nemo`` format. +3. **Community Checkpoint Conversion**: Convert checkpoints from community sources, like HuggingFace, into the ``.nemo`` format. +4. **Model Parallelism Adjustment**: Adjusting model parallelism is crucial when training models that surpass the memory capacity of a single GPU, such as the NVGPT 5B version, LLaMA2 7B version, or larger models. NeMo incorporates both tensor (intra-layer) and pipeline (inter-layer) model parallelisms. For a deeper understanding, refer to "Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM" (`link `_). This tool assists in modifying model parallelism. After downloading and converting a community checkpoint to the ``.nemo`` format, if a user wishes to fine-tune it further, this adjustment might become essential. + +Understanding Checkpoint Formats +-------------------------------- + +A ``.nemo`` checkpoint is fundamentally a tar file that bundles the model configurations (given as a YAML file), model weights, and other pertinent artifacts like tokenizer models or vocabulary files. This consolidated design streamlines sharing, loading, tuning, evaluating, and inference. + +On the other hand, the ``.ckpt`` file, created during PyTorch Lightning training, contains only the model weights and the optimizer states, which is used to pick up training from a pause. + +The subsequent sections elucidate instructions for the functionalities above, specifically tailored for deploying fully trained checkpoints for assessment or additional fine-tuning. + +Loading Local Checkpoints +------------------------- + +By default, NeMo saves checkpoints of trained models in the ``.nemo`` format. To save a model manually during training, use: + +.. code-block:: python + + model.save_to(.nemo) + +To load a local ``.nemo`` checkpoint: + +.. code-block:: python + + import nemo.collections.multimodal as nemo_multimodal + model = nemo_multimodal.models..restore_from(restore_path="") + +Replace `` with the appropriate MM model class. 
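For instance, a ViT classification checkpoint saved in the ``.nemo`` format could be restored roughly as follows. This is only a sketch: the checkpoint path is a placeholder, and the exact trainer settings depend on how the model was trained and how it will be used.

.. code-block:: python

    from pytorch_lightning import Trainer

    from nemo.collections.vision.models.megatron_vit_classification_models import (
        MegatronVitClassificationModel,
    )

    # Placeholder path to a trained ViT classification checkpoint in .nemo format
    checkpoint_path = "/path/to/megatron_vit_classification.nemo"

    # Megatron-based models are typically restored with a trainer attached
    trainer = Trainer(devices=1, accelerator="gpu")
    model = MegatronVitClassificationModel.restore_from(
        restore_path=checkpoint_path, trainer=trainer
    )
    model.eval()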
+ +Converting Local Checkpoints +---------------------------- + +Only the last checkpoint is automatically saved in the ``.nemo`` format. If intermediate training checkpoints evaluation is required, a ``.nemo`` conversion might be necessary. For this, refer to the script at ``: + +.. code-block:: python + + python -m torch.distributed.launch --nproc_per_node= * \ + examples/multimodal/convert_ckpt_to_nemo.py \ + --checkpoint_folder \ + --checkpoint_name \ + --nemo_file_path \ + --tensor_model_parallel_size \ + --pipeline_model_parallel_size + +Converting Community Checkpoints +-------------------------------- + +There is no support for converting community checkpoints to NeMo ViT. + +Model Parallelism Adjustment +--------------------------- + +ViT Checkpoints +^^^^^^^^^^^^^^^^ + +To adjust model parallelism from original model parallelism size to a new model parallelism size (Note: NeMo ViT currently only supports `pipeline_model_parallel_size=1`): + +.. code-block:: python + + python examples/nlp/language_modeling/megatron_change_num_partitions.py \ + --model_file=/path/to/source.nemo \ + --target_file=/path/to/target.nemo \ + --tensor_model_parallel_size=??? \ + --target_tensor_model_parallel_size=??? \ + --pipeline_model_parallel_size=-1 \ + --target_pipeline_model_parallel_size=1 \ + --precision=32 \ + --model_class="nemo.collections.vision.models.megatron_vit_classification_models.MegatronVitClassificationModel" \ + --tp_conversion_only diff --git a/docs/source/vision/configs.rst b/docs/source/vision/configs.rst new file mode 100644 index 000000000000..b62d85c47380 --- /dev/null +++ b/docs/source/vision/configs.rst @@ -0,0 +1,134 @@ +Common Configuration Files +========================== +This section provides a detailed overview of the NeMo configuration file setup specific to models within the NeMo vision models collection . For foundational knowledge about setting up and executing experiments common to all NeMo models, such as the Experiment Manager and PyTorch Lightning trainer parameters, refer to the :doc:`../core/core` section. + +Within the configuration files of the NeMo vision models, details concerning dataset(s), augmentation, optimization parameters, and model architectural specifications are central. This page explores each of these aspects. + +Discover exemplary configuration files for all NeMo vision models scripts in the `config directory of the examples _. + +Dataset Configuration +===================== + +The configuration file delineates parameters for dataset path. + +All initialization parameters supported by the Dataset class utilized in the experiment can be defined in the config file. +.. For a comprehensive list of Datasets and their associated parameters, consult the `Datasets <./api.html#Datasets>`__ section of the API. + +A representative training configuration appears as: + +.. code-block:: yaml + + data: + data_path: + - ${data_dir}/imagenet_1k/train + - ${data_dir}/imagenet_1k/val + num_workers: 8 + dataloader_type: cyclic + validation_drop_last: True + data_sharding: False + + + +Trainer Configuration +--------------------- + +This section outlines arguments for the Pytorch Lightning Trainer Object. + +.. code-block:: yaml + + trainer: + devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use e.g. [0, 1] + num_nodes: 1 + max_epochs: -1 + max_steps: 2500000 # precedence over max_epochs + logger: False # Provided by exp_manager + precision: bf16 # Should be set to 16 for O1 and O2 to enable the AMP. 
+ accelerator: gpu + log_every_n_steps: 5 # Interval of logging. + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + num_sanity_val_steps: 10 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + enable_checkpointing: False # Provided by exp_manager + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models + gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: True + +For a detailed list of arguments, refer to the `Pytorch Lightning Trainer `__ API section. + +Experiment Manager Configurations +--------------------------------- + +The NeMo Experiment Manager provides a streamlined approach to manage various tasks such as logging, saving, and resuming. + +.. code-block:: yaml + + exp_manager: + exp_dir: null # exp_dir for your experiment, if None, defaults to "./nemo_experiments" + name: ${name} + create_wandb_logger: True + wandb_logger_kwargs: # Whether you want exp_manger to create a Wandb logger + name: training-session + project: text2img + group: nemo + resume: True + create_tensorboard_logger: True # Whether you want exp_manger to create a tb logger + create_checkpoint_callback: True # Whether you want exp_manager to create a model checkpoint callback + checkpoint_callback_params: + monitor: reduced_train_loss + save_top_k: 5 + every_n_epochs: 0 # Save checkpoint frequency. + every_n_train_steps: 1000 # Mutually exclusive with every_n_epochs. It is recommended to set this if training on large-scale dataset. + filename: '${name}--{reduced_train_loss:.2f}-{step}-{consumed_samples}' + resume_if_exists: True + resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} + ema: + enable: True + decay: 0.9999 + validate_original_weights: False + every_n_steps: 1 + cpu_offload: False + +Optimizer Configurations +------------------------- + +.. code-block:: yaml + + optim: + name: fused_adam + lr: 0.0001 + eps: 1e-8 + betas: [ 0.9, 0.999 ] + weight_decay: 0.01 + sched: + name: WarmupPolicy + warmup_steps: 10000 + warmup_ratio: null + +The default optimizer used is ``fused_adam``. For details on all supported optimizers, refer to the NeMo user guide. The learning rate scheduler can be specified in the ``optim.sched`` section. + +Model Configurations +-------------------- + +Each configuration file should detail the model architecture used for the experiment. 
+ +The parameters commonly shared across most vision collection models include: + ++-----------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| **Parameter** | **Datatype** | **Description** | ++=========================================+==============+=======================================================================================+ +| :code:`micro_batch_size` | int | Micro batch size that fits on each GPU | ++-----------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`global_batch_size` | int | Global batch size considering gradient accumulation and data parallelism | ++-----------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`tensor_model_parallel_size` | int | Intra-layer model parallelism | ++-----------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`pipeline_model_parallel_size` | int | Inter-layer model parallelism | ++-----------------------------------------+--------------+---------------------------------------------------------------------------------------+ +| :code:`seed` | int | Seed used in training | ++-----------------------------------------+--------------+---------------------------------------------------------------------------------------+ + +ViT +~~~~~~~~ + +For model-specific configurations, refer to `vit <./vit.html#vit>`_. diff --git a/docs/source/vision/datasets.rst b/docs/source/vision/datasets.rst new file mode 100644 index 000000000000..84bb38a077fa --- /dev/null +++ b/docs/source/vision/datasets.rst @@ -0,0 +1,45 @@ +Datasets +======== + +ImageNet Data Preparation +----------------- + +.. note:: It is the responsibility of each user to check the content of the dataset, review the applicable licenses, and determine if it is suitable for their intended use. Users should review any applicable links associated with the dataset before placing the data on their machine. + +Please note that according to the ImageNet terms and conditions, automated scripts for downloading the dataset are not +provided. Instead, one can follow the steps outlined below to download and extract the data. + +ImageNet 1k +^^^^^^^^^^^^^^^ + +1. Create an account on `ImageNet `_ and navigate to ILSVRC 2012. + Download "Training images (Task 1 & 2)" and "Validation images (all tasks)" to ``data/imagenet_1k``. +2. Extract the training data: + +.. code-block:: + + mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train + tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar + find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done + cd .. + +3. Extract the validation data and move the images to subfolders: + +.. code-block:: + + mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar + wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash + + +ImageNet 21k +^^^^^^^^^^^^^^^ + +1. Create an account on `ImageNet `_ and download "ImageNet21k" to + ``data/imagenet_21k``. +2. Extract the data: + +.. code-block:: + + tar -xvf winter21_whole.tar.gz && rm -f winter21_whole.tar.gz + find . 
-name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done + diff --git a/docs/source/vision/images/vit_arch.png b/docs/source/vision/images/vit_arch.png new file mode 100644 index 000000000000..4ddbcd3058bb Binary files /dev/null and b/docs/source/vision/images/vit_arch.png differ diff --git a/docs/source/vision/intro.rst b/docs/source/vision/intro.rst new file mode 100644 index 000000000000..6df5881e1121 --- /dev/null +++ b/docs/source/vision/intro.rst @@ -0,0 +1,45 @@ +Foundation Vision Models in NeMo +================================ + +NeMo has implemented foundational vision models, establishing a solid base for further exploration into multimodal applications. These foundational vision models can be leveraged in a variety of multimodal applications including multimodal language models and text to image generation tasks, among others. These foundation models not only lay the functional groundwork but also play a crucial role in achieving state-of-the-art performance on NVIDIA GPUs through our custom optimizations. + +Supported Models +----------------- +NeMo's vision foundation currently supports the following models: + ++----------------------------------+----------+-------------+------+----------------------+------------------+ +| Model | Training | Fine-Tuning | PEFT | Evaluation | Inference | ++==================================+==========+=============+======+======================+==================+ +| Vision Transformer (ViT) | ✓ | ✓ | ✗ | imagenet zero-shot | ✗ | ++----------------------------------+----------+-------------+------+----------------------+------------------+ +| AutoencoderKL (VAE with KL loss) | ✗ | ✗ | ✗ | ✗ | To be added | ++----------------------------------+----------+-------------+------+----------------------+------------------+ + +Spotlight Models +----------------- + +1. **Vision Transformer (ViT)**: + Vision Transformer (ViT) :cite:`vision-models-vit` stands as a compelling alternative to the traditionally employed Convolutional Neural Networks (CNNs) for image classification tasks. Unlike CNNs that work on the entire image, ViT divides an image into fixed-size patches, linearly embeds them into 1D vectors, and adds positional embeddings. These vectors are then fed into a Transformer encoder to capture both local and global features of the image. This model has shown to outperform CNNs in terms of computational efficiency and accuracy by a significant margin, making it a powerful tool for image-related tasks. + +2. **AutoencoderKL (Variational Autoencoder with KL loss**: + The AutoencoderKL model is a Variational Autoencoder (VAE) equipped with KL loss, introduced in the paper Auto-Encoding Variational Bayes by Diederik P. Kingma and Max Welling :cite:`vision-models-kingma2022autoencoding`. This model is adept at encoding images into latent representations and decoding these representations back into images. The KL divergence term in the loss function serves to align the distribution of the encoder output as closely as possible to a standard multivariate normal distribution, facilitating the exploration of the latent space. The continuous nature of the Variational Autoencoder's latent space enables random sampling and interpolation, which are crucial for tasks like image reconstruction and generation. + +.. note:: + NeMo Megatron has an Enterprise edition which contains tools for data preprocessing, hyperparameter tuning, container, scripts for various clouds and more. 
With the Enterprise edition, you also get deployment tools. Apply for `early access here `_ . + + +.. toctree:: + :maxdepth: 1 + + datasets + configs + checkpoint + vit + +References +---------- + +.. bibliography:: ./vision_all.bib + :style: plain + :labelprefix: VISION-MODELS + :keyprefix: vision-models- \ No newline at end of file diff --git a/docs/source/vision/vision_all.bib b/docs/source/vision/vision_all.bib new file mode 100644 index 000000000000..f51cfdac1617 --- /dev/null +++ b/docs/source/vision/vision_all.bib @@ -0,0 +1,17 @@ +@misc{vit, + title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, + author={Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby}, + year={2020}, + eprint={2010.11929}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@misc{kingma2022autoencoding, + title={Auto-Encoding Variational Bayes}, + author={Diederik P Kingma and Max Welling}, + year={2022}, + eprint={1312.6114}, + archivePrefix={arXiv}, + primaryClass={stat.ML} +} \ No newline at end of file diff --git a/docs/source/vision/vit.rst b/docs/source/vision/vit.rst new file mode 100644 index 000000000000..7109880d7ad7 --- /dev/null +++ b/docs/source/vision/vit.rst @@ -0,0 +1,136 @@ +ViT +======== + +Model Introduction +------------------- + +The Vision Transformer, commonly referred to as ViT :cite:`vision-models-vit`, serves as a foundational model +for image classification tasks in NeMo. Unlike conventional convolutional neural networks, ViT adopts a transformer-like +architecture to process image data. In this approach, an image is divided into fixed-size patches, typically +14x14 or 16x16. These patches are linearly embedded and augmented with position embeddings. The resulting +sequence of vectors is passed through a standard transformer encoder. To facilitate classification, a learnable "classification token" +is prepended to the sequence. + + .. image:: images/vit_arch.png + :align: center + :alt: ViT model + +ViT models can be instantiated using the :class:`~nemo.collections.vision.models.megatron_vit_classification_models.MegatronVitClassificationModel` class. + +Transformer Encoder +^^^^^^^^^^^^^^^^^^^ + +NeMo's ViT implementation leverages the shared Megatron parallel transformer, specifically +the `nemo.collections.nlp.modules.common.megatron.transformer.ParallelTransformer`, to enable model parallelism support +in the transformer encoder. This design choice ensures efficient scaling and utilization of resources during training.
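As a rough illustration of the encoder stack that these settings describe, the snippet below builds a comparable (non-parallel) encoder from plain PyTorch layers. It is only a sketch: NeMo itself constructs this stack with ``ParallelTransformer``, and details such as the activation function are not reproduced here.

.. code-block:: python

    import torch
    import torch.nn as nn

    # Plain-PyTorch stand-in for the ViT-B/16 encoder stack listed in the table below.
    hidden_size, ffn_dim, num_heads, num_layers, seq_len = 768, 3072, 12, 12, 204

    encoder_layer = nn.TransformerEncoderLayer(
        d_model=hidden_size,
        nhead=num_heads,
        dim_feedforward=ffn_dim,
        dropout=0.1,
        batch_first=True,
    )
    encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    tokens = torch.randn(2, seq_len, hidden_size)  # (batch, sequence, hidden)
    print(encoder(tokens).shape)                   # torch.Size([2, 204, 768])

The ViT variants supported in NeMo and their main architecture hyperparameters are summarized in the following table.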
+ ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ +| Model | Model size (M) | Hidden size| FFN_dim| Attention heads | Number of layers | PatchDim| Num Batches (Seq) | ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ +| B/16 | 86 | 768 | 3072 | 12 | 12 | 16 | 204 | ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ +| L/16 | 303 | 1024 | 4096 | 16 | 24 | 16 | 204 | ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ +| H/16 | 632 | 1280 | 5120 | 16 | 32 | 16 | 204 | ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ +| H/14 | 632 | 1280 | 5120 | 16 | 32 | 14 | 264 | ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ +| g/14 | 1011 | 1408 | 6144 | 16 | 40 | 14 | 264 | ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ +| G/14 | 1843 | 1664 | 8192 | 16 | 48 | 14 | 264 | ++-------+----------------+------------+--------+-----------------+------------------+---------+-------------------+ + +Model Configuration +------------------ + +Transformer Encoder +^^^^^^^^^^^^^^^^ + +.. code-block:: yaml + encoder_seq_length: 196 + max_position_embeddings: ${.encoder_seq_length} + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 3072 + num_attention_heads: 12 + hidden_dropout: 0.1 + attention_dropout: 0. + +- ``encoder_seq_length``: Sequence length for the transformer encoder. +- ``num_layers``, ``hidden_size``, ``ffn_hidden_size``, ``num_attention_heads``: Parameters defining the architecture of the text transformer. The ``ffn_hidden_size`` is typically 4 times the ``hidden_size``. +- ``hidden_dropout`` and ``attention_dropout``: Dropout probabilities for the hidden state and attention in the transformer respectively. + +Patch & Positional Embedding +^^^^^^^^^^^^ + +.. code-block:: yaml + vision_pretraining_type: "classify" + num_classes: 1000 + patch_dim: 16 + img_h: 224 + img_w: 224 + num_channels: 3 + +- ``vision_pretraining_type``: Type of MLP head, with support limited to classification tasks now +- ``num_classes``: Number of labels used for classification +- ``patch_dim``: Size of the patches the image is divided into. +- ``img_h`` and ``img_w``: Height and width of the input images. +- ``num_channels``: Number of channels in the input image (e.g., 3 for RGB images). 
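Note that these settings are coupled: the encoder sequence length follows directly from the image and patch sizes. The short check below is illustrative only and is not part of any NeMo configuration file.

.. code-block:: python

    # Consistency check for the configuration values above (illustrative only)
    img_h, img_w, patch_dim = 224, 224, 16

    num_patches = (img_h // patch_dim) * (img_w // patch_dim)
    print(num_patches)      # 196, matching encoder_seq_length above

    # The variant table lists 204 tokens per sequence for the /16 models
    # (264 for /14), i.e. the image patches plus 8 additional learnable tokens.
    print(num_patches + 8)  # 204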
+ +Optimizations +^^^^^^^^^^^^^^ + ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Feature | Description | To Enable | ++==========================+================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================+=================================================+ +| Data parallelism | Dataset is read concurrently across multiple GPUs or nodes, allowing for faster data loading and processing. | Automatically when training on multi GPUs/nodes | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Tensor parallelism | Each tensor is split up into multiple chunks, allowing for horizontal parallelism across GPUs. This technique, known as TensorParallel (TP), distributes the model's tensors across multiple GPUs. During processing, each shard gets processed separately and in parallel on different GPUs, and the results are synced at the end of the step. This approach is inspired by NVIDIA's Megatron implementation. [Reference](https://github.com/NVIDIA/Megatron-LM#distributed-pretraining) | ``model.tensor_model_parallel_size`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Activation Checkpointing | To reduce memory usage, activations of certain layers are cleared and recomputed during a backward pass. This technique is particularly useful for training large models that wouldn't fit in GPU memory using traditional methods. 
| ``model.activations_checkpoint_granularity=full``, ``model.activations_checkpoint_method=block``, ``model.activations_checkpoint_num_layers={num_layers_to_check}`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Bfloat16 Training | Training is conducted in Bfloat16 precision, which offers a balance between the higher precision of FP32 and the memory savings and speed of FP16. | ``trainer.precision=bf16`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| BF16 O2 | Enables O2-level automatic mixed precision, optimizing Bfloat16 precision for better performance. | ``model.megatron_amp_O2=True`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Distributed Optimizer | The optimization process is distributed across multiple GPUs, reducing memory requirements. This technique distributes the optimizer state across data parallel ranks, rather than replicating it, offering significant memory savings. This approach is inspired by the ZeRO optimization described in the paper "ZeRO: Memory Optimizations Toward Training Trillion Parameter Models" and implemented in NVIDIA's Megatron. [Reference](https://github.com/NVIDIA/Megatron-LM#distributed-optimizer) | ``model.optim.name="distributed_fused_adam"`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ +| Flash Attention V2 | FlashAttention is a fast and memory-efficient algorithm to compute exact attention. 
It speeds up model training and reduces memory requirement by being IO-aware. This approach is particularly useful for large-scale models and is detailed further in the repository linked. [Reference](https://github.com/Dao-AILab/flash-attention) | ``model.use_flash_attention=True`` | ++--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------+ + +Model Training +^^^^^^^^^^^^^^ +Below are the highlights of the training and fine-tuning recipe we used: +.. code-block:: + Model: ViT B/16 + Dataset: ImageNet 1K + Pretraining: + + Epochs: 300 + Batch Size: 4096 + Training Resolution: 224 + Optimizer: Adam (0.9, 0.999) + Base Learning Rate: 3.00E-03 + Learning Rate Decay: Cosine + Weight Decay: 0.3 + Dropout: 0.1 + + + Fine-tuning: + + Steps: 20,000 + Batch Size: 512 + Fine-tuning Resolution: 512 + Optimizer: SGD (0.9) + Base Learning Rate: 0.003 - 0.06 + Learning Rate Decay: Cosine + Weight Decay: 0 + +Reference +----------- + +.. bibliography:: ./vision_all.bib + :style: plain + :filter: docname in docnames + :labelprefix: VISION-MODELS + :keyprefix: vision-models- \ No newline at end of file diff --git a/examples/multimodal/convert_ckpt_to_nemo.py b/examples/multimodal/convert_ckpt_to_nemo.py index 2c36f434a075..2bc0f5d7ab62 100644 --- a/examples/multimodal/convert_ckpt_to_nemo.py +++ b/examples/multimodal/convert_ckpt_to_nemo.py @@ -30,10 +30,9 @@ import torch from omegaconf.omegaconf import OmegaConf, open_dict -from nemo.collections.multimodal.models.multimodal_llm.kosmos import MegatronKosmosModel from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel from nemo.collections.multimodal.models.text_to_image.controlnet.controlnet import MegatronControlNet -from nemo.collections.multimodal.models.text_to_image.imagen import MegatronImagen +from nemo.collections.multimodal.models.text_to_image.imagen.imagen import MegatronImagen from nemo.collections.multimodal.models.text_to_image.instruct_pix2pix.ldm.ddpm_edit import MegatronLatentDiffusionEdit from nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm import MegatronLatentDiffusion from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import MegatronCLIPModel diff --git a/examples/multimodal/text_to_image/controlnet/conf/controlnet_v1-5.yaml b/examples/multimodal/text_to_image/controlnet/conf/controlnet_v1-5.yaml index 13ca53e835f2..2d1b3cfd79c4 100644 --- a/examples/multimodal/text_to_image/controlnet/conf/controlnet_v1-5.yaml +++ b/examples/multimodal/text_to_image/controlnet/conf/controlnet_v1-5.yaml @@ -98,7 +98,7 @@ model: sd_locked: True control_stage_config: - _target_: nemo.collections.multimodal.models.controlnet.controlnet.ControlNet + _target_: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.ControlNet params: from_pretrained_unet: /ckpts/v1-5-pruned.ckpt from_NeMo: True @@ -119,7 +119,7 @@ model: use_flash_attention: False unet_config: - _target_: 
nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel + _target_: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.ControlledUnetModel from_pretrained: /ckpts/v1-5-pruned.ckpt from_NeMo: True image_size: 32 # unused @@ -145,7 +145,7 @@ model: use_flash_attention: False first_stage_config: - _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL + _target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL from_pretrained: /ckpts/vae.bin embed_dim: 4 monitor: val/rec_loss diff --git a/examples/multimodal/text_to_image/controlnet/controlnet_train.py b/examples/multimodal/text_to_image/controlnet/controlnet_train.py index 239409f616f1..2bb8b66cac1a 100644 --- a/examples/multimodal/text_to_image/controlnet/controlnet_train.py +++ b/examples/multimodal/text_to_image/controlnet/controlnet_train.py @@ -14,8 +14,8 @@ from pytorch_lightning import Trainer -from nemo.collections.multimodal.models.text_to_image.controlnet import ImageLogger from nemo.collections.multimodal.models.text_to_image.controlnet.controlnet import MegatronControlNet +from nemo.collections.multimodal.models.text_to_image.controlnet.util import ImageLogger from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils.exp_manager import exp_manager diff --git a/examples/multimodal/text_to_image/dreambooth/conf/dreambooth.yaml b/examples/multimodal/text_to_image/dreambooth/conf/dreambooth.yaml index 37e9b284e219..a0886c5e8970 100644 --- a/examples/multimodal/text_to_image/dreambooth/conf/dreambooth.yaml +++ b/examples/multimodal/text_to_image/dreambooth/conf/dreambooth.yaml @@ -49,7 +49,7 @@ model: global_batch_size: 2 # will use more micro batches to reach global batch size with_prior_preservation: False - use_cached_latents: True + use_cached_latents: False prior_loss_weight: 0.5 train_text_encoder: False restore_from_path: /ckpts/nemo-v1-5-188000-ema.nemo #This ckpt is only used to generate regularization images, thus .nemo ckpt is needed @@ -123,7 +123,7 @@ model: use_flash_attention: False first_stage_config: - _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL + _target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL from_pretrained: /ckpts/vae.bin embed_dim: 4 monitor: val/rec_loss @@ -158,7 +158,7 @@ model: # max_length: 77 noise_scheduler: - _target_: nemo.collections.multimodal.models.dreambooth.util.sd_noise_scheduler + _target_: nemo.collections.multimodal.models.text_to_image.dreambooth.util.sd_noise_scheduler parameterization: eps v_posterior: 0 given_betas: diff --git a/examples/multimodal/text_to_image/dreambooth/dreambooth.py b/examples/multimodal/text_to_image/dreambooth/dreambooth.py index d968d301389c..e8e7d776f1ff 100644 --- a/examples/multimodal/text_to_image/dreambooth/dreambooth.py +++ b/examples/multimodal/text_to_image/dreambooth/dreambooth.py @@ -16,7 +16,7 @@ import torch from omegaconf import OmegaConf -from nemo.collections.multimodal.models.text_to_image.dreambooth import MegatronDreamBooth +from nemo.collections.multimodal.models.text_to_image.dreambooth.dreambooth import MegatronDreamBooth from nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm import MegatronLatentDiffusion from nemo.collections.multimodal.parts.stable_diffusion.pipeline import pipeline from 
nemo.collections.multimodal.parts.utils import setup_trainer_and_model_for_inference diff --git a/examples/multimodal/text_to_image/imagen/imagen_training.py b/examples/multimodal/text_to_image/imagen/imagen_training.py index 61e879ebb063..23c1c9c1a1d7 100644 --- a/examples/multimodal/text_to_image/imagen/imagen_training.py +++ b/examples/multimodal/text_to_image/imagen/imagen_training.py @@ -18,7 +18,7 @@ from torch._dynamo import disable from torch._inductor import config as inductor_config -from nemo.collections.multimodal.models.text_to_image.imagen import MegatronImagen +from nemo.collections.multimodal.models.text_to_image.imagen.imagen import MegatronImagen from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml index 6c07d460670c..e87a99344d70 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_train.yaml @@ -119,11 +119,11 @@ model: use_checkpoint: False legacy: False use_flash_attention: True - enable_amp_o2_fp16: True + enable_amp_o2_fp16: False resblock_gn_groups: 32 first_stage_config: - _target_: nemo.collections.multimodal.models.stable_diffusion.ldm.autoencoder.AutoencoderKL + _target_: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKL from_pretrained: /ckpts/vae.bin embed_dim: 4 monitor: val/rec_loss @@ -168,7 +168,7 @@ model: ddp_overlap: True # True for using PyTorch DDP overlap. optim: - name: megatron_fused_adam + name: fused_adam lr: null weight_decay: 0. 
betas: @@ -178,9 +178,6 @@ model: name: WarmupHoldPolicy warmup_steps: 10000 hold_steps: 10000000000000 # Incredibly large value to hold the lr as constant - capturable: True - master_weights: True - max_norm: ${trainer.gradient_clip_val} # Nsys profiling options nsys_profile: diff --git a/nemo/README.md b/nemo/README.md index 2db456547ab9..d7c95a070979 100644 --- a/nemo/README.md +++ b/nemo/README.md @@ -7,3 +7,4 @@ NeMo (**Ne**ural **Mo**dules) is a toolkit for creating AI applications built ar * ASR - collection of modules and models for building speech recognition networks * TTS - collection of modules and models for building speech synthesis networks * NLP - collection of modules and models for building NLP networks +* Multimodal - collection of modules and models for building multimodal networks diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 65bcee9d48e4..1a38d86742b4 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -540,7 +540,6 @@ def dummy(): init_method_std=self.cfg.get('init_method_std', 0.02), use_scaled_init_method=self.cfg.get('use_scaled_init_method', True), fp16_lm_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False), - megatron_amp_O2=self.cfg.get('megatron_amp_O2', False), hidden_dropout=self.cfg.get('hidden_dropout', 0.1), attention_dropout=self.cfg.get('attention_dropout', 0.1), ffn_dropout=self.cfg.get('ffn_dropout', 0.0), diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py index d551edaf1bd2..1dd695af86f5 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py @@ -16,7 +16,11 @@ import pytorch_lightning as pl import torch import torch.nn.functional as F -from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer + +try: + from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer +except ImportError: + from taming.modules.vqvae.quantize import VectorQuantizer from nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.model import Decoder, Encoder from nemo.collections.multimodal.modules.stable_diffusion.distributions.distributions import ( diff --git a/nemo/collections/multimodal/modules/nerf/geometry/nerf_base.py b/nemo/collections/multimodal/modules/nerf/geometry/nerf_base.py index c539a4c17771..6ea7e98abb9c 100644 --- a/nemo/collections/multimodal/modules/nerf/geometry/nerf_base.py +++ b/nemo/collections/multimodal/modules/nerf/geometry/nerf_base.py @@ -16,7 +16,6 @@ import mcubes import numpy as np -import pymeshlab import torch import torch.nn as nn import torch.nn.functional as F @@ -304,7 +303,7 @@ def normal_finite_differences(self, positions: torch.Tensor, eps: float = 1e-2) @torch.no_grad() def mesh( self, resolution: Optional[int] = 128, batch_size: int = 128, density_thresh: Optional[float] = None - ) -> pymeshlab.Mesh: + ) -> trimesh.base.Trimesh: """ Generate a mesh from the nerf. @@ -314,7 +313,7 @@ def mesh( density_thresh (Optional[float]): Density threshold for the mesh generation. Default is None, will be calculated from mean density. Returns: - pymeshlab.Mesh: Mesh object. + trimesh.base.Trimesh: Mesh object. 
""" # Generate a grid of 3D points x = np.linspace(-self.bound, self.bound, resolution) @@ -343,31 +342,21 @@ def batch_process(fn, input, batch_size): vertices, triangles = mcubes.marching_cubes(density, density_thresh) # Create a new Mesh - ms = pymeshlab.MeshSet() + mesh = trimesh.Trimesh(vertices=vertices, faces=triangles) - # Create Mesh using vertices and faces - m = pymeshlab.Mesh(vertices.copy(), triangles.copy()) + # Basic mesh cleaning and optimization + mesh.remove_unreferenced_vertices() + mesh.remove_infinite_values() + mesh.remove_duplicate_faces() - # Add mesh to the MeshSet - ms.add_mesh(m, "generated_mesh") + # Scale vertices back to [-self.bound, self.bound] + scaled_vertices = -self.bound + (mesh.vertices / resolution) * 2 * self.bound + mesh.vertices = scaled_vertices - # Filters - ms.meshing_remove_unreferenced_vertices() - ms.meshing_remove_duplicate_faces() - ms.meshing_remove_null_faces() - ms.meshing_repair_non_manifold_edges(method=0) - ms.meshing_repair_non_manifold_vertices(vertdispratio=0) - - m = ms.current_mesh() - vertices = m.vertex_matrix() - faces = m.face_matrix() - - scaled_vertice = ( - -self.bound + (vertices / resolution) * 2 * self.bound - ) # scale vertices back to [-self.bound, self.bound] - scaled_vertices_torch = torch.tensor(scaled_vertice, dtype=torch.float32).to(device="cuda") + # Assigning color to vertices + scaled_vertices_torch = torch.tensor(scaled_vertices, dtype=torch.float32).to(device="cuda") color = batch_process(fn=self.forward_features, input=scaled_vertices_torch, batch_size=batch_size) + color = (color * 255).astype(np.uint8) + mesh.visual.vertex_colors = color - # Create the final mesh from cleaned vertices and faces and with color - mesh = trimesh.Trimesh(vertices=vertices, faces=faces, vertex_colors=color) return mesh diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 7f762343c4e9..1cf1798015eb 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -30,6 +30,7 @@ timestep_embedding, zero_module, ) +from nemo.utils import logging def convert_module_to_dtype(module, dtype): @@ -304,6 +305,23 @@ def _forward(self, x, emb): return self.skip_connection(x) + h +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. + """ + + def __init__( + self, channels, num_heads=1, num_head_channels=-1, use_checkpoint=False, use_new_attention_order=False, + ): + super().__init__() + logging.info( + "This option is deprecated, please set use_spatial_transformer=True in unet_config to build attention blocks" + ) + raise NotImplementedError + + def count_flops_attn(model, _x, y): """ A counter for the `thop` package to count the operations in an diff --git a/nemo/collections/multimodal/parts/stable_diffusion/lr_scheduler.py b/nemo/collections/multimodal/parts/stable_diffusion/lr_scheduler.py new file mode 100644 index 000000000000..620d1dcad41a --- /dev/null +++ b/nemo/collections/multimodal/parts/stable_diffusion/lr_scheduler.py @@ -0,0 +1,112 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+
+class LambdaWarmUpCosineScheduler:
+    """
+    note: use with a base_lr of 1.0
+    """
+
+    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
+        self.lr_warm_up_steps = warm_up_steps
+        self.lr_start = lr_start
+        self.lr_min = lr_min
+        self.lr_max = lr_max
+        self.lr_max_decay_steps = max_decay_steps
+        self.last_lr = 0.0
+        self.verbosity_interval = verbosity_interval
+
+    def schedule(self, n, **kwargs):
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0:
+                print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
+        if n < self.lr_warm_up_steps:
+            lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
+            self.last_lr = lr
+            return lr
+        else:
+            t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
+            t = min(t, 1.0)
+            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (1 + np.cos(t * np.pi))
+            self.last_lr = lr
+            return lr
+
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+
+
+class LambdaWarmUpCosineScheduler2:
+    """
+    supports repeated iterations, configurable via lists
+    note: use with a base_lr of 1.0.
+    """
+
+    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
+        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
+        self.lr_warm_up_steps = warm_up_steps
+        self.f_start = f_start
+        self.f_min = f_min
+        self.f_max = f_max
+        self.cycle_lengths = cycle_lengths
+        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
+        self.last_f = 0.0
+        self.verbosity_interval = verbosity_interval
+
+    def find_in_interval(self, n):
+        interval = 0
+        for cl in self.cum_cycles[1:]:
+            if n <= cl:
+                return interval
+            interval += 1
+
+    def schedule(self, n, **kwargs):
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0:
+                print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " f"current cycle {cycle}")
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            self.last_f = f
+            return f
+        else:
+            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
+            t = min(t, 1.0)
+            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (1 + np.cos(t * np.pi))
+            self.last_f = f
+            return f
+
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+
+
+class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
+    def schedule(self, n, **kwargs):
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0:
+                print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " f"current cycle {cycle}")
+
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            self.last_f = f
+            return f
+        else:
+            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (
+                self.cycle_lengths[cycle]
+            )
+            self.last_f = f
+            return f
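These schedulers return a multiplier for a base LR of 1.0 (per their docstrings), so one natural way to consume them is through `torch.optim.lr_scheduler.LambdaLR`. The sketch below is illustrative only; the model, base learning rate, and cycle settings are made-up values, not taken from any NeMo config.

```python
# Hedged usage sketch: drive a PyTorch optimizer with the new LambdaLinearScheduler.
import torch

from nemo.collections.multimodal.parts.stable_diffusion.lr_scheduler import LambdaLinearScheduler

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)  # base lr that the multiplier scales

# One cycle: 1000 warm-up steps ramping from f_start to f_max, then a linear decay
# toward f_min over the remainder of the 10000-step cycle.
multiplier = LambdaLinearScheduler(
    warm_up_steps=[1000], f_min=[0.0], f_max=[1.0], f_start=[1e-6], cycle_lengths=[10000],
)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=multiplier)

for step in range(5):
    optimizer.step()
    scheduler.step()  # invokes multiplier(step), i.e. LambdaLinearScheduler.schedule
    print(step, scheduler.get_last_lr())
```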
diff --git a/nemo/collections/multimodal/parts/stable_diffusion/utils.py b/nemo/collections/multimodal/parts/stable_diffusion/utils.py
index 3e6697747c13..47924534a803 100644
--- a/nemo/collections/multimodal/parts/stable_diffusion/utils.py
+++ b/nemo/collections/multimodal/parts/stable_diffusion/utils.py
@@ -46,7 +46,7 @@ def log_txt_as_img(wh, xc, size=10):
         try:
             draw.text((0, 0), lines, fill="black")
         except UnicodeEncodeError:
-            logging("Cant encode string for logging. Skipping.")
+            logging.info("Cant encode string for logging. Skipping.")
 
         txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
         txts.append(txt)
@@ -88,7 +88,7 @@ def mean_flat(tensor):
 def count_params(model, verbose=False):
     total_params = sum(p.numel() for p in model.parameters())
     if verbose:
-        logging(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+        logging.info(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
     return total_params
 
 
@@ -104,7 +104,7 @@ def instantiate_from_config(config):
 
 def get_obj_from_str(string, reload=False):
     module, cls = string.rsplit(".", 1)
-    logging(f'Getting module=<{module}>, cls=<{cls}>')
+    logging.info(f'Getting module=<{module}>, cls=<{cls}>')
     if reload:
         module_imp = importlib.import_module(module)
         importlib.reload(module_imp)
@@ -130,7 +130,7 @@ def parallel_data_prefetch(
         raise ValueError("list expected but function got ndarray.")
     elif isinstance(data, abc.Iterable):
         if isinstance(data, dict):
-            logging(
+            logging.info(
                 f'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.'
             )
             data = list(data.values())
@@ -164,7 +164,7 @@ def parallel_data_prefetch(
         processes += [p]
 
     # start processes
-    logging(f"Start prefetching...")
+    logging.info(f"Start prefetching...")
     import time
 
     start = time.time()
@@ -183,7 +183,7 @@ def parallel_data_prefetch(
                 gather_res[res[0]] = res[1]
 
     except Exception as e:
-        logging("Exception: ", e)
+        logging.info("Exception: ", e)
         for p in processes:
             p.terminate()
@@ -191,7 +191,7 @@ def parallel_data_prefetch(
     finally:
         for p in processes:
             p.join()
-    logging(f"Prefetching complete. [{time.time() - start} sec.]")
+    logging.info(f"Prefetching complete. [{time.time() - start} sec.]")
 
     if target_data_type == 'ndarray':
         if not isinstance(gather_res[0], np.ndarray):
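These hunks only swap bare `logging(...)` calls for `logging.info(...)`; the helpers themselves are unchanged. For orientation, a hedged sketch of how `get_obj_from_str` (whose dotted-path split is visible in the context lines) is typically used; `torch.nn.Linear` is just a convenient target and is not referenced by the diff.

```python
# Resolve a dotted path to an object and instantiate it. Illustrative only.
from nemo.collections.multimodal.parts.stable_diffusion.utils import get_obj_from_str

linear_cls = get_obj_from_str("torch.nn.Linear")  # rsplit -> module "torch.nn", attribute "Linear"
layer = linear_cls(in_features=16, out_features=4)
print(type(layer))
```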
diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt
new file mode 100644
index 000000000000..a2506f7a5fff
--- /dev/null
+++ b/requirements/requirements_multimodal.txt
@@ -0,0 +1,16 @@
+addict
+clip
+diffusers>=0.19.3
+einops_exts
+# webdataset>=0.2.48 conflict with ASR models, manually pip install when using multimodal
+# flash-attn>=2.0.1
+imageio
+kornia
+nerfacc>=0.5.3
+open_clip_torch
+opencv-python>=4.8.0.74
+PyMCubes
+taming-transformers
+torchdiffeq
+torchsde
+trimesh
diff --git a/setup.py b/setup.py
index 011e2d15d89d..1195bf0cbb0d 100644
--- a/setup.py
+++ b/setup.py
@@ -87,6 +87,7 @@ def req_file(filename, folder="requirements"):
     'nlp': req_file("requirements_nlp.txt"),
     'tts': req_file("requirements_tts.txt"),
     'slu': req_file("requirements_slu.txt"),
+    'multimodal': req_file("requirements_multimodal.txt"),
 }
 
 
@@ -98,6 +99,9 @@ def req_file(filename, folder="requirements"):
 extras_require['asr'] = list(chain([extras_require['asr'], extras_require['core'], extras_require['common']]))
 extras_require['nlp'] = list(chain([extras_require['nlp'], extras_require['core'], extras_require['common'],]))
 extras_require['tts'] = list(chain([extras_require['tts'], extras_require['core'], extras_require['common'],]))
+extras_require['multimodal'] = list(
+    chain([extras_require['multimodal'], extras_require['nlp'], extras_require['core'], extras_require['common'],])
+)
 
 # TTS has extra dependencies
 extras_require['tts'] = list(chain([extras_require['tts'], extras_require['asr']]))
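With the `multimodal` extra wired into `extras_require` (chained with the `nlp`, `core`, and `common` requirement sets), the dependency group can be pulled in at install time, e.g. `pip install "nemo_toolkit[multimodal]"`. The distribution name and the check below are assumptions for illustration, not something this diff asserts.

```python
# Hedged sanity check that the new extra is declared by an installed NeMo build;
# "nemo_toolkit" is the assumed distribution name.
import importlib.metadata as md

meta = md.metadata("nemo_toolkit")
extras = meta.get_all("Provides-Extra") or []
print("multimodal" in extras, sorted(extras))
```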