diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml
index 8ebb698f..307ade0e 100644
--- a/.github/workflows/build-and-publish.yml
+++ b/.github/workflows/build-and-publish.yml
@@ -14,6 +14,7 @@ jobs:
           - "framework"
           - "accelerated-peft"
           - "fused-ops-and-kernels"
+          - "attention-and-distributed-packing"
     permissions:
       id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
index d6d6b089..90f7210a 100644
--- a/.github/workflows/format.yml
+++ b/.github/workflows/format.yml
@@ -29,7 +29,7 @@ jobs:
           - "framework"
           - "accelerated-peft"
           - "fused-ops-and-kernels"
-          - "instruct-lab"
+          - "attention-and-distributed-packing"
     steps:
       - uses: actions/checkout@v4
diff --git a/README.md b/README.md
index f79026f4..1158550c 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ Plugin | Description | Depends | License | Status
 [framework](./plugins/framework/README.md) | This acceleration framework for integration with huggingface trainers | | | Alpha
 [accelerated-peft](./plugins/accelerated-peft/README.md) | For PEFT-training, e.g., 4bit QLoRA. | Huggingface<br>AutoGPTQ | Apache 2.0<br>MIT | Alpha
 [fused-op-and-kernels](./plugins/fused-ops-and-kernels/README.md) | Fused LoRA and triton kernels (e.g., fast cross-entropy, rms, rope) | -- | Apache 2.0 [(contains extracted code)](./plugins/fused-ops-and-kernels/README.md#code-extracted-from-unsloth)| Beta
-[instruct-lab](./plugins/instruct-lab/README.md) | Padding-Free Flash Attention Computation | flash-attn | Apache 2.0 | Beta
+[attention-and-distributed-packing](./plugins/attention-and-distributed-packing/README.md) | Padding-Free Flash Attention Computation | flash-attn | Apache 2.0 | Beta
 MOE-training-acceleration | [MegaBlocks](https://github.com/databricks/megablocks) inspired triton Kernels and acclerations for Mixture-of-Expert models | | Apache 2.0 | Coming Soon
 
 ## Usage with FMS HF Tuning
diff --git a/plugins/instruct-lab/.isort.cfg b/plugins/attention-and-distributed-packing/.isort.cfg
similarity index 80%
rename from plugins/instruct-lab/.isort.cfg
rename to plugins/attention-and-distributed-packing/.isort.cfg
index 4aa62fac..98382601 100644
--- a/plugins/instruct-lab/.isort.cfg
+++ b/plugins/attention-and-distributed-packing/.isort.cfg
@@ -8,6 +8,3 @@ import_heading_firstparty=First Party
 import_heading_localfolder=Local
 known_firstparty=
 known_localfolder=tuning
-
-# skip code imported from unsloth
-skip_glob=**/unsloth*/**
diff --git a/plugins/instruct-lab/.pylintrc b/plugins/attention-and-distributed-packing/.pylintrc
similarity index 100%
rename from plugins/instruct-lab/.pylintrc
rename to plugins/attention-and-distributed-packing/.pylintrc
diff --git a/plugins/instruct-lab/README.md b/plugins/attention-and-distributed-packing/README.md
similarity index 96%
rename from plugins/instruct-lab/README.md
rename to plugins/attention-and-distributed-packing/README.md
index d76f327e..cad6ec63 100644
--- a/plugins/instruct-lab/README.md
+++ b/plugins/attention-and-distributed-packing/README.md
@@ -1,4 +1,4 @@
-# FMS Acceleration for Instruct Lab
+# FMS Acceleration for Attention And Distributed Packing Plugin
 
 This library contains plugins to accelerate finetuning with the following optimizations:
 
diff --git a/plugins/instruct-lab/configs/instruct_lab.yaml b/plugins/attention-and-distributed-packing/configs/aadp.yaml
similarity index 100%
rename from plugins/instruct-lab/configs/instruct_lab.yaml
rename to plugins/attention-and-distributed-packing/configs/aadp.yaml
diff --git a/plugins/instruct-lab/pyproject.toml b/plugins/attention-and-distributed-packing/pyproject.toml
similarity index 81%
rename from plugins/instruct-lab/pyproject.toml
rename to plugins/attention-and-distributed-packing/pyproject.toml
index e6e4adb1..00f1a155 100644
--- a/plugins/instruct-lab/pyproject.toml
+++ b/plugins/attention-and-distributed-packing/pyproject.toml
@@ -3,9 +3,9 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [project]
-name = "fms-acceleration-ilab"
+name = "fms-acceleration-aadp"
 version = '0.0.1'
-description = "FMS Acceleration Plugin for Functionalities Used in Instruct Lab Training"
+description = "FMS Acceleration Plugin for Attention and Distributed Packing Optimizations"
 authors = [
     {name = "Fabian Lim", email = "flim@sg.ibm.com"},
     {name = "Aaron Chew", email = "aaron.chew1@ibm.com"},
@@ -24,7 +24,7 @@ classifiers=[
 ]
 
 [tool.hatch.build.targets.wheel]
-only-include = ["src/fms_acceleration_ilab"]
+only-include = ["src/fms_acceleration_aadp"]
 
 [tool.hatch.build.targets.wheel.sources]
 "src" = ""
diff --git a/plugins/instruct-lab/src/fms_acceleration_ilab/__init__.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/__init__.py
similarity index 100%
rename from plugins/instruct-lab/src/fms_acceleration_ilab/__init__.py
rename to plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/__init__.py
diff --git a/plugins/instruct-lab/src/fms_acceleration_ilab/ilab_utils.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/aadp_utils.py
similarity index 75%
rename from plugins/instruct-lab/src/fms_acceleration_ilab/ilab_utils.py
rename to plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/aadp_utils.py
index 330bf5eb..08314e68 100644
--- a/plugins/instruct-lab/src/fms_acceleration_ilab/ilab_utils.py
+++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/aadp_utils.py
@@ -15,7 +15,7 @@
 from dataclasses import dataclass
 import warnings
 from transformers import DefaultDataCollator, default_data_collator
-
+import torch
 
 @dataclass
 class DataCollatorWithFlattening(DefaultDataCollator):
@@ -51,4 +51,24 @@ def __call__(self, features, return_tensors=None):
                 ret["labels"] += [-100] + feature["labels"][1:]
             else:
                 ret["labels"] += [-100] + feature["input_ids"][1:]
-        return default_data_collator([ret], return_tensors)
+
+        position_ids = torch.tensor(ret["position_ids"]).flatten()
+        indices_q = torch.arange(
+            position_ids.size(0), device=position_ids.device, dtype=torch.int32
+        )
+        cu_seq_lens = torch.cat(
+            (
+                indices_q[position_ids == 0],
+                torch.tensor(
+                    position_ids.size(), dtype=torch.int32
+                ),
+            )
+        )
+        max_length = position_ids.max() + 1
+
+        # return default_data_collator([ret], return_tensors)
+        return {
+            **default_data_collator([ret], return_tensors),
+            "cu_seq_lens": cu_seq_lens,
+            "max_length": max_length,
+        }
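Note on the collator change above: the new return block derives the packed-sequence boundaries directly from the flattened position_ids, where each reset to 0 marks the start of a sequence and the total token count is appended as the final boundary. A minimal sketch of that computation on hypothetical data (names mirror the diff; this snippet is illustrative and not part of the patch):

```python
import torch

# hypothetical packed batch: three sequences of lengths 3, 2 and 4, flattened together
position_ids = torch.tensor([0, 1, 2, 0, 1, 0, 1, 2, 3])

indices_q = torch.arange(position_ids.size(0), dtype=torch.int32)
# sequence starts are wherever position_ids resets to 0; the total length closes the last sequence
cu_seq_lens = torch.cat(
    (indices_q[position_ids == 0], torch.tensor(position_ids.size(), dtype=torch.int32))
)
max_length = position_ids.max() + 1

print(cu_seq_lens)  # tensor([0, 3, 5, 9], dtype=torch.int32)
print(max_length)   # tensor(4)
```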
diff --git a/plugins/instruct-lab/src/fms_acceleration_ilab/flash_attn.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/flash_attn.py
similarity index 80%
rename from plugins/instruct-lab/src/fms_acceleration_ilab/flash_attn.py
rename to plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/flash_attn.py
index 26e26d01..1ecb978c 100644
--- a/plugins/instruct-lab/src/fms_acceleration_ilab/flash_attn.py
+++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/flash_attn.py
@@ -32,36 +32,27 @@
     inspect.signature(flash_attn_func).parameters
 )
 
+# model id -> position_ids
+POSITION_IDS_CACHE = {}
+CU_SEQ_LENS_CACHE = {}
+MAX_SEQ_LENS_CACHE = {}
 
-def prepare_fa2_from_position_ids(query, key, value, position_ids, query_length):
-    query = query.view(-1, query.size(-2), query.size(-1))
-    key = key.view(-1, key.size(-2), key.size(-1))
-    value = value.view(-1, value.size(-2), value.size(-1))
-    position_ids = position_ids.flatten()
-    indices_q = torch.arange(
-        position_ids.size(0), device=position_ids.device, dtype=torch.int32
-    )
-    cu_seq_lens = torch.cat(
-        (
-            indices_q[position_ids == 0],
-            torch.tensor(
-                position_ids.size(), device=position_ids.device, dtype=torch.int32
-            ),
-        )
-    )
-    max_length = position_ids.max() + 1
-    return (
-        query,
-        key,
-        value,
-        indices_q,
-        (cu_seq_lens, cu_seq_lens),
-        (max_length, max_length),
-    )
+# This is used to patch the top-level model to accept cuseqlen
+# and maxseqlen as additional args that are cached for attention
+# computation
+def build_toplevel_model_forward(
+    model: torch.nn.Module,
+    model_id: str,
+):
+    # forward
+    old_forward = model.forward
+    def forward(self, *args, cu_seq_lens, max_length, **kwargs):
+        CU_SEQ_LENS_CACHE[model_id] = (cu_seq_lens, cu_seq_lens)
+        MAX_SEQ_LENS_CACHE[model_id] = (max_length, max_length)
+        return old_forward(*args, **kwargs)
 
-# model id -> position_ids
-POSITION_IDS_CACHE = {}
+    return forward
 
 # - needed to store position ids when first come into model
@@ -123,6 +114,8 @@ def _flash_attention_forward_with_posids(
 ):
     # get the position ids out here
     position_ids = POSITION_IDS_CACHE[model_id]
+    cu_seqlens_q, cu_seqlens_k = CU_SEQ_LENS_CACHE[model_id]
+    max_seqlen_in_batch_q, max_seqlen_in_batch_k = MAX_SEQ_LENS_CACHE[model_id]
 
     if not use_top_left_mask:
         causal = is_causal
@@ -161,19 +154,10 @@ def _flash_attention_forward_with_posids(
         assert attention_mask is None, "should not be using attention mask"
         assert position_ids is not None, "should be expecting position ids"
        batch_size = query_states.size(0)
-        (
-            query_states,
-            key_states,
-            value_states,
-            _,
-            cu_seq_lens,
-            max_seq_lens,
-        ) = prepare_fa2_from_position_ids(
-            query_states, key_states, value_states, position_ids, query_length
-        )
-        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+        query_states = query_states.view(-1, query_states.size(-2), query_states.size(-1))
+        key_states = key_states.view(-1, key_states.size(-2), key_states.size(-1))
+        value_states = value_states.view(-1, value_states.size(-2), value_states.size(-1))
 
         attn_output = flash_attn_varlen_func(
             query_states,
diff --git a/plugins/instruct-lab/src/fms_acceleration_ilab/framework_plugin_padding_free.py b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py
similarity index 92%
rename from plugins/instruct-lab/src/fms_acceleration_ilab/framework_plugin_padding_free.py
rename to plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py
index 33a592ee..70d6fcbf 100644
--- a/plugins/instruct-lab/src/fms_acceleration_ilab/framework_plugin_padding_free.py
+++ b/plugins/attention-and-distributed-packing/src/fms_acceleration_aadp/framework_plugin_padding_free.py
@@ -93,7 +93,7 @@ def _is_backbone(module: torch.nn.Module):
         # - patch backbone
         model_type = model.config.model_type
         # pylint: disable=import-outside-toplevel
-        from .flash_attn import build_backbone_forward
+        from .flash_attn import build_backbone_forward, build_toplevel_model_forward
 
         ModelPatcher.register(
             ModelPatcherRule(
@@ -106,6 +106,19 @@ def _is_backbone(module: torch.nn.Module):
             ),
         )
 
+        # Need to patch the top-level model to accept and cache additional
+        # kwargs, cu_seq_lens and max_len from data collator
+        ModelPatcher.register(
+            ModelPatcherRule(
+                rule_id=f"{model_type}-cumseqlen-cache",
+                trigger=ModelPatcherTrigger(check=model.__class__),
+                forward_builder=partial(
+                    build_toplevel_model_forward,
+                    model_id=id(model),
+                ),
+            ),
+        )
+
         # Next, the flash attention function needs to be patched
         # how it is patched depends on the transformers version
         try:
@@ -185,7 +198,7 @@ def _patch_dataloader(
         except ImportError:
             # Otherwise, use the locally implemented DataCollatorWithFlattening
             # pylint: disable=import-outside-toplevel
-            from .ilab_utils import (
+            from .aadp_utils import (
                 DataCollatorWithFlattening,
             )
 
diff --git a/plugins/instruct-lab/tests/__init__.py b/plugins/attention-and-distributed-packing/tests/__init__.py
similarity index 100%
rename from plugins/instruct-lab/tests/__init__.py
rename to plugins/attention-and-distributed-packing/tests/__init__.py
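Note on the flash_attn.py and framework_plugin_padding_free.py changes above: the top-level forward is wrapped so that the collator-provided cu_seq_lens / max_length kwargs are stripped off and cached per model id, and the patched attention later reads them back instead of recomputing boundaries from position_ids. A minimal, self-contained sketch of that wrap-and-cache pattern, using a toy torch.nn.Linear stand-in and direct assignment to model.forward as illustrative assumptions (in the plugin the wrapper is installed through ModelPatcher, as registered above):

```python
import torch

# model id -> cached values, mirroring the caches added in flash_attn.py
CU_SEQ_LENS_CACHE = {}
MAX_SEQ_LENS_CACHE = {}

def build_toplevel_model_forward(model: torch.nn.Module, model_id):
    old_forward = model.forward  # capture the original (bound) forward

    def forward(*args, cu_seq_lens, max_length, **kwargs):
        # stash the collator-provided values for the patched attention to read later
        CU_SEQ_LENS_CACHE[model_id] = (cu_seq_lens, cu_seq_lens)
        MAX_SEQ_LENS_CACHE[model_id] = (max_length, max_length)
        # call the original forward without the extra kwargs
        return old_forward(*args, **kwargs)

    return forward

# hypothetical stand-in for the HF top-level model
model = torch.nn.Linear(4, 4)
model.forward = build_toplevel_model_forward(model, model_id=id(model))

out = model.forward(
    torch.randn(9, 4),
    cu_seq_lens=torch.tensor([0, 3, 5, 9], dtype=torch.int32),
    max_length=4,
)
assert id(model) in CU_SEQ_LENS_CACHE and id(model) in MAX_SEQ_LENS_CACHE
```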
diff --git a/plugins/instruct-lab/tests/test_ilab_plugin.py b/plugins/attention-and-distributed-packing/tests/test_aadp_plugin.py
similarity index 83%
rename from plugins/instruct-lab/tests/test_ilab_plugin.py
rename to plugins/attention-and-distributed-packing/tests/test_aadp_plugin.py
index c3185d83..ea38158b 100644
--- a/plugins/instruct-lab/tests/test_ilab_plugin.py
+++ b/plugins/attention-and-distributed-packing/tests/test_aadp_plugin.py
@@ -17,13 +17,13 @@
     instantiate_framework,
     read_configuration,
 )
-from fms_acceleration_ilab import PaddingFreeAccelerationPlugin
+from fms_acceleration_aadp import PaddingFreeAccelerationPlugin
 
 # configuration
 DIRNAME = os.path.dirname(__file__)
-CONFIG_PATH_ILAB = os.path.join(DIRNAME, "../configs/instruct_lab.yaml")
+CONFIG_PATH_ILAB = os.path.join(DIRNAME, "../configs/aadp.yaml")
 
-def test_framework_installs_ilab_padding_free_plugin():
+def test_framework_installs_aadp_padding_free_plugin():
     with instantiate_framework(
         read_configuration(CONFIG_PATH_ILAB), require_packages_check=False
     ) as framework:
diff --git a/plugins/instruct-lab/tox.ini b/plugins/attention-and-distributed-packing/tox.ini
similarity index 100%
rename from plugins/instruct-lab/tox.ini
rename to plugins/attention-and-distributed-packing/tox.ini
diff --git a/plugins/framework/src/fms_acceleration/constants.py b/plugins/framework/src/fms_acceleration/constants.py
index 3cdef252..6a81d977 100644
--- a/plugins/framework/src/fms_acceleration/constants.py
+++ b/plugins/framework/src/fms_acceleration/constants.py
@@ -21,4 +21,4 @@
 # and activated.
 # - hence the plugins that have model loaders should be on top of this list
 
-PLUGINS = ["peft", "foak", "ilab"]
+PLUGINS = ["peft", "foak", "aadp"]
diff --git a/sample-configurations/CONTENTS.yaml b/sample-configurations/CONTENTS.yaml
index f5dc6819..e2eccbc1 100644
--- a/sample-configurations/CONTENTS.yaml
+++ b/sample-configurations/CONTENTS.yaml
@@ -33,7 +33,7 @@ framework_configs:
       - fused-ops-and-kernels
     filename: accelerated-peft-bnb-nf4-foak-sample-configuration.yaml
 
-  - shortname: ilab-padding-free
+  - shortname: aadp-padding-free
     plugins:
-      - instruct-lab
-    filename: ilab-padding-free-sample-configuration.yaml
\ No newline at end of file
+      - attention-and-distributed-packing
+    filename: aadp-padding-free-sample-configuration.yaml
\ No newline at end of file
diff --git a/sample-configurations/ilab-padding-free-sample-configuration.yaml b/sample-configurations/aadp-padding-free-sample-configuration.yaml
similarity index 100%
rename from sample-configurations/ilab-padding-free-sample-configuration.yaml
rename to sample-configurations/aadp-padding-free-sample-configuration.yaml
diff --git a/scripts/generate_sample_configurations.py b/scripts/generate_sample_configurations.py
index c147df6a..3dd80b92 100644
--- a/scripts/generate_sample_configurations.py
+++ b/scripts/generate_sample_configurations.py
@@ -144,7 +144,7 @@ def read_configuration(path: str) -> Dict:
 KEY_BNB_NF4_BASELINE = "baseline-bnb-nf4"
 KEY_AUTO_GPTQ_FOAK = "auto-gptq-foak"
 KEY_BNB_NF4_FOAK = "bnb-nf4-foak"
-KEY_ILAB_PADDING_FREE = "ilab-padding-free"
+KEY_AADP_PADDING_FREE = "aadp-padding-free"
 
 CONFIGURATIONS = {
     KEY_AUTO_GPTQ: "plugins/accelerated-peft/configs/autogptq.yaml",
@@ -167,7 +167,7 @@ def read_configuration(path: str) -> Dict:
         "plugins/fused-ops-and-kernels/configs/fast_quantized_peft.yaml",
         [("peft.quantization.fused_ops_and_kernels.base_layer", "bitsandbytes")],
     ),
-    KEY_ILAB_PADDING_FREE: "plugins/instruct-lab/configs/instruct_lab.yaml",
+    KEY_AADP_PADDING_FREE: "plugins/attention-and-distributed-packing/configs/aadp.yaml",
 }
 
 # list of (tag, combi) tuples
@@ -181,7 +181,7 @@ def read_configuration(path: str) -> Dict:
     ("baseline-peft-bnb-nf4", (KEY_BNB_NF4_BASELINE,)),
     ("accelerated-peft-autogptq-foak", (KEY_AUTO_GPTQ, KEY_AUTO_GPTQ_FOAK)),
     ("accelerated-peft-bnb-nf4-foak", (KEY_BNB_NF4, KEY_BNB_NF4_FOAK)),
-    ("ilab-padding-free", (KEY_ILAB_PADDING_FREE,)),
+    ("aadp-padding-free", (KEY_AADP_PADDING_FREE,)),
 ]
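Note on the overall flow: with the pieces above, each batch now carries the packed-sequence metadata from the collator all the way to the flash-attention call. A hedged usage sketch, assuming the renamed package is installed as fms-acceleration-aadp and that the unchanged top of __call__ still builds the flattened input_ids/labels/position_ids dict as in the upstream Hugging Face collator; the expected values follow from the visible code, not from running the patch:

```python
# assumed import path, per the renamed module layout in this diff
from fms_acceleration_aadp.aadp_utils import DataCollatorWithFlattening

collator = DataCollatorWithFlattening()
features = [
    {"input_ids": [1, 2, 3], "labels": [1, 2, 3]},
    {"input_ids": [4, 5], "labels": [4, 5]},
]
batch = collator(features, return_tensors="pt")

# two packed sequences of lengths 3 and 2: starts at 0 and 3, total length 5
print(batch["cu_seq_lens"])  # expected: tensor([0, 3, 5], dtype=torch.int32)
print(batch["max_length"])   # expected: tensor(3)
```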