diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td
index 21670b6827..04f0a0aa96 100644
--- a/include/aie/Dialect/AIE/IR/AIEOps.td
+++ b/include/aie/Dialect/AIE/IR/AIEOps.td
@@ -1689,7 +1689,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol]
         // via_shared_mem==1 means use consumer tile's memory module
         OptionalAttr<AIEI32Attr>:$via_shared_mem,
         // memtile_repeat==0 means "do it once" and don't repeat
-        OptionalAttr<AIEI32Attr>:$memtile_repeat
+        OptionalAttr<AIEI32Attr>:$memtile_repeat,
+        OptionalAttr<BDPadLayoutArrayAttr>:$padDimensions
   );
 
   let assemblyFormat = [{
@@ -1728,7 +1729,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol]
     OpBuilder<(ins "mlir::StringAttr":$sym_name, "mlir::Value":$producerTile,
                    "mlir::ValueRange":$consumerTiles, "mlir::Attribute":$elemNumber, "mlir::Type":$elem_type,
                    CArg<"llvm::ArrayRef<AIE::BDDimLayoutAttr>", "{}">:$dimensionsToStream,
-                   CArg<"llvm::ArrayRef<AIE::BDDimLayoutArrayAttr>", "{}">:$dimensionsFromStreamPerConsumer), [{
+                   CArg<"llvm::ArrayRef<AIE::BDDimLayoutArrayAttr>", "{}">:$dimensionsFromStreamPerConsumer,
+                   CArg<"llvm::ArrayRef<AIE::BDPadLayoutArrayAttr>", "{}">:$padDimensions), [{
       odsState.addOperands(producerTile);
       odsState.addOperands(consumerTiles);
       odsState.addAttribute(getSymNameAttrName(odsState.name), sym_name);
diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td
index ce04f131e0..6e7d9968df 100644
--- a/include/aie/Dialect/AIEX/IR/AIEX.td
+++ b/include/aie/Dialect/AIEX/IR/AIEX.td
@@ -570,7 +570,13 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
         OptionalAttr<PacketInfoAttr>:$packet,
         FlatSymbolRefAttr:$metadata,
         I64Attr:$id,
-        DefaultValuedOptionalAttr<BoolAttr, "false">:$issue_token
+        DefaultValuedOptionalAttr<BoolAttr, "false">:$issue_token,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$d0_zero_before,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$d1_zero_before,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$d2_zero_before,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$d0_zero_after,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$d1_zero_after,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$d2_zero_after
   );
 
   let assemblyFormat = [{
@@ -840,7 +846,13 @@ def AIE_NpuWriteBdOp: AIEX_Op<"npu.writebd", []> {
         I32Attr:$lock_rel_id,
         I32Attr:$lock_acq_enable,
         I32Attr:$lock_acq_val,
-        I32Attr:$lock_acq_id
+        I32Attr:$lock_acq_id,
+        I32Attr:$d0_zero_before,
+        I32Attr:$d1_zero_before,
+        I32Attr:$d2_zero_before,
+        I32Attr:$d0_zero_after,
+        I32Attr:$d1_zero_after,
+        I32Attr:$d2_zero_after
   );
   let results = (outs );
   let assemblyFormat = [{ attr-dict }];
diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 6b77f498b1..535912a9ff 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -459,14 +459,22 @@ struct AIEObjectFifoStatefulTransformPass
   void createBd(OpBuilder &builder, LockOp acqLock, int acqMode,
                 LockAction acqLockAction, LockOp relLock, int relMode,
                 MyOp buff, int offset, int len, Block *succ,
-                BDDimLayoutArrayAttr dims) {
+                BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions) {
     if (acqLock)
       builder.create<UseLockOp>(builder.getUnknownLoc(), acqLock, acqLockAction,
                                 acqMode);
-    if (!dims.getValue().empty())
+
+    // padDimensions is null when no padding was requested (several callers
+    // pass nullptr), so check it before calling getValue().
+    const bool hasPadding = padDimensions && !padDimensions.getValue().empty();
+    if (!dims.getValue().empty() && hasPadding) {
+      builder.create<DMABDOp>(builder.getUnknownLoc(), buff, offset, len, dims,
+                              padDimensions);
+    } else if (!dims.getValue().empty()) {
       builder.create<DMABDOp>(builder.getUnknownLoc(), buff, offset, len, dims);
-    else
+    } else {
       builder.create<DMABDOp>(builder.getUnknownLoc(), buff, offset, len);
+    }
     if (acqLock)
       builder.create<UseLockOp>(builder.getUnknownLoc(), relLock,
                                 LockAction::Release, relMode);
@@ -480,7 +485,8 @@ struct AIEObjectFifoStatefulTransformPass
   void createBdBlock(OpBuilder &builder, ObjectFifoCreateOp op, int lockMode,
                      int acqNum, int relNum, MyOp buff, int offset, int len,
                      DMAChannelDir channelDir, size_t blockIndex, Block *succ,
-                     BDDimLayoutArrayAttr dims) {
+                     BDDimLayoutArrayAttr dims,
+                     BDPadLayoutArrayAttr padDimensions) {
     LockOp acqLock;
     LockOp relLock;
     int acqMode = 1;
@@ -505,20 +511,25 @@ struct AIEObjectFifoStatefulTransformPass
       }
     }
     createBd(builder, acqLock, acqMode, acqLockAction, relLock, relMode, buff,
-             offset, len, succ, dims);
+             offset, len, succ, dims, padDimensions);
   }
 
   /// Function that either calls createAIETileDMA(), createShimDMA() or
   /// createMemTileDMA() based on op tile row value.
   void createDMA(DeviceOp &device, OpBuilder &builder, ObjectFifoCreateOp op,
                  DMAChannelDir channelDir, int channelIndex, int lockMode,
-                 BDDimLayoutArrayAttr dims) {
+                 BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr pad_dims) {
     if (op.getProducerTileOp().isShimTile()) {
       createShimDMA(device, builder, op, channelDir, channelIndex, lockMode,
                     dims);
-    } else if (op.getProducerTileOp().isMemTile()) {
+    } else if (op.getProducerTileOp().isMemTile() &&
+               channelDir == DMAChannelDir::MM2S) {
       createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode,
-                       dims);
+                       dims, pad_dims);
+    } else if (op.getProducerTileOp().isMemTile() &&
+               channelDir == DMAChannelDir::S2MM) {
+      createMemTileDMA(device, builder, op, channelDir, channelIndex, lockMode,
+                       dims, nullptr);
     } else {
       createAIETileDMA(device, builder, op, channelDir, channelIndex, lockMode,
                        dims);
@@ -602,7 +613,7 @@ struct AIEObjectFifoStatefulTransformPass
       builder.setInsertionPointToStart(curr);
       createBdBlock<BufferOp>(builder, target, lockMode, acqNum, relNum,
                               buffersPerFifo[target][blockIndex], /*offset*/ 0,
-                              len, channelDir, blockIndex, succ, dims);
+                              len, channelDir, blockIndex, succ, dims, nullptr);
       curr = succ;
       blockIndex++;
     }
@@ -678,7 +689,7 @@ struct AIEObjectFifoStatefulTransformPass
       createBdBlock<ExternalBufferOp>(builder, op, lockMode, acqNum, relNum,
                                       externalBuffersPerFifo[op][blockIndex],
                                       /*offset*/ 0, len, channelDir, blockIndex,
-                                      succ, dims);
+                                      succ, dims, nullptr);
       curr = succ;
       blockIndex++;
     }
@@ -689,7 +700,8 @@ struct AIEObjectFifoStatefulTransformPass
   void createMemTileDMA(DeviceOp &device, OpBuilder &builder,
                         ObjectFifoCreateOp op, DMAChannelDir channelDir,
                         int channelIndex, int lockMode,
-                        BDDimLayoutArrayAttr dims) {
+                        BDDimLayoutArrayAttr dims,
+                        BDPadLayoutArrayAttr padDimensions) {
     size_t numBlocks = op.size();
     if (numBlocks == 0)
       return;
@@ -710,6 +722,7 @@ struct AIEObjectFifoStatefulTransformPass
                                               dims.getValue().drop_front(1));
       }
     }
+
     if (op.getMemtileRepeat().has_value())
       repeatCount = op.getMemtileRepeat().value();
 
@@ -839,7 +852,8 @@ struct AIEObjectFifoStatefulTransformPass
         offset = extraOffset;
       createBdBlock<BufferOp>(builder, target, lockMode, acqNum, relNum,
                               buffersPerFifo[target][blockIndex], offset,
-                              lenOut, channelDir, blockIndex, succ, dims);
+                              lenOut, channelDir, blockIndex, succ, dims,
+                              padDimensions);
       curr = succ;
       blockIndex++;
     }
@@ -1303,7 +1317,6 @@ struct AIEObjectFifoStatefulTransformPass
     auto consumerWireType = WireBundle::DMA;
     std::set<TileOp>
         objectFifoTiles; // track cores to check for loops during unrolling
-
     //===------------------------------------------------------------------===//
     // Split objectFifos into a consumer end and producer end if needed
     //===------------------------------------------------------------------===//
@@ -1446,7 +1459,8 @@ struct AIEObjectFifoStatefulTransformPass
       DMAChannel producerChan =
           dmaAnalysis.getMasterDMAChannel(producer.getProducerTile());
       createDMA(device, builder, producer, producerChan.direction,
-                producerChan.channel, 0, producer.getDimensionsToStreamAttr());
+                producerChan.channel, 0, producer.getDimensionsToStreamAttr(),
+                producer.getPadDimensionsAttr());
       // generate objectFifo allocation info
       builder.setInsertionPoint(&device.getBody()->back());
 
@@ -1464,7 +1478,8 @@ struct AIEObjectFifoStatefulTransformPass
         BDDimLayoutArrayAttr consumerDims =
             consumer.getDimensionsFromStreamPerConsumer()[0];
         createDMA(device, builder, consumer, consumerChan.direction,
-                  consumerChan.channel, 1, consumerDims);
+                  consumerChan.channel, 1, consumerDims,
+                  consumer.getPadDimensionsAttr());
         // generate objectFifo allocation info
         builder.setInsertionPoint(&device.getBody()->back());
 
diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp
index f56f1cee3e..c064ad6702 100644
--- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp
@@ -129,7 +129,7 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase<AIECtrlPacketToDmaPass> {
                   SmallVector<Value>{}, SmallVector<Value>{},
                   SmallVector<Value>{}, ArrayRef(staticOffsets),
                   ArrayRef(staticSizes), ArrayRef(staticStrides),
-                  controllerIdPkt, metadata, 0, true);
+                  controllerIdPkt, metadata, 0, true, 0, 0, 0, 0, 0, 0);
 
               auto shimRow = builder.getI32IntegerAttr(0);
               auto shimCol = builder.getI32IntegerAttr(col);
diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp
index 8c889553da..ac5a2fa822 100644
--- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp
@@ -216,7 +216,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
   }
 
   LogicalResult rewriteSingleBD(OpBuilder &builder, Block &block,
-                                AIE::TileOp &tile) {
+                                AIE::TileOp &tile,
+                                AIE::DMAChannelDir channelDir) {
     AIE::DMABDOp bd_op = getBdForBlock(block);
     const auto &target_model = AIE::getTargetModel(bd_op);
     MemRefType buffer_type = bd_op.getBuffer().getType();
@@ -237,12 +238,21 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
              << len << " bytes falls below minimum hardware transfer unit of "
              << (addr_granularity / 8) << " bytes.";
     }
-
     // Process strides/wraps
     std::optional<llvm::ArrayRef<AIE::BDDimLayoutAttr>> dims =
         bd_op.getDimensions();
     llvm::SmallVector<int64_t, 4> sizes = llvm::SmallVector<int64_t, 4>(4, 0);
     llvm::SmallVector<int64_t, 4> strides = llvm::SmallVector<int64_t, 4>(4, 0);
+    // Padding. padBefore/padAfter hold one pad count per dimension; entries
+    // 0..2 are passed on as the d*_zero_before / d*_zero_after values of the
+    // BD write created further down. The sized constructor already fills both
+    // vectors with zeros, so no extra std::fill is needed; they are only
+    // overwritten when the BD carries pad dimensions.
+    std::optional<llvm::ArrayRef<AIE::BDPadLayoutAttr>> padDims =
+        bd_op.getPadDimensions();
+    llvm::SmallVector<int64_t, 4> padBefore(4, 0);
+    llvm::SmallVector<int64_t, 4> padAfter(4, 0);
+
     if (dims && dims->size() > 0) {
       llvm::SmallVector<int64_t, 4> input_sizes =
           llvm::SmallVector<int64_t, 4>(4, 1);
@@ -260,6 +270,23 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
         input_sizes[i] = (*dims)[j].getSize();
         input_strides[i] = (*dims)[j].getStride();
       }
+
+      if (target_model.isMemTile(tile.getCol(), tile.getRow()) &&
+          channelDir == AIE::DMAChannelDir::MM2S) {
+        if (padDims && (padDims->size() > dims->size()))
+          return bd_op->emitOpError()
+                 << "Mismatch number of dimensions between padding(s)"
+                 << " and wrap(s) and stride(s).";
+        else if (padDims)
+          for (size_t i = 0; i < padDims->size(); i++) {
+            int j = padDims->size() - i - 1;
+            padBefore[i] = (*padDims)[j].getConstPadBefore();
+            padAfter[i] = (*padDims)[j].getConstPadAfter();
+          }
+      } else if (padDims) {
+        return bd_op->emitOpError()
+               << "supports padding only for MM2S direction on MemTiles.";
+      }
       getHardwareStridesWraps(target_model, buffer_type, input_sizes,
                               input_strides, sizes, strides);
       if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(),
@@ -290,8 +317,16 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
                             "transfer length, as this is the BD repeat count.";
         return failure();
       }
+    } else if (padDims) {
+      // Padding given without dims: report the reason that actually applies.
+      if (!target_model.isMemTile(tile.getCol(), tile.getRow()))
+        return bd_op->emitOpError() << "Padding is supported only on MemTiles.";
+      if (channelDir != AIE::DMAChannelDir::MM2S)
+        return bd_op->emitOpError()
+               << "supports padding only for MM2S direction on MemTiles.";
+      return bd_op->emitOpError() << "Padding requires n-d data layouts "
+                                  << "expressed as wrap(s) and stride(s).";
     }
-
     // find next BD ID, if any
     uint32_t use_next_bd = 0;
     uint32_t next_bd_id = 0;
@@ -316,7 +351,10 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
         /*valid_bd=*/1,
         /* TODO: Locks */
         /*lock_rel_val=*/0, /*lock_rel_id=*/0, /*lock_acq_enable=*/0,
-        /*lock_acq_val=*/0, /*lock_ackq_id=*/0);
+        /*lock_acq_val=*/0, /*lock_ackq_id=*/0, /*d0_zero_before=*/padBefore[0],
+        /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2],
+        /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1],
+        /*d2_zero_after=*/padAfter[2]);
 
     return setAddressForSingleBD(builder, bd_op, tile);
   }
@@ -392,13 +430,15 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
       return failure();
     }
 
+    auto channelDir = op.getDirection();
+
     // Lower all BDs
     for (auto it = body.begin(); it != body.end(); ++it) {
       Block &block = *it;
       if (shouldSkipBlock(block)) {
         continue;
       }
-      if (failed(rewriteSingleBD(builder, block, tile))) {
+      if (failed(rewriteSingleBD(builder, block, tile, channelDir))) {
         return failure();
       }
     }
diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
index ca6f7f1a69..54717f2935 100644
--- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -338,6 +338,12 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     auto lock_acq_enable = zero;
     auto lock_acq_val = zero;
     auto lock_acq_id = zero;
+    auto d0_zero_before = zero;
+    auto d1_zero_before = zero;
+    auto d2_zero_before = zero;
+    auto d0_zero_after = zero;
+    auto d1_zero_after = zero;
+    auto d2_zero_after = zero;
 
     auto issue_token = BoolAttr::get(ctx, false);
     auto repeat_count = zero;
@@ -357,6 +363,9 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     // column
     column = IntegerAttr::get(i32ty, col);
 
+    // row
+    row = IntegerAttr::get(i32ty, 0);
+
     // arg_idx
     AIEX::RuntimeSequenceOp seq_op =
         op->getParentOfType<AIEX::RuntimeSequenceOp>();
@@ -447,6 +456,24 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
 
     // lock_acq_id
 
+    // d0_zero_before
+    d0_zero_before = IntegerAttr::get(i32ty, op.getD0ZeroBefore());
+
+    // d1_zero_before
+    d1_zero_before = IntegerAttr::get(i32ty, op.getD1ZeroBefore());
+
+    // d2_zero_before
+    d2_zero_before = IntegerAttr::get(i32ty, op.getD2ZeroBefore());
+
+    // d0_zero_after
+    d0_zero_after = IntegerAttr::get(i32ty, op.getD0ZeroAfter());
+
+    // d1_zero_after
+    d1_zero_after = IntegerAttr::get(i32ty, op.getD1ZeroAfter());
+
+    // d2_zero_after
+    d2_zero_after = IntegerAttr::get(i32ty, op.getD2ZeroAfter());
+
     // Set the issue_token
     issue_token = BoolAttr::get(ctx, op.getIssueToken());
     // Earlier, all S2MM channels were implicitly assumed to issue a token.
@@ -454,12 +481,18 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     if (!isMM2S)
       issue_token = BoolAttr::get(ctx, true);
 
+    // TODO: Need to add a check to only allow zero padding on MM2S channel of
+    // MemTile As of now, run time MemTile DMA configuration is supported only
+    // from BD level, not at NpuDmaMemcpyNdOp.
+
     rewriter.create<NpuWriteBdOp>(
         op->getLoc(), column, bd_id, buffer_length, buffer_offset,
         enable_packet, out_of_order_id, packet_id, packet_type, d0_size,
         d0_stride, d1_size, d1_stride, d2_stride, iteration_current,
         iteration_size, iteration_stride, next_bd, row, use_next_bd, valid_bd,
-        lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id);
+        lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id,
+        d0_zero_before, d1_zero_before, d2_zero_before, d0_zero_after,
+        d1_zero_after, d2_zero_after);
 
     uint64_t addr = getBufferDescriptorAddressRegisterAddress(
         targetModel, op.getId(), col, 0);
@@ -581,6 +614,12 @@ struct WriteBdToBlockWritePattern : OpConversionPattern<NpuWriteBdOp> {
       words[7] |= (op.getLockAcqEnable() & 0x1) << 12;
       words[7] |= (op.getLockAcqVal() & 0xef) << 5;
       words[7] |= op.getLockAcqId() & 0xf;
+
+      if (op.getD0ZeroBefore() || op.getD1ZeroBefore() ||
+          op.getD2ZeroBefore() || op.getD0ZeroAfter() || op.getD1ZeroAfter() ||
+          op.getD2ZeroAfter()) {
+        op->emitError("Zero padding is only available on MemTile");
+      }
     } else if (tm.isMemTile(op.getColumn(), op.getRow())) {
       bd_addr = (op.getColumn() << tm.getColumnShift()) |
                 (op.getRow() << tm.getRowShift()) | (0xA0000 + bd_id * 0x20);
@@ -592,6 +631,7 @@ struct WriteBdToBlockWritePattern : OpConversionPattern<NpuWriteBdOp> {
       words[0] |= op.getBufferLength() & 0x1ffff;
 
       // DMA_BDX_1
+      words[1] |= (op.getD0ZeroBefore() & 0x3F) << 26;
       words[1] |= (op.getNextBd() & 0x3f) << 20;
       words[1] |= (op.getUseNextBd() & 0x1) << 19;
       words[1] |= op.getBufferOffset() & 0x7ffff;
@@ -602,15 +642,20 @@ struct WriteBdToBlockWritePattern : OpConversionPattern<NpuWriteBdOp> {
 
       // DMA_BDX_3
       // TODO: Secure Access
+      words[3] |= (op.getD1ZeroBefore() & 0x1F) << 27;
       words[3] |= (op.getD1Size() & 0x3ff) << 17;
       words[3] |= op.getD1Stride() & 0x1ffff;
 
       // DMA_BDX_4
       // TODO: D2Size
+      words[4] |= (op.getD2ZeroBefore() & 0xF) << 27;
       words[4] |= op.getD2Stride() & 0x1ffff;
 
       // DMA_BDX_5
       // ToDO: D3Stride
+      words[5] |= (op.getD2ZeroAfter() & 0xF) << 28;
+      words[5] |= (op.getD1ZeroAfter() & 0x1F) << 23;
+      words[5] |= (op.getD0ZeroAfter() & 0x3F) << 17;
 
       // DMA_BDX_6
       words[6] |= (op.getIterationCurrent() & 0x3f) << 23;
diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp
index 9c11596119..3e227310cf 100644
--- a/programming_examples/basic/passthrough_dmas/test.cpp
+++ b/programming_examples/basic/passthrough_dmas/test.cpp
@@ -192,4 +192,4 @@ int main(int argc, const char *argv[]) {
     std::cout << std::endl << "fail." << std::endl << std::endl;
     return 1;
   }
-}
+}
\ No newline at end of file
diff --git a/python/dialects/aie.py b/python/dialects/aie.py
index 4d672b45be..717f430f92 100644
--- a/python/dialects/aie.py
+++ b/python/dialects/aie.py
@@ -107,6 +107,12 @@ def bd_dim_layout(size, stride):
     return Attribute.parse(f"#aie.bd_dim_layout<{size=}, {stride=}>")
 
 
+def bd_pad_layout(const_pad_before, const_pad_after):
+    """Parse a `#aie.bd_pad_layout` attribute from the given pad values."""
+    text = f"#aie.bd_pad_layout<{const_pad_before=}, {const_pad_after=}>"
+    return Attribute.parse(text)
+
+
 @register_attribute_builder("BDDimLayoutArrayAttr")
 def bd_dim_layout_array_attr_builder(tups: List[Attribute | Tuple[int]], context=None):
     if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups):
@@ -125,6 +131,17 @@ def bd_dim_layout_array_array_attr_builder(tup_arrs: List[List[tuple]], context=
     )
 
 
+@register_attribute_builder("BDPadLayoutArrayAttr")
+def bd_pad_layout_array_attr_builder(
+    tups: List[Attribute | Tuple[int]], context=None
+):
+    """Build a `bd_pad_layout_array` attribute from pad tuples or attributes."""
+    if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups):
+        tups = [bd_pad_layout(*t) for t in tups]
+    joined = ", ".join(map(str, tups))
+    return Attribute.parse(f"#aie<bd_pad_layout_array[{joined}]>", context=context)
+
+
 @register_attribute_builder("AIEI1Attr")
 def _i1Attr(x, context):
     return IntegerAttr.get(IntegerType.get_signless(1, context=context), x)
@@ -378,6 +395,7 @@ def __init__(
         dimensionsFromStreamPerConsumer=None,
         via_DMA=None,
         plio=None,
+        padDimensions=None,
         disable_synchronization=None,
     ):
         self.datatype = try_convert_np_type_to_mlir_type(datatype)
@@ -398,6 +416,7 @@ def __init__(
             dimensionsFromStreamPerConsumer=dimensionsFromStreamPerConsumer,
             via_DMA=via_DMA,
             plio=plio,
+            padDimensions=padDimensions,
             disable_synchronization=disable_synchronization,
         )
 
diff --git a/python/utils/trace.py b/python/utils/trace.py
index 668455881e..07a7a3dcb8 100644
--- a/python/utils/trace.py
+++ b/python/utils/trace.py
@@ -527,9 +527,15 @@ def configure_shimtile_tracing_aie2(
         column=int(shim.col),
         d0_size=0,
         d0_stride=0,
+        d0_zero_after=0,
+        d0_zero_before=0,
         d1_size=0,
         d1_stride=0,
+        d1_zero_after=0,
+        d1_zero_before=0,
         d2_stride=0,
+        d2_zero_after=0,
+        d2_zero_before=0,
         iteration_current=0,
         iteration_size=0,
         iteration_stride=0,
diff --git a/test/Targets/NPU/npu_blockwrite_instgen.mlir b/test/Targets/NPU/npu_blockwrite_instgen.mlir
index 4ba0b41342..f38a24d5d0 100644
--- a/test/Targets/NPU/npu_blockwrite_instgen.mlir
+++ b/test/Targets/NPU/npu_blockwrite_instgen.mlir
@@ -46,9 +46,15 @@ module {
                          row = 1 : i32,
                          d0_stride = 5 : i32,
                          d0_size = 6 : i32,
+                         d0_zero_after = 0 : i32,
+                         d0_zero_before = 0 : i32,
                          d1_stride = 7 : i32,
                          d1_size = 8 : i32,
+                         d1_zero_after = 0 : i32,
+                         d1_zero_before = 0 : i32,
                          d2_stride = 9 : i32,
+                         d2_zero_after = 0 : i32,
+                         d2_zero_before = 0 : i32,
                          ddr_id = 10 : i32,
                          iteration_current = 11 : i32,
                          iteration_stride = 12 : i32,
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir
new file mode 100644
index 0000000000..8ff16ccaf1
--- /dev/null
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-12.mlir
@@ -0,0 +1,25 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s 
+       
+module {
+  aie.device(npu1_4col) {
+    %tile_0_1 = aie.tile(0, 1)
+    %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> 
+
+    aiex.runtime_sequence(%arg0: memref<32xi8>) { 
+      %t1 = aiex.dma_configure_task(%tile_0_1, S2MM, 0) {
+      // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}}
+          aie.dma_bd(%buf : memref<32xi8>, 4, 16,
+                     [<size=2, stride=4>, <size=2, stride=8>, <size=4, stride=1>], [<const_pad_before=2, const_pad_after=1>]) {bd_id = 0 : i32}
+          aie.end
+      }
+    }
+  }
+}
+
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir
new file mode 100644
index 0000000000..d0291b038f
--- /dev/null
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-13.mlir
@@ -0,0 +1,26 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s 
+       
+module {
+  aie.device(npu1_4col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+    %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> 
+
+    aiex.runtime_sequence(%arg0: memref<32xi8>) {
+      %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) {
+      // expected-error@+1 {{supports padding only for MM2S direction on MemTiles.}} 
+          aie.dma_bd(%buf : memref<32xi8>, 4, 16,
+                    [<size=2, stride=4>, <size=2, stride=8>, <size=4, stride=1>], [<const_pad_before=2, const_pad_after=1>]) {bd_id = 0 : i32}
+          aie.end
+      }
+    }
+  }
+}
+
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir
new file mode 100644
index 0000000000..466c73b929
--- /dev/null
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-14.mlir
@@ -0,0 +1,26 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s 
+       
+module {
+  aie.device(npu1_4col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_1 = aie.tile(0, 1)
+    %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> 
+
+    aiex.runtime_sequence(%arg0: memref<32xi8>) {
+      %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) {
+          // expected-error@+1 {{Padding requires n-d data layouts expressed as wrap(s) and stride(s).}} 
+          aie.dma_bd(%buf : memref<32xi8>, 4, 16,
+                     [], [<const_pad_before=2, const_pad_after=1>]) {bd_id = 0 : i32}
+          aie.end
+      }
+    }
+  }
+}
+
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir
new file mode 100644
index 0000000000..45f95e0056
--- /dev/null
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-15.mlir
@@ -0,0 +1,27 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s 
+       
+module {
+  aie.device(npu1_4col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_1 = aie.tile(0, 1)
+    %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> 
+
+    aiex.runtime_sequence(%arg0: memref<32xi8>) {
+      %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) {
+      // expected-error@+1 {{Mismatch number of dimensions between padding(s) and wrap(s) and stride(s).}} 
+          aie.dma_bd(%buf : memref<32xi8>, 4, 16,
+                     [<size=2, stride=4>], [<const_pad_before=2, const_pad_after=1>, <const_pad_before=1, const_pad_after=1>]) 
+                     {bd_id = 0 : i32}
+          aie.end
+      }
+    }
+  }
+}
+
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir
new file mode 100644
index 0000000000..3e58b8a5af
--- /dev/null
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/bad-16.mlir
@@ -0,0 +1,26 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+// RUN: aie-opt --verify-diagnostics --aie-dma-tasks-to-npu %s 
+       
+module {
+  aie.device(npu1_4col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+    %buf = aie.buffer(%tile_0_2) { address = 0xBEEF : i32 } : memref<32xi8> 
+
+    aiex.runtime_sequence(%arg0: memref<32xi8>) {
+      %t1 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) {
+      // expected-error@+1 {{Padding is supported only on MemTiles.}} 
+          aie.dma_bd(%buf : memref<32xi8>, 4, 16,
+                    [], [<const_pad_before=2, const_pad_after=1>]) {bd_id = 0 : i32}
+          aie.end
+      }
+    }
+  }
+}
+
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir
index 2ad275b804..61601b91b3 100644
--- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-1.mlir
@@ -17,13 +17,13 @@ module {
     %tile_2_0 = aie.tile(2, 0)
 
     aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) {
-      // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32}
       %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) {
         aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32}
         aie.end
       } {issue_token = true}
-      // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} 
       %t2 = aiex.dma_configure_task(%tile_2_0, S2MM, 1) {
         aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32}
@@ -40,5 +40,4 @@ module {
       aiex.dma_await_task(%t2)
     }
   }
-}
-
+}
\ No newline at end of file
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir
index b57cbc81bd..286ad32f52 100644
--- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-2.mlir
@@ -16,11 +16,11 @@ module {
     %tile_0_2 = aie.tile(0, 2)
 
     aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) {
-      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 1 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32}
-      // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 10 : i32, buffer_offset = 8 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 2 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 1 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.address_patch {addr = 118820 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32}
-      // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 5 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.address_patch {addr = 118852 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32}
       %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) {
           aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 0 : i32}
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir
index 191f1511ee..798201879e 100644
--- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-3.mlir
@@ -16,7 +16,7 @@ module {
     %tile_0_2 = aie.tile(0, 2)
 
     aiex.runtime_sequence(%arg0: memref<32xi8>) {
-      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 4 : i32}
       %t1 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) {
           aie.dma_bd(%arg0 : memref<32xi8>, 4, 16,
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir
index 22df05bca5..143feb9e1b 100644
--- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-4.mlir
@@ -19,7 +19,7 @@ module {
     %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> 
 
     aiex.runtime_sequence(%arg0: memref<32xi8>) {
-      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32}
       %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) {
           aie.dma_bd(%buf : memref<32xi8>, 4, 16,
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir
index 5a6519a4ee..e026d9e829 100644
--- a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-6.mlir
@@ -16,14 +16,14 @@ module {
     aie.shim_dma_allocation @alloc1 (S2MM, 1, 2)
 
     aiex.runtime_sequence(%arg0: memref<8xi16>, %arg1: memref<10xi32>) {
-      // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 4 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       // CHECK: aiex.npu.address_patch {addr = 119012 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32}
       %t1 = aiex.dma_configure_task_for @alloc0 {
         aie.dma_bd(%arg0 : memref<8xi16>, 0, 8) {bd_id = 7 : i32}
         aie.end
       } {issue_token = true}
-      // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
-      // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} 
+      // CHECK: aiex.npu.writebd {bd_id = 8 : i32, buffer_length = 10 : i32, buffer_offset = 0 : i32, column = 2 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.address_patch {addr = 67227908 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32}
       %t2 = aiex.dma_configure_task_for @alloc1 {
         aie.dma_bd(%arg1 : memref<10xi32>, 0, 10) {bd_id = 8 : i32}
         aie.end
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir
new file mode 100644
index 0000000000..a6b9fa6bbb
--- /dev/null
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-7.mlir
@@ -0,0 +1,28 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s
+
+module {
+  aie.device(npu1_4col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_1 = aie.tile(0, 1)
+    %tile_0_2 = aie.tile(0, 2)
+    %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> 
+
+    aiex.runtime_sequence(%arg0: memref<32xi8>) {
+      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 2 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32}
+      %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) {
+          aie.dma_bd(%buf : memref<32xi8>, 4, 16,
+                     [<size=2, stride=4>, <size=2, stride=8>, <size=4, stride=1>], [<const_pad_before=2, const_pad_after=1>]) {bd_id = 0 : i32}
+          aie.end
+      }
+    }
+  }
+}
+
diff --git a/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir
new file mode 100644
index 0000000000..6baa6be36a
--- /dev/null
+++ b/test/bd-chains-and-dma-tasks/dma-tasks-to-npu/good-8.mlir
@@ -0,0 +1,28 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+// RUN: aie-opt --aie-dma-tasks-to-npu %s | FileCheck %s
+
+module {
+  aie.device(npu1_4col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_1 = aie.tile(0, 1)
+    %tile_0_2 = aie.tile(0, 2)
+    %buf = aie.buffer(%tile_0_1) { address = 0xBEEF : i32 } : memref<32xi8> 
+
+    aiex.runtime_sequence(%arg0: memref<32xi8>) {
+      // CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 4 : i32, buffer_offset = 4 : i32, column = 0 : i32, d0_size = 1 : i32, d0_stride = 0 : i32, d0_zero_after = 1 : i32, d0_zero_before = 1 : i32, d1_size = 2 : i32, d1_stride = 1 : i32, d1_zero_after = 2 : i32, d1_zero_before = 2 : i32, d2_stride = 0 : i32, d2_zero_after = 1 : i32, d2_zero_before = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 1 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      // CHECK: aiex.npu.write32 {address = 1167364 : ui32, value = 48879 : ui32}
+      %t1 = aiex.dma_configure_task(%tile_0_1, MM2S, 0) {
+          aie.dma_bd(%buf : memref<32xi8>, 4, 16,
+                     [<size=2, stride=4>, <size=2, stride=8>, <size=4, stride=1>], [<const_pad_before=2, const_pad_after=1>, <const_pad_before=2, const_pad_after=2>, <const_pad_before=1, const_pad_after=1>]) {bd_id = 0 : i32}
+          aie.end
+      }
+    }
+  }
+}
+
diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir
index 383f6ac567..5be345197e 100644
--- a/test/dialect/AIEX/bad_npu_write_bd.mlir
+++ b/test/dialect/AIEX/bad_npu_write_bd.mlir
@@ -15,7 +15,7 @@ module {
   aie.device(npu1_4col) {
     aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{BD ID exceeds the maximum ID.}}
-      aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
     }
   }
 }
@@ -26,7 +26,7 @@ module {
   aie.device(npu1_4col) {
     aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}}
-      aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
     }
   }
 }
@@ -37,7 +37,7 @@ module {
   aie.device(npu1_4col) {
     aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}}
-      aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
     }
   }
 }
@@ -48,7 +48,7 @@ module {
   aie.device(npu1_4col) {
     aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}}
-      aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 15 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
     }
   }
 }
\ No newline at end of file
diff --git a/test/objectFifo-stateful-transform/memtile_padding_test.mlir b/test/objectFifo-stateful-transform/memtile_padding_test.mlir
new file mode 100644
index 0000000000..0fe3b75af7
--- /dev/null
+++ b/test/objectFifo-stateful-transform/memtile_padding_test.mlir
@@ -0,0 +1,186 @@
+//===- memtile_padding_test.mlir --------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//
+
+// Verifies that `padDimensions` on an objectfifo produced by a memtile is
+// lowered, together with `dimensionsToStream`, onto the memtile MM2S
+// `aie.dma_bd` ops; every other BD is expected without layout attributes.
+
+// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
+    // CHECK: %tile_0_0 = aie.tile(0, 0)
+    // CHECK: %tile_0_1 = aie.tile(0, 1)
+    // CHECK: %tile_0_2 = aie.tile(0, 2)
+    // CHECK: %objFifo_out0_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "objFifo_out0_cons_prod_lock"}
+    // CHECK: %objFifo_out0_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_cons_lock"}
+    // CHECK: %objFifo_out1_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_0"} : memref<64x64xi8>
+    // CHECK: %objFifo_out1_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out1_cons_buff_1"} : memref<64x64xi8>
+    // CHECK: %objFifo_out1_cons_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out1_cons_prod_lock"}
+    // CHECK: %objFifo_out1_cons_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_cons_lock"}
+    // CHECK: %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8>
+    // CHECK: %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8>
+    // CHECK: %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"}
+    // CHECK: %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"}
+    // CHECK: %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8>
+    // CHECK: %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8>
+    // CHECK: %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"}
+    // CHECK: %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"}
+    // CHECK: %objFifo_in1_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_0"} : memref<64x64xi8>
+    // CHECK: %objFifo_in1_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in1_buff_1"} : memref<64x64xi8>
+    // CHECK: %objFifo_in1_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in1_prod_lock"}
+    // CHECK: %objFifo_in1_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_lock"}
+    // CHECK: %objFifo_in0_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "objFifo_in0_prod_lock"}
+    // CHECK: %objFifo_in0_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_lock"}
+    // CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
+    // CHECK: aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
+    // CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1)
+    // CHECK: aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0)
+    // CHECK: %core_0_2 = aie.core(%tile_0_2) {
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1)
+    // CHECK:   %c0 = arith.constant 0 : index
+    // CHECK:   %c1 = arith.constant 1 : index
+    // CHECK:   %c64 = arith.constant 64 : index
+    // CHECK:   %c12_i8 = arith.constant 12 : i8
+    // CHECK:   scf.for %arg0 = %c0 to %c64 step %c1 {
+    // CHECK:     scf.for %arg1 = %c0 to %c64 step %c1 {
+    // CHECK:       %0 = memref.load %objFifo_in1_cons_buff_0[%arg0, %arg1] : memref<64x64xi8>
+    // CHECK:       %1 = arith.addi %0, %c12_i8 : i8
+    // CHECK:       memref.store %1, %objFifo_out1_buff_0[%arg0, %arg1] : memref<64x64xi8>
+    // CHECK:     }
+    // CHECK:   }
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1)
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_lock, Release, 1)
+    // CHECK:   aie.end
+    // CHECK: }
+    // CHECK: aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0)
+    // CHECK: aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) {
+    // CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8>
+    // CHECK:   aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64x64xi8>
+    // CHECK:   aiex.npu.dma_wait {symbol = @objFifo_out0}
+    // CHECK: }
+    // CHECK: %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
+    // CHECK:   %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    // CHECK: ^bb1:
+    // CHECK:   aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb2
+    // CHECK: ^bb2:
+    // CHECK:   aie.use_lock(%objFifo_in1_prod_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb1
+    // CHECK: ^bb3:
+    // CHECK:   %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
+    // CHECK: ^bb4:
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_in1_buff_0 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_in1_prod_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb5
+    // CHECK: ^bb5:
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_in1_buff_1 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_in1_prod_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb4
+    // CHECK: ^bb6:
+    // CHECK:   %2 = aie.dma_start(S2MM, 1, ^bb7, ^bb9)
+    // CHECK: ^bb7:
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb8
+    // CHECK: ^bb8:
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_prod_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_cons_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb7
+    // CHECK: ^bb9:
+    // CHECK:   %3 = aie.dma_start(MM2S, 1, ^bb10, ^bb12)
+    // CHECK: ^bb10:
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_out1_cons_buff_0 : memref<64x64xi8>, 0, 4096, [<size = 61, stride = 56>, <size = 56, stride = 1>], [<const_pad_before = 2, const_pad_after = 1>, <const_pad_before = 4, const_pad_after = 4>])
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb11
+    // CHECK: ^bb11:
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_cons_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_out1_cons_buff_1 : memref<64x64xi8>, 0, 4096, [<size = 61, stride = 56>, <size = 56, stride = 1>], [<const_pad_before = 2, const_pad_after = 1>, <const_pad_before = 4, const_pad_after = 4>])
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_prod_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb10
+    // CHECK: ^bb12:
+    // CHECK:   aie.end
+    // CHECK: }
+    // CHECK: aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0)
+    // CHECK: %mem_0_2 = aie.mem(%tile_0_2) {
+    // CHECK:   %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    // CHECK: ^bb1:
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb2
+    // CHECK: ^bb2:
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb1
+    // CHECK: ^bb3:
+    // CHECK:   %1 = aie.dma_start(MM2S, 0, ^bb4, ^bb6)
+    // CHECK: ^bb4:
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_out1_buff_0 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb5
+    // CHECK: ^bb5:
+    // CHECK:   aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1)
+    // CHECK:   aie.dma_bd(%objFifo_out1_buff_1 : memref<64x64xi8>, 0, 4096)
+    // CHECK:   aie.use_lock(%objFifo_out1_prod_lock, Release, 1)
+    // CHECK:   aie.next_bd ^bb4
+    // CHECK: ^bb6:
+    // CHECK:   aie.end
+    // CHECK:   }
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_1 = aie.tile(0, 1)
+    %tile_0_2 = aie.tile(0, 2)
+    aie.objectfifo @objFifo_in0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<56x56xi8>>
+    aie.objectfifo @objFifo_in1(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<64x64xi8>>
+    aie.objectfifo.link [@objFifo_in0] -> [@objFifo_in1] ([] [])
+    aie.objectfifo @objFifo_out1(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xi8>>
+    aie.objectfifo @objFifo_out0(%tile_0_1 dimensionsToStream [<size = 61, stride = 56>, <size = 56, stride = 1>], {%tile_0_0}, 2 : i32) {padDimensions = #aie<bd_pad_layout_array[<const_pad_before = 2, const_pad_after = 1>, <const_pad_before = 4, const_pad_after = 4>]>} : !aie.objectfifo<memref<64x64xi8>>
+    aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ([] [])
+    %core_0_2 = aie.core(%tile_0_2) {
+      %subview = aie.objectfifo.acquire @objFifo_in1 (Consume, 1) : !aie.objectfifosubview<memref<64x64xi8>>
+      %subview1 = aie.objectfifo.acquire @objFifo_out1 (Produce, 1) : !aie.objectfifosubview<memref<64x64xi8>>
+      %elem = aie.objectfifo.subview.access %subview[0] : !aie.objectfifosubview<memref<64x64xi8>> -> memref<64x64xi8>
+      %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<64x64xi8>> -> memref<64x64xi8>
+      // Read each input element, add 12, and write it to the acquired output element.
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c64 = arith.constant 64 : index
+      %c12_i8 = arith.constant 12 : i8
+      scf.for %arg1 = %c0 to %c64 step %c1 {
+        scf.for %arg2 = %c0 to %c64 step %c1 {
+          %0 = memref.load %elem[%arg1, %arg2] : memref<64x64xi8>
+          %1 = arith.addi %0, %c12_i8 : i8
+          memref.store %1, %elem1[%arg1, %arg2] : memref<64x64xi8>
+        }
+      }
+      aie.objectfifo.release @objFifo_in1 (Consume, 1)
+      aie.objectfifo.release @objFifo_out1 (Produce, 1)
+      aie.end
+    }
+
+    aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) {
+      aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8>
+      aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8>
+      aiex.npu.dma_wait { symbol = @objFifo_out0 }
+    }
+  }
+}
\ No newline at end of file
diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py
index c3a02a201a..8039dda9ea 100644
--- a/test/python/trace_utils.py
+++ b/test/python/trace_utils.py
@@ -13,7 +13,7 @@
 # CHECK: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32}
 # CHECK: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32}
 # CHECK: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32}
-# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+# CHECK: aiex.npu.writebd {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d0_zero_after = 0 : i32, d0_zero_before = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d1_zero_after = 0 : i32, d1_zero_before = 0 : i32, d2_stride = 0 : i32, d2_zero_after = 0 : i32, d2_zero_before = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
 # CHECK: aiex.npu.address_patch {addr = 118884 : ui32, arg_idx = 2 : i32, arg_plus = 1024 : i32}
 # CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}
 
diff --git a/test/python/zero_pad.py b/test/python/zero_pad.py
new file mode 100644
index 0000000000..8257a18271
--- /dev/null
+++ b/test/python/zero_pad.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# RUN: %python %s | FileCheck %s
+# CHECK: aie.objectfifo @out(%tile_0_1 dimensionsToStream [<size = 5, stride = 5>, <size = 5, stride = 5>], {%tile_0_0}, 1 : i32) {padDimensions = #aie<bd_pad_layout_array[<const_pad_before = 2, const_pad_after = 0>, <const_pad_before = 3, const_pad_after = 0>]>} : !aie.objectfifo<memref<56xi32>>
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext.scf import _for as range_
+
+# Optional command line: <N> <device: npu | xcvc1902> <column>
+cli_args = sys.argv[1:]
+
+# Target device unless overridden below; "npu" names this same device.
+dev = AIEDevice.npu1_1col
+
+# Element count of the runtime-sequence tensors.
+N = int(cli_args[0]) if len(cli_args) > 0 else 56
+
+if len(cli_args) > 1:
+    device_name = cli_args[1]
+    if device_name == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    elif device_name != "npu":
+        raise ValueError("[ERROR] Device name {} is unknown".format(device_name))
+
+# Column in which both tiles are declared.
+col = int(cli_args[2]) if len(cli_args) > 2 else 0
+
+
+def my_passthrough():
+    """Build the "in" -> "out" object fifo link and print the MLIR module.
+
+    The "out" object fifo carries both dimensionsToStream and padDimensions;
+    the CHECK line at the top of this file matches its printed form.
+    """
+    with mlir_mod_ctx() as ctx:
+
+        @device(dev)
+        def device_body():
+            in_ty = T.memref(25, T.i32())
+            out_ty = T.memref(56, T.i32())
+
+            # Tiles at rows 0 and 1 of the selected column
+            shim_tile = tile(col, 0)
+            mem_tile = tile(col, 1)
+
+            # Object fifos: row 0 -> row 1 ("in"), then row 1 -> row 0 ("out")
+            of_in = object_fifo("in", shim_tile, mem_tile, 1, in_ty)
+            stream_dims = [(5, 5), (5, 5)]
+            pad_dims = [(2, 0), (3, 0)]
+            of_out = object_fifo(
+                "out",
+                mem_tile,
+                shim_tile,
+                1,
+                out_ty,
+                dimensionsToStream=stream_dims,
+                padDimensions=pad_dims,
+            )
+            object_fifo_link(of_in, of_out)
+
+            # Runtime sequence moving N-element tensors to and from the array
+            tensor_ty = T.memref(N, T.i32())
+
+            @runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
+            def sequence(A, B, C):
+                npu_dma_memcpy_nd(
+                    metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True
+                )
+                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                dma_wait(of_in, of_out)
+
+    print(ctx.module)
+
+
+my_passthrough()