Multimodal collater with interleaved image, cross-attention mask padding #1156
@@ -3,7 +3,7 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-from typing import Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union

import torch
import torch.nn.functional as F
@@ -214,6 +214,169 @@ def padded_collate_sft(
    return {"tokens": input_ids.long(), "labels": labels.long()}

# TODO: Generalize this to support any type of encoder input, right now this assumes
# a specific encoder_input signature
def padded_collate_tiled_images_with_cross_attention(
    batch: List[Dict[str, Any]],
    padding_idx: int = 0,
    ignore_idx: int = CROSS_ENTROPY_IGNORE_IDX,
) -> Dict[str, torch.Tensor]:
    """Pad a batch of text sequences, tiled image tensors, aspect ratios,
    and cross attention masks.

    ``batch`` is expected to be a list of sample dicts containing the following:
        - "tokens": List[int] of length text_seq_len, varies across samples
        - "labels": List[int] of length text_seq_len, varies across samples
        - "encoder_input": Dict[str, List[torch.Tensor]]
            - "images": List[torch.Tensor], each with shape (n_tiles, c, h, w)
            - "aspect_ratio": List[torch.Tensor], each with shape (2, ) to indicate h_ratio, w_ratio
        - "encoder_mask": List[Tensor], each with shape (text_seq_len, image_seq_len)

    where c = channel dim, h = height dim, w = width dim. For each element in the batch,
    len(images) == len(encoder_mask) == len(aspect_ratio).

    This collater does the following:
        (1) Pad text sequence and encoder mask to the longest sequence length in the batch
        (2) Pad image tensors in the tile dimension with zeros to the largest number
            of tiles in the batch
        (3) Add empty images of zeros to samples up to max number of images in the batch
        (4) Pad aspect ratios with (1,1) for all added padding images

    Args:
        batch (List[Dict[str, Any]]): A list of sample dicts containing tokens,
            labels, images, encoder_mask, and aspect_ratio.
        padding_idx (int): Padding index for input token ids. Defaults to 0.
        ignore_idx (int): Padding index for labels. Defaults to -100.

    Returns:
        Dict[str, Tensor]: Collated tokens, labels, images, encoder_mask, aspect_ratio tensors.
            - tokens: Tensor of shape (bsz, max_seq_len)
            - labels: Tensor of shape (bsz, max_seq_len)
            - images: Tensor of shape (bsz, max_num_images, max_num_tiles, c, h, w)
            - encoder_mask: Tensor of shape (bsz, max_seq_len, tokens_per_tile * max_num_tiles * max_num_images)
            - aspect_ratio: Tensor of shape (bsz, max_num_images, 2)

    Example:
        >>> image_id = 1
        >>> tokens_per_tile = 5
        >>> c, h, w = 1, 1, 1
        >>> batch = [
        ...     {
        ...         "tokens": [1, 2, 1, 3], "labels": [4, 5, 6, 7],
        ...         "encoder_input": {
        ...             # One image with two tiles, one image with three tiles
        ...             "images": [torch.ones(2, c, h, w), torch.ones(3, c, h, w)],
        ...             "aspect_ratio": [torch.tensor([1, 2]), torch.tensor([1, 3])],
        ...         },
        ...         # Mask is shape (text_seq_len, tokens_per_tile * n_tiles)
        ...         "encoder_mask": [torch.ones(4, 5 * 2), torch.ones(4, 5 * 3)],
        ...     },
        ...     {
        ...         "tokens": [1, 4], "labels": [8, 9],
        ...         "encoder_input": {
        ...             # One image with four tiles
        ...             "images": [torch.ones(4, c, h, w)],
        ...             "aspect_ratio": [torch.tensor([2, 2])],
        ...         },
        ...         # Mask is shape (text_seq_len, tokens_per_tile * n_tiles)
        ...         "encoder_mask": [torch.ones(2, 5 * 4)],
        ...     },
        ... ]
        >>> model_inputs = padded_collate_tiled_images_with_cross_attention(batch=batch)
Review comment: This name doesn't match the current name. I actually prefer padded_collate_vision_text as it's more straightforward, and we can either generalize this function or split and rename as we get more vision_text models in the future.
        >>> print(model_inputs["tokens"])
        tensor([[1, 2, 1, 3],
                [1, 4, 0, 0]])
        >>> print(model_inputs["labels"])
        tensor([[4, 5, 6, 7],
                [8, 9, -100, -100]])
        >>> print(model_inputs["encoder_input"]["images"].shape)  # (bsz, max_num_images, max_num_tiles, c, h, w)
        torch.Size([2, 2, 4, 1, 1, 1])
        >>> print(model_inputs["encoder_mask"].shape)  # (bsz, max_text_seq_len, tokens_per_tile * max_num_tiles * max_num_images)
        torch.Size([2, 4, 40])
        >>> print(model_inputs["encoder_input"]["aspect_ratio"].shape)  # (bsz, max_num_images, 2)
        torch.Size([2, 2, 2])

Review comment: I'm not sure if this should be [2, 2, 2] or [2, 4]. @felipemello1?
Reply: aspect_ratio should be (bsz, max_num_images, 2), and then in the CLIP we reshape: aspect_ratio = aspect_ratio.reshape(bsz_and_n_imgs, 2)
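For concreteness, a minimal sketch of the reshape described in the reply, using the shapes from this example (the variable names here are illustrative and not taken from the encoder code):

import torch
bsz, max_num_images = 2, 2
aspect_ratio = torch.ones(bsz, max_num_images, 2, dtype=torch.long)  # (bsz, max_num_images, 2)
bsz_and_n_imgs = bsz * max_num_images
flat_aspect_ratio = aspect_ratio.reshape(bsz_and_n_imgs, 2)  # (4, 2): one (h_ratio, w_ratio) row per image slot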
        >>> print(model_inputs["encoder_input"]["images"][0, 0, ...])  # Image with two tiles got padded to four
        tensor([[[[1.]]], [[[1.]]], [[[0.]]], [[[0.]]]])
        >>> print(model_inputs["encoder_input"]["images"][0, 1, ...])  # Image with three tiles got padded to four
        tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[0.]]]])
        >>> print(model_inputs["encoder_input"]["images"][1, 0, ...])  # Image with four tiles did not get padded
        tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[1.]]]])
        >>> print(model_inputs["encoder_input"]["images"][1, 1, ...])  # Extra padding image was added to second sample
        tensor([[[[0.]]], [[[0.]]], [[[0.]]], [[[0.]]]])
    """
    # Text tokens can be handled independently by existing collater
    text_only = [
        {"tokens": sample["tokens"], "labels": sample["labels"]} for sample in batch
    ]
    collated_text = padded_collate_sft(text_only, padding_idx, ignore_idx)
    max_seq_len = collated_text["tokens"].shape[-1]
    bsz = len(batch)
    # TODO: Figure out how to make this more efficient or vectorized. Setting
    # max_num_tiles beforehand will save one nested for loop but may incur more
    # memory and compute costs in attention if max_num_tiles > batch_max_num_tiles

Review comment: Didn't think too much about it, but maybe:
Reply: Pre-allocating would definitely simplify the code. I would still need to loop through each individual image though.
Reply: I will leave this as a follow-up in the interest of time.
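A rough sketch of the pre-allocation idea from this thread (illustrative only, not part of this PR; the helper name is hypothetical and it assumes the sample schema described in the docstring above):

# Illustrative pre-allocation sketch (not part of this PR): allocate the padded image
# tensor up front and copy each image's tiles into place; untouched tiles stay zero.
def _preallocate_images(batch, max_num_images, max_num_tiles, c, h, w):
    images = torch.zeros(len(batch), max_num_images, max_num_tiles, c, h, w)
    for i, sample in enumerate(batch):
        for j, image in enumerate(sample["encoder_input"]["images"]):
            images[i, j, : image.shape[0]] = image
    return images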
    # First loop: get max number of tiles in batch
    max_num_tiles = max(
        image.shape[0]
        for sample in batch
        for image in sample["encoder_input"]["images"]
    )
    # Second loop: pad images and masks to max number of tiles, max text seq len in batch
    batch_images = []
    batch_masks = []
    batch_aspect_ratios = []
    for sample in batch:
        sample_images = []
        sample_masks = []
        for image, mask in zip(
            sample["encoder_input"]["images"], sample["encoder_mask"]
        ):
            # Single image in each sample has shape (n_tiles, c, h, w)
            n_tiles = image.shape[0]
            # Single mask in each sample corresponds to a single image and has shape (text_seq_len, image_seq_len)
            # where image_seq_len = n_tiles * tokens_per_tile
            text_seq_len, image_seq_len = mask.shape
            tokens_per_tile = image_seq_len // n_tiles
            padding_tiles = max_num_tiles - n_tiles
            padding_text = max_seq_len - text_seq_len
            # Image should now have shape (max_num_tiles, c, h, w)
            padded_image = F.pad(image, (0, 0, 0, 0, 0, 0, 0, padding_tiles), value=0)
            # Mask should now have shape (max_seq_len, max_image_seq_len), where
            # max_image_seq_len = max_num_tiles * tokens_per_tile
            padded_mask = F.pad(
                mask, (0, padding_tiles * tokens_per_tile, 0, padding_text), value=0
            )
            sample_images.append(padded_image)
            sample_masks.append(padded_mask)
        # Stack multiple images and masks per sample in num_images dimension
        batch_images.append(torch.stack(sample_images))
        batch_masks.append(torch.stack(sample_masks))
        batch_aspect_ratios.append(torch.stack(sample["encoder_input"]["aspect_ratio"]))
    # Finally, pad images, masks, aspect ratios to max number of images in batch
    # (bsz, max_num_images, max_num_tiles, c, h, w)
    collated_images = pad_sequence(batch_images, batch_first=True, padding_value=0)
    # (bsz, max_num_images, max_seq_len, max_image_seq_len)
    collated_masks = pad_sequence(batch_masks, batch_first=True, padding_value=0)
    # (bsz, max_num_images, 2)
    collated_aspect_ratios = pad_sequence(
        batch_aspect_ratios, batch_first=True, padding_value=1
    )

    # Concatenate masks for multiple images across image_seq_len dimension
    concat_masks = collated_masks.view(bsz, max_seq_len, -1)

    return {
        "tokens": collated_text["tokens"],
        "labels": collated_text["labels"],
        "encoder_input": {
            "images": collated_images,
            "aspect_ratio": collated_aspect_ratios,
        },
        "encoder_mask": concat_masks,
    }
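For context, a minimal sketch of how a collater like this is typically wired into a DataLoader (my_multimodal_dataset and the batch size are placeholders, not part of this PR):

# Hypothetical usage sketch; the dataset variable is a placeholder.
from functools import partial
from torch.utils.data import DataLoader

dataloader = DataLoader(
    my_multimodal_dataset,
    batch_size=2,
    collate_fn=partial(
        padded_collate_tiled_images_with_cross_attention,
        padding_idx=0,
        ignore_idx=-100,
    ),
)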

def padded_collate_dpo(
    batch: List[Dict[str, List[int]]],
    padding_idx: int = 0,
Review comment: Can you say somewhere that c, h, w = channel, height, width?