Skip to content

Commit

Permalink
remove unused util
Browse files Browse the repository at this point in the history
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
  • Loading branch information
kylesayrs committed Dec 18, 2024
1 parent 9b61145 commit 2f65d01
Showing 1 changed file with 0 additions and 40 deletions.
40 changes: 0 additions & 40 deletions src/llmcompressor/transformers/finetune/data/data_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import torch
from datasets import Dataset, load_dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers.data import default_data_collator

Expand All @@ -15,7 +14,6 @@
LABELS_MASK_VALUE = -100

__all__ = [
"create_batch_dataloader",
"format_calibration_data",
"get_raw_dataset",
"make_dataset_splits",
Expand All @@ -24,44 +22,6 @@
]


def create_batch_dataloader(
    dataloader: torch.utils.data.DataLoader,
    batch_size: int,
) -> torch.utils.data.DataLoader:
    """
    Create a new dataloader over the dataset of an existing dataloader which
    yields padded batches of ``batch_size`` samples

    Each sample is indexed with the ``"input_ids"`` and ``"attention_mask"``
    keys; any other keys are dropped from the collated batch.

    :param dataloader: dataloader whose dataset and sampler class are reused
    :param batch_size: batch size of new dataloader
    :return: dataloader producing dicts with right-padded ``"input_ids"`` and
        ``"attention_mask"`` tensors
    """
    dataset = dataloader.dataset
    # re-instantiate the sampler type so the new dataloader owns its own sampler
    sampler = dataloader.sampler.__class__(dataset)

    def to_sequence(values) -> torch.Tensor:
        # as_tensor avoids the copy (and the copy-construct warning) when the
        # sample already holds tensors
        sequence = torch.as_tensor(values)

        # drop a leading batch dimension, e.g. (1, seq_len) -> (seq_len,).
        # A 1-D sequence is left untouched: squeezing a single-token sequence
        # of shape (1,) would produce a 0-dim tensor which cannot be padded
        if sequence.dim() > 1:
            sequence = sequence.squeeze(0)

        return sequence

    def pad_sequences(batch):
        # extract input_ids and attention_mask from the batch
        input_ids = [to_sequence(item["input_ids"]) for item in batch]
        masks = [to_sequence(item["attention_mask"]) for item in batch]

        # while 0 is not necessarily the "correct" padding value, the padded
        # input_ids are ignored according to the attention_mask
        pad_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
        pad_masks = pad_sequence(masks, batch_first=True, padding_value=0)

        return {
            "input_ids": pad_input_ids,
            "attention_mask": pad_masks,
        }

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=pad_sequences,
        pin_memory=True,
    )


def format_calibration_data(
tokenized_dataset: Dataset,
num_calibration_samples: Optional[int] = None,
Expand Down

0 comments on commit 2f65d01

Please sign in to comment.