Replace tokenizer with processor #955

Merged
12 commits merged on Dec 17, 2024
12 changes: 7 additions & 5 deletions src/llmcompressor/pytorch/model_load/helpers.py
@@ -9,6 +9,7 @@

 from llmcompressor.core import active_session, create_session, pre_initialize_structure
 from llmcompressor.pytorch.utils import ModuleSparsificationInfo
+from llmcompressor.typing import Processor

 COMPLETED_STAGES_FILENAME = "completed_stages.json"

@@ -92,15 +93,16 @@ def initialize_recipe(model: Module, recipe_path: str):
 def save_model_and_recipe(
     model: Module,
     save_path: str,
-    tokenizer: Optional[Any] = None,
+    processor: Optional[Processor] = None,
     save_safetensors: bool = False,
     save_compressed: bool = False,
 ):
     """
-    Save a model, tokenizer and the currently loaded recipe to file
+    Save a model, processor and the currently loaded recipe to file

     :param model: pytorch model to save
     :param save_path: path to save output to
-    :param tokenizer: model tokenizer to save
+    :param processor: model processor or tokenizer to save
     :param save_safetensors: whether to save as safetensors or pickle (bin)
     :param save_compressed: whether to compress sparse weights on disk
     """
@@ -111,8 +113,8 @@ def save_model_and_recipe(
         save_path, save_compressed=save_compressed, safe_serialization=save_safetensors
     )

-    if tokenizer is not None:
-        tokenizer.save_pretrained(save_path)
+    if processor is not None:
+        processor.save_pretrained(save_path)

     logger.info("Saving output to {}".format(os.path.abspath(save_path)))

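A minimal usage sketch of the widened signature, assuming the model has gone through llm-compressor's session so its save_pretrained accepts save_compressed; the checkpoint name and save path are illustrative. The change relies only on the fact that tokenizers and processors both expose save_pretrained.

from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor.pytorch.model_load.helpers import save_model_and_recipe

# AutoProcessor returns a full processor for multimodal checkpoints and
# falls back to the tokenizer for text-only ones, so one code path serves both
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
processor = AutoProcessor.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

save_model_and_recipe(
    model=model,
    save_path="./compressed-model",
    processor=processor,  # was `tokenizer=...` before this change
    save_safetensors=True,
)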
44 changes: 31 additions & 13 deletions src/llmcompressor/transformers/finetune/data/base.py
@@ -3,14 +3,14 @@
 from compressed_tensors.registry import RegistryMixin
 from datasets import Dataset, IterableDataset
 from loguru import logger
-from transformers import AutoTokenizer

 from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments
 from llmcompressor.transformers.finetune.data.data_helpers import (
     LABELS_MASK_VALUE,
     get_custom_datasets_from_path,
     get_raw_dataset,
 )
+from llmcompressor.typing import Processor


 class TextGenerationDataset(RegistryMixin):
@@ -30,10 +30,10 @@ def __init__(
         text_column: str,
         data_args: DataTrainingArguments,
         split: str,
-        tokenizer: AutoTokenizer,
+        processor: Processor,
     ):
         self.text_column = text_column
-        self.tokenizer = tokenizer
+        self.processor = processor
         self.data_args = data_args
         self.raw_kwargs = data_args.raw_kwargs or {}
         self.split = split
@@ -50,20 +50,38 @@ def __init__(
         else:
             self.padding = False

-        if self.tokenizer:
+        # get tokenizer
+        self.tokenizer = getattr(self.processor, "tokenizer", self.processor)
+
+        if self.tokenizer is not None:
+            # fill in pad token
             if not self.tokenizer.pad_token:
                 self.tokenizer.pad_token = self.tokenizer.eos_token

-        # configure sequence length
-        max_seq_length = data_args.max_seq_length
-        model_max_length = tokenizer.model_max_length if tokenizer else max_seq_length
-        if self.tokenizer and max_seq_length > model_max_length:
-            logger.warning(
-                f"The max_seq_length passed ({max_seq_length}) is larger than "
-                f"the maximum length for the model ({tokenizer.model_max_length}). "
-                f"Using max_seq_length={tokenizer.model_max_length}."
-            )
-        self.max_seq_length = min(data_args.max_seq_length, model_max_length)
+            # configure sequence length
+            max_seq_length = data_args.max_seq_length
+            if data_args.max_seq_length > self.tokenizer.model_max_length:
+                logger.warning(
+                    f"The max_seq_length passed ({max_seq_length}) is larger than "
+                    f"maximum length for model ({self.tokenizer.model_max_length}). "
+                    f"Using max_seq_length={self.tokenizer.model_max_length}."
+                )
+            self.max_seq_length = min(
+                data_args.max_seq_length, self.tokenizer.model_max_length
+            )
+
+            # configure padding
+            self.padding = (
+                False
+                if self.data_args.concatenate_data
+                else "max_length"
+                if self.data_args.pad_to_max_length
+                else False
+            )
+
+        else:
+            self.max_seq_length = None
+            self.padding = False

     def get_raw_dataset(self, cache_dir: Optional[str] = None) -> Dataset:
         """
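The key line in this diff is the getattr fallback: a ProcessorMixin carries its tokenizer as an attribute, while a bare tokenizer stands in for itself. A standalone sketch of the pattern; checkpoint names are illustrative.

from transformers import AutoProcessor, AutoTokenizer


def resolve_tokenizer(processor):
    # ProcessorMixin instances expose their wrapped tokenizer as `.tokenizer`;
    # plain tokenizers lack that attribute, so getattr returns the object itself
    return getattr(processor, "tokenizer", processor)


# text-only checkpoint: the tokenizer resolves to itself
tok = AutoTokenizer.from_pretrained("gpt2")
assert resolve_tokenizer(tok) is tok

# multimodal checkpoint: the wrapped tokenizer is returned
proc = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
assert resolve_tokenizer(proc) is proc.tokenizer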
6 changes: 3 additions & 3 deletions src/llmcompressor/transformers/finetune/data/c4.py
@@ -10,12 +10,12 @@ class C4Dataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "allenai/c4"
         super().__init__(
-            text_column="text", data_args=data_args, split=split, tokenizer=tokenizer
+            text_column="text", data_args=data_args, split=split, processor=processor
         )
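The dataset subclasses below all receive the same mechanical rename, so one construction sketch covers them; it assumes the registry entry point used elsewhere in the finetune code, and the checkpoint name is illustrative.

from transformers import AutoProcessor

from llmcompressor.transformers.finetune.data import TextGenerationDataset
from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments

processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
data_args = DataTrainingArguments(dataset="c4", max_seq_length=512)

# the registry resolves "c4" to C4Dataset; `processor=` replaces `tokenizer=`
dataset = TextGenerationDataset.load_from_registry(
    data_args.dataset,
    data_args=data_args,
    split="train[:1%]",
    processor=processor,
)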
6 changes: 3 additions & 3 deletions src/llmcompressor/transformers/finetune/data/cnn_dailymail.py
@@ -24,18 +24,18 @@ class CNNDailyMailDataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

     SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n"

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "cnn_dailymail"
         data_args.dataset_config_name = "3.0.0"

         super().__init__(
-            text_column="text", data_args=data_args, split=split, tokenizer=tokenizer
+            text_column="text", data_args=data_args, split=split, processor=processor
         )

     def get_raw_dataset(self, cache_dir: Optional[str] = None):
6 changes: 3 additions & 3 deletions src/llmcompressor/transformers/finetune/data/custom.py
@@ -32,17 +32,17 @@ class CustomDataset(TextGenerationDataset):
     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
         Can also be set to None to load all the splits
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset

     """

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         super().__init__(
             text_column=data_args.text_column,
             data_args=data_args,
             split=split,
-            tokenizer=tokenizer,
+            processor=processor,
         )
         self.preprocessing_func = data_args.preprocessing_func
         self.remove_columns = data_args.remove_columns
@@ -24,7 +24,7 @@ class EvolCodeAlpacaDataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

     EVOL_ALPACA_TEMPLATE = (
@@ -34,11 +34,11 @@ class EvolCodeAlpacaDataset(TextGenerationDataset):
         "\n\n### Response:\n"
     )

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "theblackcat102/evol-codealpaca-v1"
         super().__init__(
-            text_column="text", data_args=data_args, split=split, tokenizer=tokenizer
+            text_column="text", data_args=data_args, split=split, processor=processor
         )

     def get_raw_dataset(self, cache_dir: Optional[str] = None):
6 changes: 3 additions & 3 deletions src/llmcompressor/transformers/finetune/data/gsm8k.py
@@ -11,16 +11,16 @@ class GSM8KDataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

     GSM_TEMPLATE = "Question: {question}\nAnswer:"

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "gsm8k"
         super().__init__(
-            text_column="text", data_args=data_args, split=split, tokenizer=tokenizer
+            text_column="text", data_args=data_args, split=split, processor=processor
         )

     def get_raw_dataset(self, cache_dir: Optional[str] = None):
6 changes: 3 additions & 3 deletions src/llmcompressor/transformers/finetune/data/open_platypus.py
@@ -24,7 +24,7 @@ class OpenPlatypusDataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

     ALPACA_TEMPLATE = {
@@ -37,11 +37,11 @@ class OpenPlatypusDataset(TextGenerationDataset):
         "instruction}\n\n### Response:\n",
     }

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "garage-bAInd/Open-Platypus"
         super().__init__(
-            text_column="text", data_args=data_args, split=split, tokenizer=tokenizer
+            text_column="text", data_args=data_args, split=split, processor=processor
         )

     def get_raw_dataset(self, cache_dir: Optional[str] = None):
6 changes: 3 additions & 3 deletions src/llmcompressor/transformers/finetune/data/ptb.py
@@ -10,15 +10,15 @@ class PtbDataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "ptb_text_only"
         super().__init__(
             text_column="sentence",
             data_args=data_args,
             split=split,
-            tokenizer=tokenizer,
+            processor=processor,
         )
10 changes: 6 additions & 4 deletions src/llmcompressor/transformers/finetune/data/ultrachat_200k.py
@@ -24,7 +24,7 @@ class UltraChatDataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

     DEFAULT_CHAT_TEMPLATE = (
@@ -40,7 +40,7 @@ class UltraChatDataset(TextGenerationDataset):
         "{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
     )

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         data_args = deepcopy(data_args)
         data_args.dataset = "HuggingFaceH4/ultrachat_200k"

@@ -51,13 +51,15 @@ def __init__(self, data_args, split, tokenizer):
             text_column="messages",
             data_args=data_args,
             split=split,
-            tokenizer=tokenizer,
+            processor=processor,
         )

         if (
             not hasattr(self.tokenizer, "chat_template")
             or self.tokenizer.chat_template is None
         ):
+            # note that since tokenizer is a member of processor,
+            # this change affects processor.apply_chat_template
             self.tokenizer.chat_template = self.DEFAULT_CHAT_TEMPLATE

     def get_raw_dataset(self, cache_dir: Optional[str] = None):
@@ -75,7 +77,7 @@ def restructure_fn(sample):
             if sample["messages"][0]["role"] != "system":
                 sample["messages"].insert(0, {"role": "system", "content": ""})

-            sample["messages"] = self.tokenizer.apply_chat_template(
+            sample["messages"] = self.processor.apply_chat_template(
                 sample["messages"], tokenize=False, add_generation_prompt=False
            )
             return sample
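Because a text-only AutoProcessor resolves to the tokenizer itself, routing the call through self.processor keeps the rendered text identical for existing flows; a quick standalone check, with an illustrative checkpoint name.

from transformers import AutoProcessor

# for a text-only checkpoint AutoProcessor returns the tokenizer, so
# processor.apply_chat_template is the same method the old code called
processor = AutoProcessor.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": "How is the weather today?"},
]

# mirrors the restructure_fn call above: render to text, no generation prompt
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=False
)
print(text)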
6 changes: 3 additions & 3 deletions src/llmcompressor/transformers/finetune/data/wikitext.py
@@ -8,10 +8,10 @@ class WikiTextDataset(TextGenerationDataset):

     :param data_args: configuration settings for dataset loading
     :param split: split from dataset to load, for instance `test` or `train[:5%]`
-    :param tokenizer: tokenizer to use on dataset
+    :param processor: processor or tokenizer to use on dataset
     """

-    def __init__(self, data_args, split, tokenizer):
+    def __init__(self, data_args, split, processor):
         super().__init__(
-            text_column="text", data_args=data_args, split=split, tokenizer=tokenizer
+            text_column="text", data_args=data_args, split=split, processor=processor
         )
6 changes: 6 additions & 0 deletions src/llmcompressor/transformers/finetune/model_args.py
@@ -34,6 +34,12 @@ class ModelArguments:
             "help": "Pretrained tokenizer name or path if not the same as model_name"
         },
     )
+    processor: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Pretrained processor name or path if not the same as model_name"
+        },
+    )
     cache_dir: Optional[str] = field(
         default=None,
         metadata={"help": "Where to store the pretrained data from huggingface.co"},
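End to end, the new field means entrypoints that parse ModelArguments can accept a processor path exactly like the existing tokenizer override; a hedged sketch against the oneshot API, with all argument values illustrative.

from llmcompressor.transformers import oneshot

# `processor` mirrors the existing `tokenizer` override and is only needed
# when the processor lives at a different path than the model
oneshot(
    model="Qwen/Qwen2-VL-2B-Instruct",
    processor="Qwen/Qwen2-VL-2B-Instruct",
    dataset="open_platypus",
    recipe="recipe.yaml",
    output_dir="./compressed-model",
)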