From bf4744a37c25eb8dd75e091b94ab3ef794416731 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 3 Dec 2024 00:16:23 +0000 Subject: [PATCH 1/7] remove sparseml utilities Signed-off-by: Kyle Sayers --- .../compressed_tensors_utils.py | 3 +- .../transformers/utils/helpers.py | 446 +----------------- 2 files changed, 7 insertions(+), 442 deletions(-) diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index 6de89dd8b..88822f69e 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -24,6 +24,7 @@ from llmcompressor.transformers.compression.sparsity_config import ( SparsityConfigMetadata, ) +from llmcompressor.transformers.utils import DEFAULT_RECIPE_NAME from llmcompressor.utils.fsdp.helpers import ( find_and_move_state_dicts_to_cpu, unwrap_and_export_model, @@ -189,7 +190,7 @@ def skip(*args, **kwargs): ) compressor.update_config(save_directory) - recipe_path = os.path.join(save_directory, "recipe.yaml") + recipe_path = os.path.join(save_directory, DEFAULT_RECIPE_NAME) session = active_session() if (recipe_yaml_str := session.get_serialized_recipe()) is not None: diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index 401a454cf..b53705b9b 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -3,75 +3,21 @@ huggingface/transformers flows """ -import inspect import os -from collections import OrderedDict -from contextlib import suppress -from enum import Enum -from pathlib import Path -from typing import Iterable, List, Optional -from typing import OrderedDict as OrderedDictType -from typing import Tuple, Union +from typing import TYPE_CHECKING, Optional -import requests -import torch -import transformers -from huggingface_hub import HUGGINGFACE_CO_URL_HOME, HfFileSystem, hf_hub_download from loguru import logger -from transformers import AutoConfig from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import PaddingStrategy -from llmcompressor.utils.fsdp.context import main_process_first_context +if TYPE_CHECKING: + from llmcompressor.transformers import ModelArguments, TrainingArguments __all__ = [ - "RECIPE_NAME", + "DEFAULT_RECIPE_NAME", "detect_last_checkpoint", - "TaskNames", - "resolve_sequence_length", - "ALL_TASK_NAMES", - "create_fake_dataloader", - "POSSIBLE_TOKENIZER_FILES", - "download_repo_from_huggingface_hub", - "download_model_directory", ] - -class TaskNames(Enum): - mlm = {"masked-language-modeling", "mlm"} - qa = {"question-answering", "qa"} - token_classification = {"token-classification", "ner"} - text_classification = { - "text-classification", - "sentiment-analysis", - "sequence-classification", - "glue", - } - text_generation = {"text-generation"} - - -ALL_TASK_NAMES = list(set.union(*[task_names.value for task_names in TaskNames])) -RECIPE_NAME = "recipe.yaml" - -MANDATORY_DEPLOYMENT_FILES = { - "tokenizer_config.json", - "config.json", -} -OPTIONAL_DEPLOYMENT_FILES = {"tokenizer.json", "tokenizer.model"} -NLG_MANDATORY_DEPLOYMENT_FILES = {"special_tokens_map.json"} -NLG_OPTIONAL_DEPLOYMENT_FILES = { - "vocab.json", - "merges.txt", -} -POSSIBLE_TOKENIZER_FILES = { - "vocab.json", - "merges.txt", - "tokenizer.json", - "tokenizer.model", - "special_tokens_map.json", - "tokenizer_config.json", 
-} -RELEVANT_HF_SUFFIXES = ["json", "md", "bin", "safetensors", "yaml", "yml", "py"] +DEFAULT_RECIPE_NAME = "recipe.yaml" def detect_last_checkpoint( @@ -108,385 +54,3 @@ def detect_last_checkpoint( ) return last_checkpoint - - -def resolve_sequence_length(config: AutoConfig) -> int: - """ - Resolve the sequence length from the config - - :param config: the config to resolve the sequence length from - :return: the sequence length - """ - if hasattr(config, "max_position_embeddings"): - sequence_length = config.max_position_embeddings - - elif hasattr(config, "max_seq_len"): - sequence_length = config.max_seq_len - else: - raise ValueError( - "Could not infer a default sequence length " - "from the HF transformers config. Please specify " - "the sequence length with --sequence_length" - ) - logger.debug( - f"Using default sequence length of {sequence_length} " - "(inferred from HF transformers config) " - ) - return sequence_length - - -def resolve_recipe( - model_path: Union[str, Path], - recipe: Union[str, Path, None] = None, -) -> Union[str, None]: - """ - Resolve the recipe to apply to the model. - :param recipe: the recipe to apply to the model. - It can be one of the following: - - None - This means that we are not either not applying - any recipe and allowing the model to potentially - infer the appropriate pre-existing recipe - from the model_path - - a path to the recipe file - This can be a string or Path object pointing - to a recipe file. If the specified recipe file - is different from the potential pre-existing - recipe for that model (stored in the model_path), - the function will raise an warning - - name of the recipe file (e.g. "recipe.yaml") - Recipe file name specific is assumed to be stored - in the model_path - - a string containing the recipe - Needs to adhere to the SparseML recipe format - - :param model_path: the path to the model to load. - It can be one of the following: - - a path to the model directory - - a path to the model file - - Hugging face model id - - :return: the resolved recipe - """ - - if recipe is None: - return infer_recipe_from_model_path(model_path) - - elif os.path.isfile(recipe): - # recipe is a path to a recipe file - return resolve_recipe_file(recipe, model_path) - - elif os.path.isfile(os.path.join(model_path, recipe)): - # recipe is a name of a recipe file - recipe = os.path.join(model_path, recipe) - return resolve_recipe_file(recipe, model_path) - - elif isinstance(recipe, str): - # recipe is a string containing the recipe - logger.debug( - "Applying the recipe string directly to the model, without " - "checking for a potential existing recipe in the model_path." - ) - return recipe - - logger.info( - "No recipe requested and no default recipe " - f"found in {model_path}. Skipping recipe resolution." - ) - return None - - -def infer_recipe_from_model_path(model_path: Union[str, Path]) -> Optional[str]: - """ - Infer the recipe from the model_path. - :param model_path: the path to the model to load. 
- It can be one of the following: - - a path to the model directory - - a path to the model file - - Hugging face model id - :return the path to the recipe file if found, None otherwise - """ - model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path - - if os.path.isdir(model_path) or os.path.isfile(model_path): - # model_path is a local path to the model directory or model file - # attempting to find the recipe in the model_directory - model_path = ( - os.path.dirname(model_path) if os.path.isfile(model_path) else model_path - ) - recipe = os.path.join(model_path, RECIPE_NAME) - if os.path.isfile(recipe): - logger.info(f"Found recipe in the model_path: {recipe}") - return recipe - logger.debug(f"No recipe found in the model_path: {model_path}") - return None - - recipe = recipe_from_huggingface_model_id(model_path)[0] - - if recipe is None: - logger.info("Failed to infer the recipe from the model_path") - return recipe - - -def recipe_from_huggingface_model_id( - model_path: str, recipe_name: str = RECIPE_NAME -) -> Tuple[Optional[str], bool]: - """ - Attempts to download the recipe from the huggingface model id. - - :param model_path: Assumed to be the huggingface model id. - If it is not, this function will return None. - :param recipe_name: The name of the recipe file to download. - Defaults to RECIPE_NAME. - :return: tuple: - - the path to the recipe file if found, None otherwise - - True if model_path is a valid huggingface model id, False otherwise - """ - model_id = os.path.join(HUGGINGFACE_CO_URL_HOME, model_path) - request = requests.get(model_id) - if not request.status_code == 200: - logger.debug( - "model_path is not a valid huggingface model id. " - "Skipping recipe resolution." - ) - return None, False - - logger.info( - "model_path is a huggingface model id. " - "Attempting to download recipe from " - f"{HUGGINGFACE_CO_URL_HOME}" - ) - try: - recipe = hf_hub_download(repo_id=model_path, filename=recipe_name) - logger.info(f"Found recipe: {recipe_name} for model id: {model_path}.") - except Exception as e: - logger.info( - f"Unable to to find recipe {recipe_name} " - f"for model id: {model_path}: {e}. " - "Skipping recipe resolution." - ) - recipe = None - return recipe, True - - -def resolve_recipe_file( - requested_recipe: Union[str, Path], model_path: Union[str, Path] -) -> Union[str, Path, None]: - """ - Given the requested recipe and the model_path, return the path to the recipe file. - - :param requested_recipe. Is a full path to the recipe file - :param model_path: the path to the model to load. 
- It can be one of the following: - - a path to the model directory - - a path to the model file - - Hugging face model id - :return the path to the recipe file if found, None otherwise - """ - # preprocess arguments so that they are all strings - requested_recipe = ( - requested_recipe.as_posix() - if isinstance(requested_recipe, Path) - else requested_recipe - ) - model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path - model_path = ( - os.path.dirname(model_path) if os.path.isfile(model_path) else model_path - ) - - if not os.path.isdir(model_path): - default_recipe, model_exists = recipe_from_huggingface_model_id(model_path) - if not model_exists: - raise ValueError(f"Unrecognized model_path: {model_path}") - - if not default_recipe == requested_recipe and default_recipe is not None: - logger.warning( - f"Attempting to apply recipe: {requested_recipe} " - f"to the model at: {model_path}, " - f"but the model already has a recipe: {default_recipe}. " - f"Using {requested_recipe} instead." - ) - return requested_recipe - - # pathway for model_path that is a directory - default_recipe = os.path.join(model_path, RECIPE_NAME) - default_recipe_exists = os.path.isfile(default_recipe) - default_and_request_recipes_identical = os.path.samefile( - default_recipe, requested_recipe - ) - - if ( - default_recipe_exists - and requested_recipe - and not default_and_request_recipes_identical - ): - logger.warning( - f"Attempting to apply recipe: {requested_recipe} " - f"to the model located in {model_path}, " - f"but the model already has a recipe stored as {default_recipe}. " - f"Using {requested_recipe} instead." - ) - - elif not default_recipe_exists and requested_recipe: - logger.warning( - f"Attempting to apply {requested_recipe} " - f"to the model located in {model_path}." - "However, it is expected that the model " - f"has its target recipe stored as {default_recipe}." - "Applying any recipe before the target recipe may " - "result in unexpected behavior." - f"Applying {requested_recipe} nevertheless." - ) - - elif default_recipe_exists: - logger.info(f"Using the default recipe: {requested_recipe}") - - return requested_recipe - - -def create_fake_dataloader( - model: torch.nn.Module, - tokenizer: transformers.AutoTokenizer, - num_samples: int, -) -> Tuple[Iterable[OrderedDictType[str, torch.Tensor]], List[str]]: - """ - Creates fake transformers dataloader for the model, based on the model's - forward signature. - - :param model: The model to create the dataloader for - :param tokenizer: The tokenizer to use for the dataloader - :param num_samples: The number of fake samples in the dataloader - :return: The data loader (iterable) and the input names for the model - """ - - forward_args_spec = inspect.getfullargspec(model.__class__.forward) - inputs = tokenizer( - "", return_tensors="pt", padding=PaddingStrategy.MAX_LENGTH.value - ).data - fake_inputs = OrderedDict( - [ - (input_key, inputs[input_key][0].reshape(1, -1)) - for input_key in forward_args_spec.args - if input_key in inputs - ] - ) - data_loader = (fake_inputs for _ in range(num_samples)) - input_names = list(fake_inputs.keys()) - return data_loader, input_names - - -def fetch_recipe_path(target: str): - """ - Fetches the recipe path for the given target. - This method will also download the recipe if it is not - already downloaded. - - Takes care of three scenarios: - 1. target is a local path to a model directory - (looks for recipe.yaml in the directory) - 2. 
target is a HuggingFace stub (downloads and - returns the path to the default recipe) - - :param target: The target to fetch the recipe path for - can be a local path or HuggingFace stub - :return: The path to the recipe for the target - """ - DEFAULT_RECIPE_NAME = "recipe.yaml" - if Path(target).exists(): - # target is a local path - potential_recipe_path = Path(target) / DEFAULT_RECIPE_NAME - return str(potential_recipe_path) if potential_recipe_path.exists() else None - - # Recipe must be downloaded - - recipe_path = None - - # target is a HuggingFace stub - with suppress(Exception): - # suppress any errors if the recipe is not found on HuggingFace - recipe_path = hf_hub_download(repo_id=target, filename=DEFAULT_RECIPE_NAME) - - return recipe_path - - -def download_repo_from_huggingface_hub(repo_id, **kwargs): - """ - Download relevant model files from the Hugging Face Hub - using the huggingface_hub.hf_hub_download function - - Note(s): - - Does not download the entire repo, only the relevant files - for the model, such as the model weights, tokenizer files, etc. - - Does not re-download files that already exist locally, unless - the force_download flag is set to True - - :pre-condition: the repo_id must be a valid Hugging Face Hub repo id - :param repo_id: the repo id to download - :param kwargs: additional keyword arguments to pass to hf_hub_download - """ - hf_filesystem = HfFileSystem() - files = hf_filesystem.ls(repo_id) - - if not files: - raise ValueError(f"Could not find any files in HF repo {repo_id}") - - # All file(s) from hf_filesystem have "name" key - # Extract the file names from the files - relevant_file_names = ( - Path(file["name"]).name - for file in files - if any(file["name"].endswith(suffix) for suffix in RELEVANT_HF_SUFFIXES) - ) - - hub_kwargs_names = ( - "subfolder", - "repo_type", - "revision", - "library_name", - "library_version", - "cache_dir", - "local_dir", - "local_dir_use_symlinks", - "user_agent", - "force_download", - "force_filename", - "proxies", - "etag_timeout", - "resume_download", - "token", - "local_files_only", - "headers", - "legacy_cache_layout", - "endpoint", - ) - hub_kwargs = {name: kwargs[name] for name in hub_kwargs_names if name in kwargs} - - for file_name in relevant_file_names: - last_file = hf_hub_download(repo_id=repo_id, filename=file_name, **hub_kwargs) - - # parent directory of the last file is the model directory - return str(Path(last_file).parent.resolve().absolute()) - - -def download_model_directory(pretrained_model_name_or_path: str, **kwargs): - """ - Download the model directory from the HF hub if the model is not found locally - - :param pretrained_model_name_or_path: the name of or path to the model to load - can be a HuggingFace model stub - :param kwargs: additional keyword arguments to pass to the download function - :return: the path to the downloaded model directory - """ - pretrained_model_path: Path = Path(pretrained_model_name_or_path) - - if pretrained_model_path.exists(): - logger.debug( - "Model directory already exists locally.", - ) - return pretrained_model_name_or_path - - with main_process_first_context(): - logger.debug("Downloading model from HuggingFace Hub.") - return download_repo_from_huggingface_hub( - repo_id=pretrained_model_name_or_path, **kwargs - ) From 7e516c143fea72be5db0fa06e1d2d5bae6ea1cc4 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 3 Dec 2024 00:38:13 +0000 Subject: [PATCH 2/7] use in model_load Signed-off-by: Kyle Sayers --- src/llmcompressor/pytorch/model_load/helpers.py | 
5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index 11a924f1d..180c559af 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -9,6 +9,7 @@ from llmcompressor.core import active_session, create_session, pre_initialize_structure from llmcompressor.pytorch.utils import ModuleSparsificationInfo +from llmcompressor.transformers import DEFAULT_RECIPE_NAME COMPLETED_STAGES_FILENAME = "completed_stages.json" @@ -24,8 +25,6 @@ "save_completed_stages", ] -RECIPE_FILE_NAME = "recipe.yaml" - def log_model_load( model: Module, model_name_or_path: str, model_type: str, delayed_load: bool @@ -116,7 +115,7 @@ def save_model_and_recipe( logger.info("Saving output to {}".format(os.path.abspath(save_path))) - recipe_path = os.path.join(save_path, RECIPE_FILE_NAME) + recipe_path = os.path.join(save_path, DEFAULT_RECIPE_NAME) session = active_session() recipe_yaml_str = session.get_serialized_recipe() with open(recipe_path, "w") as fp: From 9e33641b1ca660d61d1d857a4d4486152184e4df Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 3 Dec 2024 16:39:18 +0000 Subject: [PATCH 3/7] remove use of RECIPE FILE NAME Signed-off-by: Kyle Sayers --- src/llmcompressor/transformers/finetune/session_mixin.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index e3a9c4d84..498ff4a40 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -24,8 +24,9 @@ from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import ( KDModelWrapper, ) -from llmcompressor.pytorch.model_load.helpers import RECIPE_FILE_NAME, get_session_model +from llmcompressor.pytorch.model_load.helpers import get_session_model from llmcompressor.pytorch.utils import ModuleSparsificationInfo +from llmcompressor.transformers import DEFAULT_RECIPE_NAME from llmcompressor.transformers.finetune.callbacks import ( DisableHalfPrecisionCallback, TrainingLoopCallbacks, @@ -495,7 +496,7 @@ def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False): if self.accelerator.is_main_process: # save recipe, will contain modifiers from the model's original recipe as # well as those added from self.recipe - recipe_path = os.path.join(output_dir, RECIPE_FILE_NAME) + recipe_path = os.path.join(output_dir, DEFAULT_RECIPE_NAME) session = active_session() recipe_yaml_str = session.get_serialized_recipe() with open(recipe_path, "w") as fp: From 58c0fba3d75f8e25bd19fd6cbbc8823c2eaeb5c3 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 3 Dec 2024 16:56:00 +0000 Subject: [PATCH 4/7] rename to RECIPE_FILE_NAME, avoid circular import Signed-off-by: Kyle Sayers --- src/llmcompressor/pytorch/model_load/helpers.py | 5 +++-- src/llmcompressor/transformers/finetune/session_mixin.py | 4 ++-- .../transformers/sparsification/compressed_tensors_utils.py | 4 ++-- src/llmcompressor/transformers/utils/helpers.py | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index 180c559af..3db9be173 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -9,7 +9,6 @@ from llmcompressor.core import 
active_session, create_session, pre_initialize_structure from llmcompressor.pytorch.utils import ModuleSparsificationInfo -from llmcompressor.transformers import DEFAULT_RECIPE_NAME COMPLETED_STAGES_FILENAME = "completed_stages.json" @@ -105,6 +104,8 @@ def save_model_and_recipe( :param save_safetensors: whether to save as safetensors or pickle (bin) :param save_compressed: whether to compress sparse weights on disk """ + # avoid circular import + from llmcompressor.transformers.utils.helpers import RECIPE_FILE_NAME model.save_pretrained( save_path, save_compressed=save_compressed, safe_serialization=save_safetensors @@ -115,7 +116,7 @@ def save_model_and_recipe( logger.info("Saving output to {}".format(os.path.abspath(save_path))) - recipe_path = os.path.join(save_path, DEFAULT_RECIPE_NAME) + recipe_path = os.path.join(save_path, RECIPE_FILE_NAME) session = active_session() recipe_yaml_str = session.get_serialized_recipe() with open(recipe_path, "w") as fp: diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index 498ff4a40..b1ac57b95 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -26,7 +26,7 @@ ) from llmcompressor.pytorch.model_load.helpers import get_session_model from llmcompressor.pytorch.utils import ModuleSparsificationInfo -from llmcompressor.transformers import DEFAULT_RECIPE_NAME +from llmcompressor.transformers import RECIPE_FILE_NAME from llmcompressor.transformers.finetune.callbacks import ( DisableHalfPrecisionCallback, TrainingLoopCallbacks, @@ -496,7 +496,7 @@ def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False): if self.accelerator.is_main_process: # save recipe, will contain modifiers from the model's original recipe as # well as those added from self.recipe - recipe_path = os.path.join(output_dir, DEFAULT_RECIPE_NAME) + recipe_path = os.path.join(output_dir, RECIPE_FILE_NAME) session = active_session() recipe_yaml_str = session.get_serialized_recipe() with open(recipe_path, "w") as fp: diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index 88822f69e..759098894 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -24,7 +24,7 @@ from llmcompressor.transformers.compression.sparsity_config import ( SparsityConfigMetadata, ) -from llmcompressor.transformers.utils import DEFAULT_RECIPE_NAME +from llmcompressor.transformers.utils import RECIPE_FILE_NAME from llmcompressor.utils.fsdp.helpers import ( find_and_move_state_dicts_to_cpu, unwrap_and_export_model, @@ -190,7 +190,7 @@ def skip(*args, **kwargs): ) compressor.update_config(save_directory) - recipe_path = os.path.join(save_directory, DEFAULT_RECIPE_NAME) + recipe_path = os.path.join(save_directory, RECIPE_FILE_NAME) session = active_session() if (recipe_yaml_str := session.get_serialized_recipe()) is not None: diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index b53705b9b..a93111a8d 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -13,11 +13,11 @@ from llmcompressor.transformers import ModelArguments, TrainingArguments __all__ = [ - "DEFAULT_RECIPE_NAME", + 
"RECIPE_FILE_NAME", "detect_last_checkpoint", ] -DEFAULT_RECIPE_NAME = "recipe.yaml" +RECIPE_FILE_NAME = "recipe.yaml" def detect_last_checkpoint( From 1180b3417c4884de05b297fa8f5e258c540eebef Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 3 Dec 2024 20:42:11 +0000 Subject: [PATCH 5/7] remove qa ignore Signed-off-by: Kyle Sayers --- src/llmcompressor/transformers/utils/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index a93111a8d..1263bb004 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -21,8 +21,8 @@ def detect_last_checkpoint( - training_args: "TrainingArguments", # noqa 821 - model_args: Optional["ModelArguments"] = None, # noqa 821 + training_args: "TrainingArguments", + model_args: Optional["ModelArguments"] = None, ): last_checkpoint = None if ( From 1aba16dc3ccd705b9e0986ed36fea5cd3d186e81 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Dec 2024 00:36:35 +0000 Subject: [PATCH 6/7] replace tokenizer with processor Signed-off-by: Kyle Sayers --- .../pytorch/model_load/helpers.py | 12 +++-- .../transformers/finetune/data/base.py | 44 +++++++++++----- .../transformers/finetune/data/c4.py | 6 +-- .../finetune/data/cnn_dailymail.py | 6 +-- .../transformers/finetune/data/custom.py | 6 +-- .../finetune/data/evolcodealpaca.py | 6 +-- .../transformers/finetune/data/gsm8k.py | 6 +-- .../finetune/data/open_platypus.py | 6 +-- .../transformers/finetune/data/ptb.py | 6 +-- .../finetune/data/ultrachat_200k.py | 10 ++-- .../transformers/finetune/data/wikitext.py | 6 +-- .../transformers/finetune/model_args.py | 6 +++ .../transformers/finetune/runner.py | 16 +++--- .../transformers/finetune/session_mixin.py | 5 +- .../transformers/finetune/text_generation.py | 52 +++++++++++-------- .../compressed_tensors_utils.py | 5 +- .../sparsification/sparse_model.py | 8 +-- .../utils/preprocessing_functions.py | 7 ++- src/llmcompressor/typing.py | 17 ++++++ src/llmcompressor/utils/fsdp/helpers.py | 7 +-- .../compression/test_quantization.py | 2 +- .../finetune/data/test_dataset_loading.py | 20 +++---- .../finetune/data/test_registry.py | 6 +-- .../transformers/obcq/test_obcq_completion.py | 2 +- tests/testing_utils.py | 4 +- 25 files changed, 165 insertions(+), 106 deletions(-) create mode 100644 src/llmcompressor/typing.py diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index 3db9be173..a9ecb67a7 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -9,6 +9,7 @@ from llmcompressor.core import active_session, create_session, pre_initialize_structure from llmcompressor.pytorch.utils import ModuleSparsificationInfo +from llmcompressor.typing import Processor COMPLETED_STAGES_FILENAME = "completed_stages.json" @@ -92,15 +93,16 @@ def initialize_recipe(model: Module, recipe_path: str): def save_model_and_recipe( model: Module, save_path: str, - tokenizer: Optional[Any] = None, + processor: Optional[Processor] = None, save_safetensors: bool = False, save_compressed: bool = False, ): """ - Save a model, tokenizer and the currently loaded recipe to file + Save a model, processor and the currently loaded recipe to file + :param model: pytorch model to save :param save_path: path to save output to - :param tokenizer: model tokenizer to save + :param processor: model processor or 
tokenizer to save :param save_safetensors: whether to save as safetensors or pickle (bin) :param save_compressed: whether to compress sparse weights on disk """ @@ -111,8 +113,8 @@ def save_model_and_recipe( save_path, save_compressed=save_compressed, safe_serialization=save_safetensors ) - if tokenizer is not None: - tokenizer.save_pretrained(save_path) + if processor is not None: + processor.save_pretrained(save_path) logger.info("Saving output to {}".format(os.path.abspath(save_path))) diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py index d4c3a6222..3b68e0fc1 100644 --- a/src/llmcompressor/transformers/finetune/data/base.py +++ b/src/llmcompressor/transformers/finetune/data/base.py @@ -3,7 +3,6 @@ from compressed_tensors.registry import RegistryMixin from datasets import Dataset, IterableDataset from loguru import logger -from transformers import AutoTokenizer from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments from llmcompressor.transformers.finetune.data.data_helpers import ( @@ -11,6 +10,7 @@ get_custom_datasets_from_path, get_raw_dataset, ) +from llmcompressor.typing import Processor class TextGenerationDataset(RegistryMixin): @@ -30,10 +30,10 @@ def __init__( text_column: str, data_args: DataTrainingArguments, split: str, - tokenizer: AutoTokenizer, + processor: Processor, ): self.text_column = text_column - self.tokenizer = tokenizer + self.processor = processor self.data_args = data_args self.raw_kwargs = data_args.raw_kwargs or {} self.split = split @@ -50,20 +50,38 @@ def __init__( else: self.padding = False - if self.tokenizer: + # get tokenizer + self.tokenizer = getattr(self.processor, "tokenizer", self.processor) + + if self.tokenizer is not None: + # fill in pad token if not self.tokenizer.pad_token: self.tokenizer.pad_token = self.tokenizer.eos_token - # configure sequence length - max_seq_length = data_args.max_seq_length - model_max_length = tokenizer.model_max_length if tokenizer else max_seq_length - if self.tokenizer and max_seq_length > model_max_length: - logger.warning( - f"The max_seq_length passed ({max_seq_length}) is larger than " - f"the maximum length for the model ({tokenizer.model_max_length}). " - f"Using max_seq_length={tokenizer.model_max_length}." + # configure sequence length + max_seq_length = data_args.max_seq_length + if data_args.max_seq_length > self.tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({max_seq_length}) is larger than " + f"maximum length for model ({self.tokenizer.model_max_length}). " + f"Using max_seq_length={self.tokenizer.model_max_length}." 
+ ) + self.max_seq_length = min( + data_args.max_seq_length, self.tokenizer.model_max_length + ) + + # configure padding + self.padding = ( + False + if self.data_args.concatenate_data + else "max_length" + if self.data_args.pad_to_max_length + else False ) - self.max_seq_length = min(data_args.max_seq_length, model_max_length) + + else: + self.max_seq_length = None + self.padding = False def get_raw_dataset(self, cache_dir: Optional[str] = None) -> Dataset: """ diff --git a/src/llmcompressor/transformers/finetune/data/c4.py b/src/llmcompressor/transformers/finetune/data/c4.py index 37eeceae6..91cbc58e8 100644 --- a/src/llmcompressor/transformers/finetune/data/c4.py +++ b/src/llmcompressor/transformers/finetune/data/c4.py @@ -10,12 +10,12 @@ class C4Dataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) data_args.dataset = "allenai/c4" super().__init__( - text_column="text", data_args=data_args, split=split, tokenizer=tokenizer + text_column="text", data_args=data_args, split=split, processor=processor ) diff --git a/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py b/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py index 64755de4a..dcebe7573 100644 --- a/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py +++ b/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py @@ -24,18 +24,18 @@ class CNNDailyMailDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n" - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) data_args.dataset = "cnn_dailymail" data_args.dataset_config_name = "3.0.0" super().__init__( - text_column="text", data_args=data_args, split=split, tokenizer=tokenizer + text_column="text", data_args=data_args, split=split, processor=processor ) def get_raw_dataset(self, cache_dir: Optional[str] = None): diff --git a/src/llmcompressor/transformers/finetune/data/custom.py b/src/llmcompressor/transformers/finetune/data/custom.py index e849594e7..817cb34de 100644 --- a/src/llmcompressor/transformers/finetune/data/custom.py +++ b/src/llmcompressor/transformers/finetune/data/custom.py @@ -32,17 +32,17 @@ class CustomDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` Can also be set to None to load all the splits - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) super().__init__( text_column=data_args.text_column, data_args=data_args, split=split, - tokenizer=tokenizer, + processor=processor, ) self.preprocessing_func = data_args.preprocessing_func self.remove_columns = data_args.remove_columns diff 
--git a/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py b/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py index 9529d3115..66505f117 100644 --- a/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py +++ b/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py @@ -24,7 +24,7 @@ class EvolCodeAlpacaDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ EVOL_ALPACA_TEMPLATE = ( @@ -34,11 +34,11 @@ class EvolCodeAlpacaDataset(TextGenerationDataset): "\n\n### Response:\n" ) - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) data_args.dataset = "theblackcat102/evol-codealpaca-v1" super().__init__( - text_column="text", data_args=data_args, split=split, tokenizer=tokenizer + text_column="text", data_args=data_args, split=split, processor=processor ) def get_raw_dataset(self, cache_dir: Optional[str] = None): diff --git a/src/llmcompressor/transformers/finetune/data/gsm8k.py b/src/llmcompressor/transformers/finetune/data/gsm8k.py index f9a94bcf4..299ae1bb2 100644 --- a/src/llmcompressor/transformers/finetune/data/gsm8k.py +++ b/src/llmcompressor/transformers/finetune/data/gsm8k.py @@ -11,16 +11,16 @@ class GSM8KDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ GSM_TEMPLATE = "Question: {question}\nAnswer:" - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) data_args.dataset = "gsm8k" super().__init__( - text_column="text", data_args=data_args, split=split, tokenizer=tokenizer + text_column="text", data_args=data_args, split=split, processor=processor ) def get_raw_dataset(self, cache_dir: Optional[str] = None): diff --git a/src/llmcompressor/transformers/finetune/data/open_platypus.py b/src/llmcompressor/transformers/finetune/data/open_platypus.py index 55e54cbce..7a17c6fde 100644 --- a/src/llmcompressor/transformers/finetune/data/open_platypus.py +++ b/src/llmcompressor/transformers/finetune/data/open_platypus.py @@ -24,7 +24,7 @@ class OpenPlatypusDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ ALPACA_TEMPLATE = { @@ -37,11 +37,11 @@ class OpenPlatypusDataset(TextGenerationDataset): "instruction}\n\n### Response:\n", } - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) data_args.dataset = "garage-bAInd/Open-Platypus" super().__init__( - text_column="text", data_args=data_args, split=split, tokenizer=tokenizer + text_column="text", data_args=data_args, split=split, processor=processor ) def get_raw_dataset(self, cache_dir: Optional[str] = None): diff --git a/src/llmcompressor/transformers/finetune/data/ptb.py b/src/llmcompressor/transformers/finetune/data/ptb.py index 6f502edaf..8519f023c 100644 --- 
a/src/llmcompressor/transformers/finetune/data/ptb.py +++ b/src/llmcompressor/transformers/finetune/data/ptb.py @@ -10,15 +10,15 @@ class PtbDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) data_args.dataset = "ptb_text_only" super().__init__( text_column="sentence", data_args=data_args, split=split, - tokenizer=tokenizer, + processor=processor, ) diff --git a/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py b/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py index 5b2e66ab5..30607847d 100644 --- a/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py +++ b/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py @@ -24,7 +24,7 @@ class UltraChatDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ DEFAULT_CHAT_TEMPLATE = ( @@ -40,7 +40,7 @@ class UltraChatDataset(TextGenerationDataset): "{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" ) - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): data_args = deepcopy(data_args) data_args.dataset = "HuggingFaceH4/ultrachat_200k" @@ -51,13 +51,15 @@ def __init__(self, data_args, split, tokenizer): text_column="messages", data_args=data_args, split=split, - tokenizer=tokenizer, + processor=processor, ) if ( not hasattr(self.tokenizer, "chat_template") or self.tokenizer.chat_template is None ): + # note that since tokenizer is a member of processor, + # this change affects processor.apply_chat_template self.tokenizer.chat_template = self.DEFAULT_CHAT_TEMPLATE def get_raw_dataset(self, cache_dir: Optional[str] = None): @@ -75,7 +77,7 @@ def restructure_fn(sample): if sample["messages"][0]["role"] != "system": sample["messages"].insert(0, {"role": "system", "content": ""}) - sample["messages"] = self.tokenizer.apply_chat_template( + sample["messages"] = self.processor.apply_chat_template( sample["messages"], tokenize=False, add_generation_prompt=False ) return sample diff --git a/src/llmcompressor/transformers/finetune/data/wikitext.py b/src/llmcompressor/transformers/finetune/data/wikitext.py index 034d58ba2..25280589c 100644 --- a/src/llmcompressor/transformers/finetune/data/wikitext.py +++ b/src/llmcompressor/transformers/finetune/data/wikitext.py @@ -8,10 +8,10 @@ class WikiTextDataset(TextGenerationDataset): :param data_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` - :param tokenizer: tokenizer to use on dataset + :param processor: processor or tokenizer to use on dataset """ - def __init__(self, data_args, split, tokenizer): + def __init__(self, data_args, split, processor): super().__init__( - text_column="text", data_args=data_args, split=split, tokenizer=tokenizer + text_column="text", data_args=data_args, split=split, processor=processor ) diff --git a/src/llmcompressor/transformers/finetune/model_args.py b/src/llmcompressor/transformers/finetune/model_args.py index 
d3d8e974f..c81900ee2 100644 --- a/src/llmcompressor/transformers/finetune/model_args.py +++ b/src/llmcompressor/transformers/finetune/model_args.py @@ -34,6 +34,12 @@ class ModelArguments: "help": "Pretrained tokenizer name or path if not the same as model_name" }, ) + processor: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained processor name or path if not the same as model_name" + }, + ) cache_dir: Optional[str] = field( default=None, metadata={"help": "Where to store the pretrained data from huggingface.co"}, diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index 6344b1a2b..131180199 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -6,7 +6,6 @@ import torch from loguru import logger from torch.utils.data import Dataset -from transformers import AutoTokenizer from llmcompressor.core import active_session from llmcompressor.pytorch.model_load.helpers import ( @@ -24,6 +23,7 @@ ) from llmcompressor.transformers.finetune.model_args import ModelArguments from llmcompressor.transformers.finetune.training_args import TrainingArguments +from llmcompressor.typing import Processor from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_model_and_recipe @@ -38,7 +38,7 @@ class StageRunner: - set_trainer() - train() / evaluate() / predict() - :param model_args: Arguments pertaining to model/config/tokenizer + :param model_args: Arguments pertaining to model/config/processor :param data_args: Arguments pertaining to what data to use for different flows :param training_args: Arguments pertaining to training loop configuration :model: unwrapped model to run flows on @@ -56,11 +56,11 @@ def __init__( self.datasets = {} self.trainer = None - self.tokenizer = None + self.processor = None self.parent_output_dir = self._training_args.output_dir self._output_dir = self._training_args.output_dir - def populate_datasets(self, tokenizer: "AutoTokenizer", add_labels: bool = True): + def populate_datasets(self, processor: Processor, add_labels: bool = True): """ Loads datasets for each flow based on data_args, stores a Dataset for each enabled flow in self.datasets @@ -68,7 +68,7 @@ def populate_datasets(self, tokenizer: "AutoTokenizer", add_labels: bool = True) :param tokenizer: tokenizer to use for dataset tokenization """ if self._data_args.dataset is None: - self.tokenizer = self._model_args.tokenizer + self.processor = self._model_args.processor logger.info( "Running oneshot without calibration data. 
This is expected for " "weight-only and dynamic quantization" @@ -102,7 +102,7 @@ def _get_split_name(inp_str): registry_id, data_args=self._data_args, split=split_str, - tokenizer=tokenizer, + processor=processor, ) dataset = self._data_args.dataset @@ -124,7 +124,7 @@ def _get_split_name(inp_str): do_predict=self._training_args.do_predict, do_oneshot=self._training_args.do_oneshot, ) - self.tokenizer = tokenizer + self.processor = processor def get_dataset_split(self, split_name: str) -> Dataset: """ @@ -266,7 +266,7 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None): save_model_and_recipe( model=self.trainer.model, save_path=self._output_dir, - tokenizer=self.tokenizer, + processor=self.processor, save_safetensors=self._training_args.save_safetensors, save_compressed=self._training_args.save_compressed, ) diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index b1ac57b95..27860aeb4 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -487,8 +487,9 @@ def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False): ) self.save_state() - if self.tokenizer is not None: - self.tokenizer.save_pretrained(output_dir) + processor = getattr(self, "processing_class", self.tokenizer) + if processor is not None: + processor.save_pretrained(output_dir) if not self.recipe: return diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index 85aa6d82c..a6c21fc39 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -24,9 +24,9 @@ from transformers import ( AutoConfig, AutoModelForCausalLM, - AutoTokenizer, - DefaultDataCollator, + AutoProcessor, HfArgumentParser, + PreTrainedModel, set_seed, ) @@ -49,9 +49,10 @@ patch_tied_tensors_bug, ) from llmcompressor.transformers.sparsification.sparse_model import ( - get_shared_tokenizer_src, + get_shared_processor_src, ) from llmcompressor.transformers.utils.helpers import detect_last_checkpoint +from llmcompressor.typing import Processor from llmcompressor.utils.fsdp.helpers import is_fsdp_model @@ -134,6 +135,13 @@ def parse_args(**kwargs): arg_dict[key] = value training_args.recipe_args = arg_dict + # silently assign tokenizer to processor + if model_args.tokenizer: + if model_args.processor: + raise ValueError("Cannot use both a tokenizer and processor") + model_args.processor = model_args.tokenizer + model_args.tokenizer = None + return model_args, data_args, training_args @@ -226,11 +234,13 @@ def initialize_model_from_path( return teacher, model_path, model -def initialize_tokenizer_from_path(model_args, model, teacher): - tokenizer_src = model_args.tokenizer - tokenizer_src = tokenizer_src or get_shared_tokenizer_src(model, teacher) - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_src, +def initialize_processor_from_path( + model_args: ModelArguments, model: PreTrainedModel, teacher: PreTrainedModel +) -> Processor: + processor_src = model_args.processor + processor_src = processor_src or get_shared_processor_src(model, teacher) + processor = AutoProcessor.from_pretrained( + processor_src, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, @@ -238,7 +248,7 @@ def initialize_tokenizer_from_path(model_args, model, teacher): 
trust_remote_code=model_args.trust_remote_code_model, ) - return tokenizer + return processor def main( @@ -299,11 +309,9 @@ def main( # Detecting last checkpoint. last_checkpoint = None teacher = model_args.distill_teacher - model = model_args.model - # Load tokenizer - # distill TODO: support for different tokenizer for teacher? - tokenizer = model_args.tokenizer + # distill TODO: support for different processor for teacher? + model = model_args.model if isinstance(model, str) or isinstance(model, PosixPath): (teacher, _model_path, model) = initialize_model_from_path( model_args, @@ -317,8 +325,9 @@ def main( if teacher is not None: teacher.eval() - if isinstance(tokenizer, str) or tokenizer is None: - tokenizer = initialize_tokenizer_from_path(model_args, model, teacher) + processor = model_args.processor + if isinstance(processor, str) or processor is None: + processor = initialize_processor_from_path(model_args, model, teacher) pre_initialize_structure(model=model) @@ -330,13 +339,12 @@ def main( model_args=model_args, data_args=data_args, training_args=training_args ) add_labels = training_args.do_train or training_args.run_stages - stage_runner.populate_datasets(tokenizer=tokenizer, add_labels=add_labels) + stage_runner.populate_datasets(processor=processor, add_labels=add_labels) train_dataset = stage_runner.get_dataset_split("train") eval_dataset = stage_runner.get_dataset_split("validation") calib_dataset = stage_runner.get_dataset_split("calibration") # Initialize our Trainer - data_collator = DefaultDataCollator() trainer = Trainer( model_init=get_session_model, teacher=teacher, @@ -346,13 +354,13 @@ def main( data_args=data_args, train_dataset=train_dataset or calib_dataset, eval_dataset=eval_dataset, - tokenizer=tokenizer, - data_collator=data_collator, + processing_class=processor, + data_collator=data_args.data_collator, ) # wrap model.save_pretrained if is_fsdp_model(model): - modify_fsdp_model_save_pretrained(trainer, tokenizer) + modify_fsdp_model_save_pretrained(trainer, processor) else: modify_save_pretrained(model) @@ -396,8 +404,8 @@ def main( model.save_pretrained( training_args.output_dir, save_compressed=training_args.save_compressed ) - if tokenizer is not None: - tokenizer.save_pretrained(training_args.output_dir) + if processor is not None: + processor.save_pretrained(training_args.output_dir) # Clean up the CompressionSession before exit if requested if training_args.clear_sparse_session: diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index 759098894..ce4ae7fb2 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -25,6 +25,7 @@ SparsityConfigMetadata, ) from llmcompressor.transformers.utils import RECIPE_FILE_NAME +from llmcompressor.typing import Processor from llmcompressor.utils.fsdp.helpers import ( find_and_move_state_dicts_to_cpu, unwrap_and_export_model, @@ -33,7 +34,7 @@ __all__ = ["modify_save_pretrained", "modify_fsdp_model_save_pretrained"] -def modify_fsdp_model_save_pretrained(trainer, tokenizer): +def modify_fsdp_model_save_pretrained(trainer, processor: Processor): """ Overrides a PreTrainedModel's save_pretrained() method with a wrapped version that supports compression for fsdp model @@ -78,7 +79,7 @@ def save_pretrained_wrapper( model=trainer.model, accelerator=trainer.accelerator, output_dir=save_directory, - 
tokenizer=tokenizer, + processor=processor, ) # only allow the main process move the state # dicts to cpu diff --git a/src/llmcompressor/transformers/sparsification/sparse_model.py b/src/llmcompressor/transformers/sparsification/sparse_model.py index bf09396d7..d7abc323a 100644 --- a/src/llmcompressor/transformers/sparsification/sparse_model.py +++ b/src/llmcompressor/transformers/sparsification/sparse_model.py @@ -7,7 +7,7 @@ __all__ = [ "SparseAutoModelForCausalLM", - "get_shared_tokenizer_src", + "get_shared_processor_src", ] @@ -20,14 +20,14 @@ def from_pretrained(*args, **kwargs): return AutoModelForCausalLM.from_pretrained(*args, **kwargs) -def get_shared_tokenizer_src(student: Module, teacher: Optional[Module]) -> str: +def get_shared_processor_src(student: Module, teacher: Optional[Module]) -> str: """ - Get a tokenizer source used for both student and teacher, assuming + Get a processor/tokenizer source used for both student and teacher, assuming that they could be shared :param student: the student model :param teacher: the teacher model - :return: the source for the tokenizer shared between teacher and model + :return: the source for the processor/tokenizer shared between teacher and model """ if teacher is not None and teacher not in ("disable", "self"): diff --git a/src/llmcompressor/transformers/utils/preprocessing_functions.py b/src/llmcompressor/transformers/utils/preprocessing_functions.py index cadec88f0..6bf6ade42 100644 --- a/src/llmcompressor/transformers/utils/preprocessing_functions.py +++ b/src/llmcompressor/transformers/utils/preprocessing_functions.py @@ -1,14 +1,17 @@ -from typing import Dict +from typing import TYPE_CHECKING, Dict from compressed_tensors.registry import RegistryMixin +if TYPE_CHECKING: + from llmcompressor.transformers.finetune.data.base import TextGenerationDataset + class PreprocessingFunctionRegistry(RegistryMixin): pass @PreprocessingFunctionRegistry.register() -def custom_evolved_codealpaca_dataset(data: Dict): +def custom_evolved_codealpaca_dataset(self: "TextGenerationDataset", data: Dict): PROMPT_DICT = """[Instruction]:\n{instruction}\n\n[Response]:""" data["prompt"] = PROMPT_DICT.format_map(data) data["text"] = data["prompt"] + data["output"] diff --git a/src/llmcompressor/typing.py b/src/llmcompressor/typing.py new file mode 100644 index 000000000..1050f7138 --- /dev/null +++ b/src/llmcompressor/typing.py @@ -0,0 +1,17 @@ +from typing import Union + +from datasets import Dataset, DatasetDict, IterableDataset +from transformers import ( + BaseImageProcessor, + FeatureExtractionMixin, + PreTrainedTokenizer, + ProcessorMixin, +) + +# Tokenizer or Processor. 
Processors do not inherit from a unified base class +Processor = Union[ + PreTrainedTokenizer, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin +] + +# Supported dataset types, IterableDataset is a streamed dataset +DatasetType = Union[Dataset, DatasetDict, IterableDataset] diff --git a/src/llmcompressor/utils/fsdp/helpers.py b/src/llmcompressor/utils/fsdp/helpers.py index 8cc0f5405..3a3248fa5 100644 --- a/src/llmcompressor/utils/fsdp/helpers.py +++ b/src/llmcompressor/utils/fsdp/helpers.py @@ -18,6 +18,7 @@ from llmcompressor.core.state import State from llmcompressor.pytorch.model_load.helpers import save_model_and_recipe +from llmcompressor.typing import Processor from llmcompressor.utils.pytorch import set_layer __all__ = [ @@ -71,7 +72,7 @@ def set_wrapped_model(state: State, wrapped_model: Module): state.model = wrapped_model -def unwrap_and_export_model(model, accelerator, output_dir, tokenizer): +def unwrap_and_export_model(model, accelerator, output_dir: str, processor: Processor): """ Recursively unwraps an FSDP model, then saves the unwrapped model and the currently active recipe to disk @@ -79,7 +80,7 @@ def unwrap_and_export_model(model, accelerator, output_dir, tokenizer): :param model: model to unwrap :param accelerator: Accelerator instance used to perform unwrapping :param output_dir: where to save output model - :param tokenizer: tokenizer used by the model + :param processor: processor used by the model """ full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) with FullyShardedDataParallel.state_dict_type( @@ -95,7 +96,7 @@ def unwrap_and_export_model(model, accelerator, output_dir, tokenizer): save_model_and_recipe( model=unwrapped_model, save_path=output_dir, - tokenizer=tokenizer, + processor=processor, ) diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py index 2dd1249d6..4a37f138d 100644 --- a/tests/llmcompressor/transformers/compression/test_quantization.py +++ b/tests/llmcompressor/transformers/compression/test_quantization.py @@ -133,7 +133,7 @@ def _get_dataloader(self, data_args, tokenizer): data_args.dataset, data_args=data_args, split="train_gen[:5%]", - tokenizer=tokenizer, + processor=tokenizer, ) calib_dataset = dataset_manager.tokenize_and_process( dataset_manager.get_raw_dataset() diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py index 3415858af..cbedd5b9d 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py @@ -29,7 +29,7 @@ def test_concatenation_tokenization(self): self.data_args.dataset, data_args=self.data_args, split="train[:5%]", - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) raw_dataset = wiki_manager.get_raw_dataset() self.assertGreater(len(raw_dataset), 0) @@ -61,7 +61,7 @@ def test_no_padding_tokenization(self): self.data_args.dataset, data_args=self.data_args, split="train[5%:10%]", - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) raw_dataset = op_manager.get_raw_dataset() self.assertGreater(len(raw_dataset), 0) @@ -96,7 +96,7 @@ def test_max_seq_len_clipped(self): self.data_args.dataset, data_args=self.data_args, split="train[80%:]", - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) self.assertEqual( @@ 
-125,7 +125,7 @@ def test_dataset_kwargs_and_percentages(self): self.data_args.dataset, data_args=self.data_args, split="train[5%:10%]", - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) raw_dataset_a = c4_manager_a.get_raw_dataset() @@ -133,7 +133,7 @@ def test_dataset_kwargs_and_percentages(self): self.data_args.dataset, data_args=self.data_args, split="train[5%:15%]", - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) raw_dataset_b = c4_manager_b.get_raw_dataset() @@ -164,7 +164,7 @@ def test_datasets(self, dataset_key, dataset_config, split, do_concat): data_args.dataset, data_args=data_args, split=split, - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) raw_dataset = manager.get_raw_dataset() self.assertGreater(len(raw_dataset), 0) @@ -204,7 +204,7 @@ def test_evol(self): self.data_args.dataset, data_args=self.data_args, split="train[:2%]", - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) raw_dataset = evol_manager.get_raw_dataset() self.assertGreater(len(raw_dataset), 0) @@ -238,7 +238,7 @@ def test_stream_loading(self): self.data_args.dataset, data_args=self.data_args, split="train", - tokenizer=self.tiny_llama_tokenizer, + processor=self.tiny_llama_tokenizer, ) raw_dataset = manager.get_raw_dataset() @@ -276,7 +276,7 @@ def test_split_loading(self, split_def): stage_runner = StageRunner( model_args=model_args, data_args=data_args, training_args=training_args ) - stage_runner.populate_datasets(tokenizer=self.tiny_llama_tokenizer) + stage_runner.populate_datasets(processor=self.tiny_llama_tokenizer) train_dataset = stage_runner.get_dataset_split("train") assert train_dataset is not None @@ -320,7 +320,7 @@ def preprocess(sample): ), training_args=TrainingArguments(do_oneshot=True), ) - stage_runner.populate_datasets(tokenizer=None) + stage_runner.populate_datasets(processor=None) calib_dataset = stage_runner.get_dataset_split("calibration") self.assertEqual(len(calib_dataset), self.num_calib_samples) data_cols = calib_dataset.column_names diff --git a/tests/llmcompressor/transformers/finetune/data/test_registry.py b/tests/llmcompressor/transformers/finetune/data/test_registry.py index e4c804c07..3350d0a79 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_registry.py +++ b/tests/llmcompressor/transformers/finetune/data/test_registry.py @@ -16,7 +16,7 @@ def test_c4_initializes(tiny_llama_tokenizer): data_args.dataset, data_args=data_args, split=None, - tokenizer=tiny_llama_tokenizer, + processor=tiny_llama_tokenizer, ) assert isinstance(c4_manager, TextGenerationDataset) assert isinstance(c4_manager, C4Dataset) @@ -34,7 +34,7 @@ def test_wikitext_initializes(tiny_llama_tokenizer): data_args.dataset, data_args=data_args, split=None, - tokenizer=tiny_llama_tokenizer, + processor=tiny_llama_tokenizer, ) assert isinstance(wiki_manager, TextGenerationDataset) assert isinstance(wiki_manager, WikiTextDataset) @@ -50,7 +50,7 @@ def test_open_platypus_initializes(tiny_llama_tokenizer): data_args.dataset, data_args=data_args, split=None, - tokenizer=tiny_llama_tokenizer, + processor=tiny_llama_tokenizer, ) assert isinstance(op_manager, TextGenerationDataset) assert isinstance(op_manager, OpenPlatypusDataset) diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py index 03517de07..096c8df94 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py +++ 
b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py @@ -37,7 +37,7 @@ def labeled_dataloader(self, dataset_name, model_name): data_args.dataset, data_args=data_args, split="train", - tokenizer=tokenizer, + processor=tokenizer, ) calib_dataset = dataset_manager.tokenize_and_process( dataset_manager.get_raw_dataset() diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 7a5dab66f..c28a25545 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -9,7 +9,7 @@ import yaml from datasets import Dataset -from transformers import AutoTokenizer +from transformers import PreTrainedTokenizer from tests.data import CustomTestConfig, TestConfig @@ -130,7 +130,7 @@ def run_cli_command(cmd: List[str], cwd: Optional[Union[str, Path]] = None): def preprocess_tokenize_dataset( - ds: Dataset, tokenizer: AutoTokenizer, max_seq_length: int + ds: Dataset, tokenizer: PreTrainedTokenizer, max_seq_length: int ) -> Dataset: """ Helper function to preprocess and tokenize a dataset according to presets From 89bda306ff43a96c7577fafc313e17d417487af7 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 5 Dec 2024 04:22:28 +0000 Subject: [PATCH 7/7] defer data collator changes Signed-off-by: Kyle Sayers --- src/llmcompressor/transformers/finetune/text_generation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index a6c21fc39..f0e3a6b16 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -25,6 +25,7 @@ AutoConfig, AutoModelForCausalLM, AutoProcessor, + DefaultDataCollator, HfArgumentParser, PreTrainedModel, set_seed, @@ -345,6 +346,7 @@ def main( calib_dataset = stage_runner.get_dataset_split("calibration") # Initialize our Trainer + data_collator = DefaultDataCollator() trainer = Trainer( model_init=get_session_model, teacher=teacher, @@ -355,7 +357,7 @@ def main( train_dataset=train_dataset or calib_dataset, eval_dataset=eval_dataset, processing_class=processor, - data_collator=data_args.data_collator, + data_collator=data_collator, ) # wrap model.save_pretrained
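

Taken together, the series converges on one canonical recipe filename (`RECIPE_FILE_NAME`) and a `Processor`-typed save path that treats tokenizers as a special case of processors. Below is a minimal caller-side sketch of the resulting API, not a verbatim excerpt from the tree: the model stub and output directory are illustrative placeholders, it assumes the non-FSDP path, and it assumes `AutoProcessor.from_pretrained` falls back to the tokenizer for text-only checkpoints (transformers behavior, not something these patches enforce).

```python
from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor.pytorch.model_load.helpers import save_model_and_recipe
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    modify_save_pretrained,
)
from llmcompressor.transformers.utils.helpers import RECIPE_FILE_NAME  # "recipe.yaml"

stub = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # illustrative model stub
model = AutoModelForCausalLM.from_pretrained(stub)
# For text-only models AutoProcessor resolves to the tokenizer, so the same
# call covers both tokenizer-only and multi-modal checkpoints.
processor = AutoProcessor.from_pretrained(stub)

# Wrap save_pretrained() so compression flags are honored
# (see compressed_tensors_utils in patch 6/7).
modify_save_pretrained(model)

save_model_and_recipe(
    model=model,
    save_path="./compressed-out",   # illustrative output directory
    processor=processor,            # formerly the `tokenizer` argument (patch 6/7)
    save_safetensors=True,
    save_compressed=True,
)
# Any recipe held by the active session is serialized to
# ./compressed-out/recipe.yaml, i.e. os.path.join(save_path, RECIPE_FILE_NAME).
```

The `getattr(self.processor, "tokenizer", self.processor)` pattern in `base.py` is the other half of the same design choice: dataset code keeps operating on a tokenizer, unwrapping it from a multi-modal processor when one is present and using the object directly when it is already a plain tokenizer.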