[SparseAutoModelForCausalLM Deprecation] Feature change (#881)
* src and tests updates

* save model if output_dir is provided

* save model if provided as a string

* typo

* save if model was provided as a string or custom output_dir was set

* comments

* also save tokenizer if model passed as a string or custom output_dir provided

* revert to True

* merge main

* merge main

* fix transformers tests

* Update tests/llmcompressor/transformers/obcq/test_consecutive_runs.py

Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>

* lint

* fix bug

* fix bug

* comments

* comments

* fix saving bug on example script and comments

* fix test failure

* comments

* comments

* comments

* lint

* fix test_quantization.py

* fix bugs

* revert to default

* revert to default

* draft

* fix test

* logging output fix

---------

Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
3 people committed Nov 21, 2024
1 parent 65d9db2 commit 4b4c52e
Showing 24 changed files with 328 additions and 359 deletions.
6 changes: 3 additions & 3 deletions examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -1,11 +1,11 @@
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
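The hunk above is the core user-facing migration: model loading moves from the deprecated `SparseAutoModelForCausalLM` wrapper to the stock `transformers` class. A minimal sketch of the post-deprecation loading path (only the arguments visible in the diff are taken from the example; the tokenizer call is the standard transformers API):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load with the standard transformers API; llmcompressor no longer
# requires its own SparseAutoModelForCausalLM wrapper for oneshot runs.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```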
29 changes: 22 additions & 7 deletions src/llmcompressor/pytorch/model_load/helpers.py
@@ -16,6 +16,7 @@
"log_model_load",
"initialize_recipe",
"save_model_and_recipe",
"copy_python_files_from_model_cache",
"fallback_to_cpu",
"parse_dtype",
"get_session_model",
@@ -99,7 +100,6 @@ def save_model_and_recipe(
):
"""
Save a model, tokenizer and the currently loaded recipe to file
:param model: pytorch model to save
:param save_path: path to save output to
:param tokenizer: model tokenizer to save
@@ -123,7 +123,7 @@
fp.write(recipe_yaml_str)

# copy python files from cache dir to save_path if any
_copy_python_files_from_model_cache(model, save_path)
copy_python_files_from_model_cache(model, save_path)


def fallback_to_cpu(device: str) -> str:
@@ -213,16 +213,31 @@ def load_safetensors_state_dict(file_path: str) -> Dict[str, torch.Tensor]:
return {key: f.get_tensor(key) for key in f.keys()}


def _copy_python_files_from_model_cache(model: Module, save_path: str):
def copy_python_files_from_model_cache(model, save_path: str):
config = model.config
cache_dir = None
cache_path = None
if hasattr(config, "_name_or_path"):
import os
import shutil

cache_dir = config._name_or_path
for file in os.listdir(cache_dir):
full_file_name = os.path.join(cache_dir, file)
from huggingface_hub import hf_hub_download
from transformers import TRANSFORMERS_CACHE
from transformers.utils import http_user_agent

cache_path = config._name_or_path
if not os.path.exists(cache_path):
user_agent = http_user_agent()
config_file_path = hf_hub_download(
repo_id=cache_path,
filename="config.json",
cache_dir=TRANSFORMERS_CACHE,
force_download=False,
user_agent=user_agent,
)
cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])

for file in os.listdir(cache_path):
full_file_name = os.path.join(cache_path, file)
if file.endswith(".py") and os.path.isfile(full_file_name):
logger.debug(f"Transferring {full_file_name} to {save_path}")
shutil.copy(full_file_name, save_path)
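The helper is now public and, when the model was loaded from a Hub repo id rather than a local path, resolves that id to its local snapshot before copying any custom modeling `*.py` files. A usage sketch for a `trust_remote_code` model (the model id and save path are illustrative, not taken from this commit):

```python
import os

from transformers import AutoModelForCausalLM

from llmcompressor.pytorch.model_load.helpers import (
    copy_python_files_from_model_cache,
)

# Illustrative repo id for a model that ships custom modeling code;
# any trust_remote_code model follows the same path.
model = AutoModelForCausalLM.from_pretrained(
    "some-org/custom-arch-model", trust_remote_code=True
)

save_path = "./compressed-output"
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path)

# Carry the custom *.py modeling files over so the saved checkpoint
# remains loadable with trust_remote_code=True.
copy_python_files_from_model_cache(model, save_path)
```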
3 changes: 1 addition & 2 deletions src/llmcompressor/transformers/__init__.py
@@ -7,9 +7,8 @@
# isort: skip_file
# (import order matters for circular import avoidance)
from .utils import *

from .sparsification import (
SparseAutoModel,
SparseAutoModelForCausalLM,
wrap_hf_model_class,
)
from .finetune import *
6 changes: 4 additions & 2 deletions src/llmcompressor/transformers/finetune/README.md
@@ -101,7 +101,7 @@ accelerate launch
```python
from llmcompressor.transformers import oneshot

model = "Xenova/llama2.c-stories15M"
model ="Xenova/llama2.c-stories15M"
dataset_name = "open_platypus"
concatenate_data = False
pad_to_max_length = False
@@ -119,7 +119,6 @@ oneshot(
output_dir=output_dir,
recipe=recipe,
overwrite_output_dir=overwrite_output_dir,
concatenate_data = concatenate_data,
pad_to_max_length = pad_to_max_length,
splits = splits
)
@@ -141,8 +140,10 @@ of a staged recipe for Llama.
test_multi.py
```python
from llmcompressor.transformers import apply
from transformers import AutoModelForCausalLM

model = "../ml-experiments/nlg-text_generation/llama_pretrain-llama_7b-base/dense/training"

dataset_name = "open_platypus"
concatenate_data = False
run_stages=True
@@ -167,4 +168,5 @@ apply(
remove_unused_columns = False,
splits = splits
)

```
48 changes: 13 additions & 35 deletions src/llmcompressor/transformers/finetune/runner.py
@@ -13,7 +13,6 @@
get_completed_stages,
get_session_model,
save_completed_stages,
save_model_and_recipe,
)
from llmcompressor.pytorch.utils import tensors_to_device
from llmcompressor.recipe import Recipe, StageRunType
@@ -25,11 +24,7 @@
)
from llmcompressor.transformers.finetune.model_args import ModelArguments
from llmcompressor.transformers.finetune.training_args import TrainingArguments
from llmcompressor.utils.fsdp.helpers import (
find_and_move_state_dicts_to_cpu,
is_fsdp_model,
unwrap_and_export_model,
)
from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_model_and_recipe


class StageRunner:
@@ -170,35 +165,6 @@ def one_shot(self, stage: Optional[str] = None):

self.trainer.one_shot(calibration_data=calib_data, stage=stage)

if is_fsdp_model(self.trainer.model):
try:
self.trainer.save_model(output_dir=self._output_dir, _is_oneshot=True)
except AssertionError:
# fallback to this in the case of quantization
unwrap_and_export_model(
model=self.trainer.model,
accelerator=self.trainer.accelerator,
output_dir=self._output_dir,
tokenizer=self.tokenizer,
)
# only allow the main process move the state
# dicts to cpu
if self.trainer.accelerator.is_main_process:
# assuming quantization is the last step
# we no longer need the original model
# and can safely delete it to save memory
del self.trainer.model
find_and_move_state_dicts_to_cpu(self._output_dir)

else:
save_model_and_recipe(
model=self.trainer.model,
save_path=self._output_dir,
tokenizer=self.tokenizer,
save_safetensors=self._training_args.save_safetensors,
save_compressed=self._training_args.save_compressed,
)

def train(self, checkpoint: str, stage: Optional[str] = None):
"""
Run trainer's training loop on train_dataset, saving the resulting model to
@@ -293,6 +259,18 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None):
self.train(checkpoint=checkpoint, stage=stage_name)
checkpoint = None

if (
self._training_args.output_dir
!= TrainingArguments.__dataclass_fields__["output_dir"].default
):
save_model_and_recipe(
model=self.trainer.model,
save_path=self._output_dir,
tokenizer=self.tokenizer,
save_safetensors=self._training_args.save_safetensors,
save_compressed=self._training_args.save_compressed,
)

# save stage to checkpoint dir
if self.trainer.accelerator.is_main_process:
completed_stages.append(stage_name)
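The new guard in `run_sequential_stages` saves only when the user actually overrode `output_dir`, comparing the live value against the dataclass default. A minimal sketch of that idiom with a stand-in dataclass (names are illustrative, not the real `TrainingArguments`):

```python
from dataclasses import dataclass, field


@dataclass
class DemoArguments:
    # Stand-in for TrainingArguments; only output_dir matters here.
    output_dir: str = field(default="./output")


def should_save(args: DemoArguments) -> bool:
    # Same check as in the diff: compare the current value against the
    # default recorded on the dataclass field.
    default = DemoArguments.__dataclass_fields__["output_dir"].default
    return args.output_dir != default


print(should_save(DemoArguments()))                     # False: default untouched
print(should_save(DemoArguments(output_dir="./run1")))  # True: user override
```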
4 changes: 1 addition & 3 deletions src/llmcompressor/transformers/finetune/session_mixin.py
@@ -452,9 +452,7 @@ def one_shot(
# self.maybe_log_model_sparsification()
self.accelerator.wait_for_everyone()

def save_model(
self, output_dir: Optional[str] = None, _internal_call=False, _is_oneshot=False
):
def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False):
"""
Override of the save_model function and expects it to exist in the parent.
Calls into super() to save the model and additionally saves any recipes
45 changes: 37 additions & 8 deletions src/llmcompressor/transformers/finetune/text_generation.py
@@ -23,6 +23,7 @@
from loguru import logger
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
DefaultDataCollator,
HfArgumentParser,
@@ -42,11 +43,16 @@
from llmcompressor.transformers.finetune.runner import StageRunner
from llmcompressor.transformers.finetune.trainer import Trainer
from llmcompressor.transformers.finetune.training_args import TrainingArguments
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
modify_fsdp_model_save_pretrained,
modify_save_pretrained,
patch_tied_tensors_bug,
)
from llmcompressor.transformers.sparsification.sparse_model import (
SparseAutoModel,
get_shared_tokenizer_src,
)
from llmcompressor.transformers.utils.helpers import detect_last_checkpoint
from llmcompressor.utils.fsdp.helpers import is_fsdp_model


def train(**kwargs):
@@ -199,21 +205,23 @@ def initialize_model_from_path(
"trust_remote_code": model_args.trust_remote_code_model,
}
# this calls from_pretrained under the hood so should be FSDP safe
model = SparseAutoModel.text_generation_from_pretrained(
model_name_or_path=model_path,
sequence_length=None, # use model default
model = AutoModelForCausalLM.from_pretrained(
model_path,
**model_kwargs,
)
if "sequence_length" in model_kwargs:
model.seqlen = model_kwargs["sequence_length"]

teacher = (
SparseAutoModel.text_generation_from_pretrained(
model_name_or_path=model_args.distill_teacher,
sequence_length=None, # use model default
AutoModelForCausalLM.from_pretrained(
model_args.distill_teacher,
**teacher_kwargs,
)
if model_args.distill_teacher is not None
else None
)
if teacher is not None and "sequence_length" in teacher_kwargs:
teacher.seqlen = teacher_kwargs["sequence_length"]

return teacher, model_path, model

@@ -302,6 +310,10 @@ def main(
training_args,
)

# patch a shared tensor bug in HF transformers
# https://github.com/huggingface/transformers/issues/33689
patch_tied_tensors_bug(model)

if teacher is not None:
teacher.eval()

@@ -337,6 +349,13 @@
tokenizer=tokenizer,
data_collator=data_collator,
)

# wrap model.save_pretrained
if is_fsdp_model(model):
modify_fsdp_model_save_pretrained(trainer, tokenizer)
else:
modify_save_pretrained(model)

stage_runner.trainer = trainer

# alternating Training/One-shot
@@ -348,7 +367,6 @@

# exit immediately
return

# Training
if training_args.do_train:
checkpoint = None
@@ -370,6 +388,17 @@
if training_args.do_predict:
stage_runner.predict()

# save if model was provided as a string or custom output_dir was set
if isinstance(model_args.model, str) or (
training_args.output_dir
!= TrainingArguments.__dataclass_fields__["output_dir"].default
):
model.save_pretrained(
training_args.output_dir, save_compressed=training_args.save_compressed
)
if tokenizer is not None:
tokenizer.save_pretrained(training_args.output_dir)

# Clean up the CompressionSession before exit if requested
if training_args.clear_sparse_session:
reset_session()