From 91cee598efebe77918c17bd0b887fb1bd172815d Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Tue, 25 Jun 2024 12:08:52 -0400 Subject: [PATCH 1/2] Remove references to SparseML and replace with LLM Compressor naming and conventions. --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- CONTRIBUTING.md | 103 ++++++------------ DEVELOPING.md | 50 ++------- examples/finetuning/configure_fsdp.md | 22 +--- .../llama7b_one_shot_quantization.md | 2 +- .../llama7b_sparse_w4a16.py | 2 +- .../llama7b_w4a16_quantization.ipynb | 90 +++++++++++---- .../llama7b_w4a16_quantization.py | 2 +- .../quantization/llama7b_w8a8_quantization.py | 2 +- examples/trl_mixin/README.md | 20 +--- examples/trl_mixin/sft_trainer.py | 14 --- src/llmcompressor/core/model_layer.py | 14 --- src/llmcompressor/metrics/logger.py | 2 +- src/llmcompressor/recipe/recipe.py | 2 - src/llmcompressor/transformers/__init__.py | 2 +- .../transformers/finetune/callbacks.py | 4 +- .../transformers/finetune/session_mixin.py | 20 ++-- .../transformers/finetune/training_args.py | 5 +- .../sparsification/sparse_model.py | 2 +- .../transformers/utils/helpers.py | 4 +- tests/llmcompressor/pytorch/helpers.py | 2 +- 21 files changed, 145 insertions(+), 221 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 509f1e1bd..4835ed784 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -15,7 +15,7 @@ A clear and concise description of what you expected to happen. Include all relevant environment information: 1. OS [e.g. Ubuntu 18.04]: 2. Python version [e.g. 3.7]: -3. SparseML version or commit hash [e.g. 0.1.0, `f7245c8`]: +3. LLM Compressor version or commit hash [e.g. 0.1.0, `f7245c8`]: 4. ML framework version(s) [e.g. torch 1.7.1]: 5. Other Python package versions [e.g. SparseZoo, DeepSparse, numpy, ONNX]: 6. Other relevant environment information [e.g. hardware, CUDA version]: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b27bb94fa..9ece29b40 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,86 +1,53 @@ - +However, remember that contributions aren't just about code. +We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. -TODO: update for upstream push +Finally, one of the most impactful ways to support us is by raising awareness about LLM Compressor and the vLLM community. +Talk about it in your blog posts, highlighting how it's driving your incredible projects. +Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. -# Contributing to SparseML +## Setup for development -If you’re reading this, hopefully we have piqued your interest to take the next step. Join us and help make SparseML even better! As a contributor, here are some community guidelines we would like you to follow: +### Install from source -- [Code of Conduct](#code-of-conduct) -- [Ways to Contribute](#ways-to-contribute) -- [Bugs and Feature Requests](#bugs-and-feature-requests) -- [Question or Problem](#question-or-problem) -- [Developing SparseML](DEVELOPING.md) +```bash +pip install -e ./[dev] +``` -## Code of Conduct +### Code Styling and Formatting checks -Help us keep the software inclusive. Please read and follow our [Code of Conduct](https://github.com/neuralmagic/sparseml/blob/main/CODE_OF_CONDUCT.md) in order to promote an environment that is friendly, fair, respectful, and safe. 
-We want to inspire collaboration, innovation, and fun!
+```bash
+make style
+make quality
+```
 
-## Ways to Contribute
+### Testing
 
-Whether you’re a newbie, dabbler, or expert, we appreciate you jumping in.
+```bash
+make test
+```
 
-### Contributing Code
+## Contributing Guidelines
 
-- Make pull requests for addressing bugs, open issues, and documentation
-- Neural Magic as the maintainer will do reviews and final merge
+### Issue Reporting
 
-### Reporting In
+If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
+If not, please file a new issue, providing as much relevant information as possible.
 
-- See something, say something: bugs, documentation
-- Propose new feature requests to Neural Magic
+### Pull Requests & Code Reviews
 
-### Helping Others
+Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for a detailed guide to contributing.
 
-- Answer open discussion topics
-- Spread the word about SparseML
-- Teach and empower others. This is the way!
+### Thank You
 
-## Bugs and Feature Requests
-
-Please search through existing issues and requests first to avoid duplicates. Neural Magic will work with you further to take next steps.
-
-- Go to: [GitHub Issues](https://github.com/vllm-project/llm-compressor/issues)
-
-For bugs, include:
-
-- brief summary
-- OS/Environment details
-- steps to reproduce (s.t.r.)
-- code snippets, screenshots/casts, log content, sample models
-- add the GitHub label "bug" to your post
-
-For feature requests, include:
-
-- problem you’re trying to solve
-- community benefits
-- other relevant details to support your proposal
-- add the GitHub label "enhancement" to your post
-
-For documentation edits, include:
-
-- current state, proposed state
-- if applicable, screenshots/casts
-- add the GitHub label "documentation" to your post
-
-## Question or Problem
-
-Sign up or log in to our [**Neural Magic Community Slack**](https://neuralmagic.com/community/). We are growing the community member by member and happy to see you there. Don’t forget to search through existing discussions to avoid duplication! Thanks!
-
-## Developing SparseML
-
-Made it this far? Review [Developing SparseML](DEVELOPING.md) to get started.
+Finally, thank you for taking the time to read these guidelines and for your interest in contributing to LLM Compressor.
+Your contributions make LLM Compressor a great tool for everyone!
diff --git a/DEVELOPING.md b/DEVELOPING.md
index 4e069fffa..e884c5911 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -1,25 +1,7 @@
-
-
-TODO: update for upstream push
-
-# Developing SparseML
-
-SparseML is developed and tested using Python 3.8-3.11.
-To develop SparseML, you will also need the development dependencies and to follow the styling guidelines.
+LLM Compressor is developed and tested using Python 3.8-3.11.
+To develop LLM Compressor, you will also need the development dependencies and to follow the styling guidelines.
 
 Here are some details to get started.
 
@@ -33,17 +15,7 @@ cd llm-compressor
 python3 -m pip install -e "./[dev]"
 ```
 
-This will clone the SparseML repo, install it, and install the development dependencies.
-
-To develop framework specific features, you will also need the relevant framework packages.
-Those can be installed by adding the framework name to the install extras. Frameworks include
-`torch`, `keras`, and `tensorflow_v1`.
-For example:
-```bash
-python3 -m pip install -e "./[dev,torch]"
-```
-
-Note: Running all pytorch tests using `make test TARGETS=torch`, also requires `torchvision`
-and `onnxruntime` install all these dependencies using `python3 -m pip install -e "./[dev, torch, torchvision, onnxruntime]"`
+This will clone the LLM Compressor repo, install it, and install the development dependencies.
 
 **Code Styling and Formatting checks**
 
@@ -52,22 +24,16 @@ make style
 make quality
 ```
 
-This will run automatic code styling using `black` and `isort` and test that the
+This will run automatic code styling using `ruff`, `flake8`, `black`, `isort`, and `mypy` to test that the
 repository's code matches its standards.
 
 **EXAMPLE: test changes locally**
 
 ```bash
-make test TARGETS=
+make test
 ```
 
-This will run the targeted SparseML unit tests for the frameworks specified.
-The targets should be specified, because not all framework dependencies can be installed to run all tests.
-
-To run just PyTorch tests, run
-```bash
-make test TARGETS=pytorch
-```
+This will run the LLM Compressor unit tests.
 
 File any error found before changes as an Issue and fix any errors found after making changes before submitting a Pull Request.
 
@@ -92,7 +58,7 @@ File any error found before changes as an Issue and fix any errors found after m
 3. Add a remote to keep up with upstream changes.
 
    ```bash
-   git remote add upstream https://github.com/neuralmagic/sparseml.git
+   git remote add upstream https://github.com/vllm-project/llm-compressor.git
    ```
 
    If you already have a copy, fetch upstream changes.
diff --git a/examples/finetuning/configure_fsdp.md b/examples/finetuning/configure_fsdp.md
index 7b632d485..0d377f43b 100644
--- a/examples/finetuning/configure_fsdp.md
+++ b/examples/finetuning/configure_fsdp.md
@@ -1,23 +1,7 @@
-
-
 # Configuring FSDP for Sparse Finetuning
 
 An example FSDP configuration file, `example_fsdp_config.yaml`, is provided in this
-folder. It can be used out of the box by editting the `num_processes` parameter to
+folder. It can be used out of the box by editing the `num_processes` parameter to
 fit the number of GPUs on your machine.
 
 You can also customize your own config file by running the following command
@@ -25,7 +9,7 @@
 accelerate config
 ```
 
-An FSDP config file can be passed to the SparseML finetuning script like this:
+An FSDP config file can be passed to the LLM Compressor finetuning script like this:
 
 ```
-accelerate launch --config_file example_fsdp_config.yaml --no_python sparseml.transformers.text_generation.finetune
+accelerate launch --config_file example_fsdp_config.yaml --no_python llmcompressor.transformers.text_generation.finetune
 ```
diff --git a/examples/quantization/llama7b_one_shot_quantization.md b/examples/quantization/llama7b_one_shot_quantization.md
index cac366676..3048fafc4 100644
--- a/examples/quantization/llama7b_one_shot_quantization.md
+++ b/examples/quantization/llama7b_one_shot_quantization.md
@@ -1,7 +1,7 @@
 # Creating a Quantized Llama Model in One Shot
 
 Quantizing a model to a lower precision can save on both memory and speed at inference time.
-This example demonstrates how to use the SparseML API to quantize a Llama model from 16 bits
+This example demonstrates how to use the LLM Compressor API to quantize a Llama model from 16 bits
 to 4 bits and save it to a compressed-tensors format for inference with vLLM.
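+
+Before stepping through the details, here is a minimal end-to-end sketch of the flow
+this example builds toward. It is illustrative only: the recipe follows the GPTQ
+modifier layout used by the examples in this folder, and the model stub, calibration
+settings, and output directory are placeholders to adapt to your setup.
+
+```python
+import torch
+
+from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
+
+# load the model to quantize; any Hugging Face causal LM stub can be substituted
+model = SparseAutoModelForCausalLM.from_pretrained(
+    "neuralmagic/Llama-2-7b-ultrachat200k",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+# int4, symmetric, channelwise GPTQ over every Linear layer except the lm-head
+recipe = """
+quant_stage:
+    quant_modifiers:
+        GPTQModifier:
+            sequential_update: false
+            ignore: ["lm_head"]
+            config_groups:
+                group_0:
+                    weights:
+                        num_bits: 4
+                        type: "int"
+                        symmetric: true
+                        strategy: "channel"
+                    targets: ["Linear"]
+"""
+
+# apply the recipe in one shot, calibrating on a built-in dataset
+oneshot(
+    model=model,
+    dataset="ultrachat-200k",
+    recipe=recipe,
+    max_seq_length=512,
+    num_calibration_samples=512,
+    output_dir="llama7b-w4a16-compressed",  # placeholder save location
+)
+
+# write the weights in compressed-tensors format for loading with vLLM
+model.save_pretrained("llama7b-w4a16-compressed", save_compressed=True)
+```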
 
 ## Step 1: Select a model and dataset
diff --git a/examples/quantization/llama7b_sparse_quantized/llama7b_sparse_w4a16.py b/examples/quantization/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
index ac0cbed0e..97a19a098 100644
--- a/examples/quantization/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
+++ b/examples/quantization/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
@@ -11,7 +11,7 @@
     model_stub, torch_dtype=torch.bfloat16, device_map="auto"
 )
 
-# uses SparseML's built-in preprocessing for ultra chat
+# uses LLM Compressor's built-in preprocessing for ultra chat
 dataset = "ultrachat-200k"
 
 # save location of quantized model
diff --git a/examples/quantization/llama7b_w4a16_quantization.ipynb b/examples/quantization/llama7b_w4a16_quantization.ipynb
index c69cc90c6..3e4bd3450 100644
--- a/examples/quantization/llama7b_w4a16_quantization.ipynb
+++ b/examples/quantization/llama7b_w4a16_quantization.ipynb
@@ -2,11 +2,15 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
-    "# Quantizing Llama 7B to W4A16 Using SparseML's OneShot Pathway\n",
+    "# Quantizing Llama 7B to W4A16 Using LLM Compressor's OneShot Pathway\n",
     "\n",
-    "This example notebook walks through how to quantize Llama 7B using SparseML. We apply int4 channel-wise quantization all Linear layers, using UltraChat 200k as a calibration dataset.\n",
+    "This example notebook walks through how to quantize Llama 7B using LLM Compressor. We apply int4 channel-wise quantization to all Linear layers, using UltraChat 200k as a calibration dataset.\n",
     "\n",
     "This example requires at least 45GB of GPU memory to run. The memory requirement can be reduced to 32GB by setting `sequential_update: true` in the recipe definition, but this will increase the runtime significantly."
    ]
@@ -14,7 +18,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
@@ -23,9 +31,13 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
-    "SparseML uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. A recipe consists of one or more sparsification or quantization algorithms, called modifiers in SparseML. Below we create a sample recipe for GPTQ quantization that only requires a single modifier.\n",
+    "LLM Compressor uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. A recipe consists of one or more sparsification or quantization algorithms, called modifiers in LLM Compressor. Below we create a sample recipe for GPTQ quantization that only requires a single modifier.\n",
     "\n",
     "This modifier specifies that we should quantize the weights of each linear layer to 4 bits, using a symmetric channelwise quantization pattern. The lm-head will not be quantized even though it is a Linear layer, because it is included in the ignore list."
    ]
@@ -33,7 +45,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [],
    "source": [
     "recipe = \"\"\"\n",
@@ -55,17 +71,25 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
    "source": [
     "Next we need to initialize the model we wish to quantize, and define a dataset for calibration. 
We will use a llama2 7b model that has been pretrained on the ultrachat 200k dataset. We will use the same dataset the model has been pretrained on for our one shot calibration. \n", "\n", - "SparseML supports several datasets, such as ultrachat-200k, out of the box. You can also pass in a tokenized `datasets.Dataset` object for custom dataset support." + "LLM Compressor supports several datasets, such as ultrachat-200k, out of the box. You can also pass in a tokenized `datasets.Dataset` object for custom dataset support." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "\n", @@ -74,7 +98,7 @@ "model_stub = \"neuralmagic/Llama-2-7b-ultrachat200k\"\n", "model = SparseAutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.bfloat16, device_map=\"auto\")\n", "\n", - "# uses SparseML's built-in preprocessing for ultra chat\n", + "# uses LLM Compressor's built-in preprocessing for ultra chat\n", "dataset = \"ultrachat-200k\"\n", "\n", "# save location of quantized model\n", @@ -83,7 +107,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Now we will configure our calibration dataset. To save on load time, we load only a small subset of ultrachat200k's `train_gen` split and label it as calibration data. For oneshot we do not need to pad the input, so we set `pad_to_max_length` to false. We also truncate each sample to a maximum of 512 tokens and select 512 samples for calibration. \n", "\n", @@ -93,7 +121,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "# set dataset config parameters\n", @@ -105,7 +137,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Finally, we can launch our quantization recipe using the `oneshot` function. This function call will apply the algorithms defined in `recipe` to the input `model`, using `num_calibration_samples` from `dataset` as calibration data. We will save the quantized model to `output_dir`.\n", "\n", @@ -115,7 +151,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "oneshot(\n", @@ -133,7 +173,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "The quantized model should now be stored in the defined `output_dir`. Its `config.json` will contain a new `compression_config` field that describes how the model has been quantized. This config will be used to load the model into vLLM." 
    ]
   },
@@ -141,7 +185,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [],
    "source": [
     "output_dir"
@@ -150,7 +198,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
    "outputs": [],
    "source": [
     "model.save_pretrained(\"llama1.1b_W4A16_channel_packed\", save_compressed=True)"
    ]
   },
@@ -178,4 +230,4 @@
  },
  "nbformat": 4,
 "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/examples/quantization/llama7b_w4a16_quantization.py b/examples/quantization/llama7b_w4a16_quantization.py
index 9bd60c2f3..fc41566f3 100644
--- a/examples/quantization/llama7b_w4a16_quantization.py
+++ b/examples/quantization/llama7b_w4a16_quantization.py
@@ -26,7 +26,7 @@
     model_stub, torch_dtype=torch.bfloat16, device_map="auto"
 )
 
-# uses SparseML's built-in preprocessing for ultra chat
+# uses LLM Compressor's built-in preprocessing for ultra chat
 dataset = "ultrachat-200k"
 
 # save location of quantized model out
diff --git a/examples/quantization/llama7b_w8a8_quantization.py b/examples/quantization/llama7b_w8a8_quantization.py
index 2429e1362..f6b328e92 100644
--- a/examples/quantization/llama7b_w8a8_quantization.py
+++ b/examples/quantization/llama7b_w8a8_quantization.py
@@ -32,7 +32,7 @@
     model_stub, torch_dtype=torch.bfloat16, device_map="auto"
 )
 
-# uses SparseML's built-in preprocessing for ultra chat
+# uses LLM Compressor's built-in preprocessing for ultra chat
 dataset = "ultrachat-200k"
 
 # save location of quantized model out
diff --git a/examples/trl_mixin/README.md b/examples/trl_mixin/README.md
index 61fa42af0..02c6d7111 100644
--- a/examples/trl_mixin/README.md
+++ b/examples/trl_mixin/README.md
@@ -1,25 +1,9 @@
-
-
 # Sparse Finetuning with TRL's SFTTrainer
 
 The `SessionManagerMixIn` can be added to other Trainer classes that inherit from
 [Hugging Face's Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer).
 
-For example, we can add SparseML support to TRL's SFTTrainer like so:
+For example, we can add LLM Compressor support to TRL's SFTTrainer like so:
 
 ```python
 from trl import SFTTrainer as TRLSFTTrainer
@@ -28,7 +12,7 @@ class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer):
     ...
 ```
 
-The new `SFTTrainer` class can now apply SparseML recipes and modifiers during
+The new `SFTTrainer` class can now apply LLM Compressor recipes and modifiers during
 supervised finetuning, with full support for all of the original TRL features. The full
 class is defined in the script `sft_trainer.py` and requires very minimal additional
 code: just a dataset load override to support passing in tokenized datasets
diff --git a/examples/trl_mixin/sft_trainer.py b/examples/trl_mixin/sft_trainer.py
index 0a67b9d57..ab80e0358 100644
--- a/examples/trl_mixin/sft_trainer.py
+++ b/examples/trl_mixin/sft_trainer.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- from trl import SFTTrainer as TRLSFTTrainer from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn diff --git a/src/llmcompressor/core/model_layer.py b/src/llmcompressor/core/model_layer.py index b6236bfcf..c1875de8c 100644 --- a/src/llmcompressor/core/model_layer.py +++ b/src/llmcompressor/core/model_layer.py @@ -1,17 +1,3 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from dataclasses import dataclass from typing import Any diff --git a/src/llmcompressor/metrics/logger.py b/src/llmcompressor/metrics/logger.py index 8abfa9804..dc7aabc73 100644 --- a/src/llmcompressor/metrics/logger.py +++ b/src/llmcompressor/metrics/logger.py @@ -388,7 +388,7 @@ def _create_default_logger(self, log_level: Optional[int] = None) -> logging.Log ) file_handler.setLevel(LOGGING_LEVELS["debug"]) logger.addHandler(file_handler) - logger.info(f"Logging all SparseML modifier-level logs to {log_path}") + logger.info(f"Logging all LLM Compressor modifier-level logs to {log_path}") logger.setLevel(LOGGING_LEVELS["debug"]) logger.propagate = False diff --git a/src/llmcompressor/recipe/recipe.py b/src/llmcompressor/recipe/recipe.py index bcf502875..1359e987c 100644 --- a/src/llmcompressor/recipe/recipe.py +++ b/src/llmcompressor/recipe/recipe.py @@ -24,8 +24,6 @@ class Recipe(RecipeBase): A class to represent a recipe for a model. Recipes encode the instructions needed for modifying the model and/or training process as a list of modifiers. - (More information on supported modifiers can be found at - https://docs.neuralmagic.com/products/sparseml) Recipes can be created from a file, string, or HuggingFace stub. Acceptable file formats include both json and yaml, however, diff --git a/src/llmcompressor/transformers/__init__.py b/src/llmcompressor/transformers/__init__.py index c1d6e285e..90544b019 100644 --- a/src/llmcompressor/transformers/__init__.py +++ b/src/llmcompressor/transformers/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. """ -Tools for integrating SparseML with transformers training flows +Tools for integrating LLM Compressor with transformers training flows """ # flake8: noqa diff --git a/src/llmcompressor/transformers/finetune/callbacks.py b/src/llmcompressor/transformers/finetune/callbacks.py index dc32ec386..ecdf46426 100644 --- a/src/llmcompressor/transformers/finetune/callbacks.py +++ b/src/llmcompressor/transformers/finetune/callbacks.py @@ -36,7 +36,7 @@ class TrainingLoopCallbacks(TrainerCallback): Used to update the model reference(for running with FSDP) and trigger the post- optim callbacks in each modifier. 
- :param sparseml_trainer: SparseML trainer that will call back into this object + :param trainer: LLM Compressor trainer that will call back into this object :param args: args to be passed to base TrainerCallback :param kwargs: key word arguments to be passed to base TrainerCallback """ @@ -98,7 +98,7 @@ class DisableHalfPrecisionCallback(TrainerCallback): """ TrainerCallback for disabling FP16 training before QAT training begins - :param sparseml_trainer: SparseML trainer that will call back into this object + :param trainer: LLM Compressor trainer that will call back into this object :param args: args to be passed to base TrainerCallback :param kwargs: key word arguments to be passed to base TrainerCallback """ diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index a1b2204d1..b60a9fc6b 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -62,8 +62,8 @@ class SessionManagerMixIn: """ - Mix-In class to extend the Hugging Face Trainer class to support SparseML recipes - for one-shot and finetuning flows. + Mix-In class to extend the Hugging Face Trainer class to support LLM Compressor + recipes for one-shot and finetuning flows. :param recipe: path to recipe file to apply during training :param recipe_args: additional kwargs to use for evaluating recipe @@ -164,7 +164,7 @@ def initialize_session( if self.recipe is None: _LOGGER.warning( "No training recipe was provided, finetuning will be run " - "without event callbacks to SparseML. To supply a recipe " + "without event callbacks to LLM Comrpessor. To supply a recipe " "pass a yaml file or string to the `recipe` argument." ) @@ -187,7 +187,7 @@ def initialize_structure(self, stage: Optional[str] = None): recipe_stage=stage, recipe_args=self.recipe_args, ) - _LOGGER.info(f"Initialized SparseML structure from recipe {self.recipe}") + _LOGGER.info(f"Initialized LLM Compressor structure from recipe {self.recipe}") torch.cuda.empty_cache() def finalize_session(self): @@ -201,7 +201,7 @@ def finalize_session(self): with summon_full_params_context(self.model, offload_to_cpu=True): # in order to update each layer we need to gathers all its parameters finalize() - _LOGGER.info("Finalized SparseML session") + _LOGGER.info("Finalized LLM Compressor session") model = get_session_model() self.model = model torch.cuda.empty_cache() @@ -227,7 +227,7 @@ def create_optimizer(self): _LOGGER.warning( "Training is being run with a streamed dataset, " "steps_per_epoch cannot be determined and will default to " - "1. SparseML modifiers utilizing this statistic may not " + "1. LLM Compressor modifiers utilizing this statistic may not " "behave as expected. 
" ) self.total_steps_per_epoch = 1 @@ -295,7 +295,7 @@ def compute_loss( # take the mean across multiple GPUs # this is done outside the compute_loss function in the parent, replicating it - # here for SparseML logging and distillation + # here for LLM Compressor logging and distillation loss = loss.mean() # Log step-wise loss and perplexity, for llama-recipes comparison @@ -470,7 +470,9 @@ def save_model( with open(recipe_path, "w") as fp: fp.write(recipe_yaml_str) - _LOGGER.info(f"Saved SparseML recipe with model state to {recipe_path}") + _LOGGER.info( + f"Saved LLM Compressor recipe with model state to {recipe_path}" + ) self.accelerator.wait_for_everyone() @@ -577,7 +579,7 @@ def _calculate_checkpoint_info(self, kwargs) -> Tuple[Optional[str], float]: if not kwargs or "resume_from_checkpoint" not in kwargs: _LOGGER.warning( - "resume_from_checkpoint not passed into SparseMLTrainer.train. " + "resume_from_checkpoint not passed into LLM Compressor Trainer.train. " "This will cause issues with restoring recipes when " "running from a checkpoint." ) diff --git a/src/llmcompressor/transformers/finetune/training_args.py b/src/llmcompressor/transformers/finetune/training_args.py index e8361cf6d..e3c257a12 100644 --- a/src/llmcompressor/transformers/finetune/training_args.py +++ b/src/llmcompressor/transformers/finetune/training_args.py @@ -23,7 +23,7 @@ @dataclass class TrainingArguments(HFTrainingArgs): """ - Training arguments specific to SparseML Transformers workflow + Training arguments specific to LLM Compressor Transformers workflow :param best_model_after_epoch (`int`, *optional*, defaults to None): The epoch after which best model will be saved; used in conjunction @@ -35,8 +35,7 @@ class TrainingArguments(HFTrainingArgs): default=None, metadata={ "help": ( - "Path to a SparseML sparsification recipe, see " - "https://github.com/neuralmagic/sparseml for more information" + "Path to a LLM Compressor sparsification recipe" ), }, ) diff --git a/src/llmcompressor/transformers/sparsification/sparse_model.py b/src/llmcompressor/transformers/sparsification/sparse_model.py index 97fbe46ff..69a9869e3 100644 --- a/src/llmcompressor/transformers/sparsification/sparse_model.py +++ b/src/llmcompressor/transformers/sparsification/sparse_model.py @@ -39,7 +39,7 @@ class SparseAutoModelForCausalLM(AutoModelForCausalLM): """ - SparseML wrapper for the AutoModelForCausalLM class + LLM Compressor wrapper for the AutoModelForCausalLM class Its lifecycle is defined as follows: 1. If pretrained_model_name_or_path is a HuggingFace stub the appropriate HuggingFace model will be downloaded diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index cdbaef2d6..975053d42 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -13,8 +13,8 @@ # limitations under the License. 
""" -Helper variables and functions for integrating SparseML with huggingface/transformers -flows +Helper variables and functions for integrating LLM Compressor with +huggingface/transformers flows """ import inspect diff --git a/tests/llmcompressor/pytorch/helpers.py b/tests/llmcompressor/pytorch/helpers.py index ed6665d27..d7b52a836 100644 --- a/tests/llmcompressor/pytorch/helpers.py +++ b/tests/llmcompressor/pytorch/helpers.py @@ -253,7 +253,7 @@ class _QATMatMul(Module): def __init__(self): super().__init__() - # behaves like normal torch.matmul unless a SparseML QuantizationModifier + # behaves like normal torch.matmul unless a LLM Compressor QuantizationModifier # is initialized self.wrap_qat = True self.qat_wrapper_kwargs = { From 401ef682606a8dc39bc24f61e78581e12b4af82a Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Tue, 25 Jun 2024 12:25:11 -0400 Subject: [PATCH 2/2] Fixes from review --- DEVELOPING.md | 2 +- src/llmcompressor/transformers/finetune/session_mixin.py | 2 +- src/llmcompressor/transformers/finetune/training_args.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/DEVELOPING.md b/DEVELOPING.md index e884c5911..3307fdbad 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -24,7 +24,7 @@ make style make quality ``` -This will run automatic code styling using `ruff`, `flake8`, `black`, `isort`, and `mypy` to test that the +This will run automatic code styling using `ruff`, `flake8`, `black`, and `isort` to test that the repository's code matches its standards. **EXAMPLE: test changes locally** diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index b60a9fc6b..17cc0784b 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -164,7 +164,7 @@ def initialize_session( if self.recipe is None: _LOGGER.warning( "No training recipe was provided, finetuning will be run " - "without event callbacks to LLM Comrpessor. To supply a recipe " + "without event callbacks to LLM Compressor. To supply a recipe " "pass a yaml file or string to the `recipe` argument." ) diff --git a/src/llmcompressor/transformers/finetune/training_args.py b/src/llmcompressor/transformers/finetune/training_args.py index e3c257a12..d69da1591 100644 --- a/src/llmcompressor/transformers/finetune/training_args.py +++ b/src/llmcompressor/transformers/finetune/training_args.py @@ -34,9 +34,7 @@ class TrainingArguments(HFTrainingArgs): recipe: Optional[str] = field( default=None, metadata={ - "help": ( - "Path to a LLM Compressor sparsification recipe" - ), + "help": "Path to a LLM Compressor sparsification recipe", }, ) recipe_args: Optional[List[str]] = field(