Add vLLM e2e tests #117

Merged · 22 commits · Aug 28, 2024
Empty file added: tests/e2e/vLLM/__init__.py
4 changes: 4 additions & 0 deletions tests/e2e/vLLM/configs/FP8/fp8_dynamic_per_token.yaml
@@ -0,0 +1,4 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
scheme: FP8_DYNAMIC
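
For orientation (not part of the diff): each config file like the one above is loaded by parse_params and handed to parameterized_class, which copies its keys onto the generated test class. A minimal sketch, assuming PyYAML and the parameterized package used in test_vllm.py below:

    # Illustrative only: mimics what parse_params + parameterized_class do with
    # the YAML above; the real wiring is in tests/testing_utils.py further down.
    import yaml
    from parameterized import parameterized_class

    with open("tests/e2e/vLLM/configs/FP8/fp8_dynamic_per_token.yaml") as f:
        cfg = yaml.safe_load(f)

    @parameterized_class([cfg])  # keys (model, scheme, ...) become class attributes
    class ExampleTest:
        model = None
        scheme = None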
6 changes: 6 additions & 0 deletions tests/e2e/vLLM/configs/FP8/fp8_static_per_tensor.yaml
@@ -0,0 +1,6 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
scheme: FP8
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
5 changes: 5 additions & 0 deletions tests/e2e/vLLM/configs/FP8/fp8_weight_only_channel.yaml
@@ -0,0 +1,5 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml
scheme: FP8A16_channel
5 changes: 5 additions & 0 deletions tests/e2e/vLLM/configs/FP8/fp8_weight_only_tensor.yaml
@@ -0,0 +1,5 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml
scheme: FP8A16_tensor
@@ -0,0 +1,7 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
scheme: W8A8_channel_weight_static_per_tensor
6 changes: 6 additions & 0 deletions tests/e2e/vLLM/configs/INT8/int8_dynamic_per_token.yaml
@@ -0,0 +1,6 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
scheme: W8A8
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
@@ -0,0 +1,7 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
scheme: W8A8_tensor_weight_static_per_tensor_act
7 changes: 7 additions & 0 deletions tests/e2e/vLLM/configs/WNA16/w4a16_channel_quant.yaml
@@ -0,0 +1,7 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
scheme: W4A16_channel
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml
6 changes: 6 additions & 0 deletions tests/e2e/vLLM/configs/WNA16/w4a16_grouped_quant.yaml
@@ -0,0 +1,6 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
scheme: W4A16
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
7 changes: 7 additions & 0 deletions tests/e2e/vLLM/configs/WNA16/w8a16_channel_quant.yaml
@@ -0,0 +1,7 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
scheme: W8A16_channel
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml
6 changes: 6 additions & 0 deletions tests/e2e/vLLM/configs/WNA16/w8a16_grouped_quant.yaml
@@ -0,0 +1,6 @@
cadence: "nightly"
test_type: "regression"
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
scheme: W8A16
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
@@ -0,0 +1,9 @@
quant_stage:
  quant_modifiers:
    QuantizationModifier:
      sequential_update: false
      ignore: [lm_head]
      config_groups:
        group_0:
          weights: {num_bits: 8, type: float, symmetric: true, strategy: channel, dynamic: false}
          targets: [Linear]
@@ -0,0 +1,9 @@
quant_stage:
  quant_modifiers:
    QuantizationModifier:
      sequential_update: false
      ignore: [lm_head]
      config_groups:
        group_0:
          weights: {num_bits: 8, type: float, symmetric: true, strategy: tensor, dynamic: false}
          targets: [Linear]
@@ -0,0 +1,10 @@
quant_stage:
  quant_modifiers:
    QuantizationModifier:
      sequential_update: false
      ignore: [lm_head]
      config_groups:
        group_0:
          weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
          input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor}
          targets: [Linear]
@@ -0,0 +1,10 @@
quant_stage:
  quant_modifiers:
    QuantizationModifier:
      sequential_update: false
      ignore: [lm_head]
      config_groups:
        group_0:
          weights: {num_bits: 8, type: int, symmetric: true, strategy: tensor}
          input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor}
          targets: [Linear]
9 changes: 9 additions & 0 deletions tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml
@@ -0,0 +1,9 @@
quant_stage:
  quant_modifiers:
    QuantizationModifier:
      sequential_update: false
      ignore: [lm_head]
      config_groups:
        group_0:
          weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false}
          targets: [Linear]
9 changes: 9 additions & 0 deletions tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml
@@ -0,0 +1,9 @@
quant_stage:
  quant_modifiers:
    QuantizationModifier:
      sequential_update: false
      ignore: [lm_head]
      config_groups:
        group_0:
          weights: {num_bits: 8, type: int, symmetric: true, strategy: channel, dynamic: false}
          targets: [Linear]
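
As context before the test itself, a standalone sketch of how a recipe-driven config is exercised: the recipe path is passed straight to oneshot, exactly as test_vllm.py below does. Only APIs used in this PR are assumed; the output directory name is hypothetical.

    # Illustrative only: apply the FP8 weight-only channel recipe from this PR.
    from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot

    model = SparseAutoModelForCausalLM.from_pretrained(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="cuda:0", torch_dtype="auto"
    )
    oneshot(
        model=model,
        recipe="tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml",
        output_dir="TinyLlama-1.1B-Chat-v1.0-FP8A16_channel",  # hypothetical path
        clear_sparse_session=True,
        oneshot_device="cuda:0",
    )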
123 changes: 123 additions & 0 deletions tests/e2e/vLLM/test_vllm.py
@@ -0,0 +1,123 @@
import shutil
import unittest

import pytest
from datasets import load_dataset
from parameterized import parameterized_class
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from tests.testing_utils import parse_params, requires_gpu, requires_torch

try:
    from vllm import LLM, SamplingParams

    vllm_installed = True
except ImportError:
    vllm_installed = False

# Defines the file paths to the directories containing the test configs
# for each of the quantization schemes
WNA16 = "tests/e2e/vLLM/configs/WNA16"
FP8 = "tests/e2e/vLLM/configs/FP8"
INT8 = "tests/e2e/vLLM/configs/INT8"


@requires_gpu
@requires_torch
@pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test")
@parameterized_class(parse_params([WNA16, FP8, INT8]))
class TestvLLM(unittest.TestCase):
    model = None
    scheme = None
    dataset_id = None
    dataset_split = None
    recipe = None

    def setUp(self):
        print("========== RUNNING ==============")
        print(self.scheme)

        self.save_dir = None
        self.device = "cuda:0"
        self.oneshot_kwargs = {}
        self.num_calibration_samples = 256
        self.max_seq_length = 1048
        self.prompts = [
            "The capital of France is",
            "The president of the US is",
            "My name is",
        ]

    def test_vllm(self):
        # Load model.
        loaded_model = SparseAutoModelForCausalLM.from_pretrained(
            self.model, device_map=self.device, torch_dtype="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model)

        def preprocess(example):
            return {
                "text": tokenizer.apply_chat_template(
                    example["messages"],
                    tokenize=False,
                )
            }

        def tokenize(sample):
            return tokenizer(
                sample["text"],
                padding=False,
                max_length=self.max_seq_length,
                truncation=True,
                add_special_tokens=False,
            )

        if self.dataset_id:
            ds = load_dataset(self.dataset_id, split=self.dataset_split)
            ds = ds.shuffle(seed=42).select(range(self.num_calibration_samples))
            ds = ds.map(preprocess)
            ds = ds.map(tokenize, remove_columns=ds.column_names)
            self.oneshot_kwargs["dataset"] = ds
            self.oneshot_kwargs["max_seq_length"] = self.max_seq_length
            self.oneshot_kwargs["num_calibration_samples"] = (
                self.num_calibration_samples
            )

        self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
        self.oneshot_kwargs["model"] = loaded_model
        if self.recipe:
            self.oneshot_kwargs["recipe"] = self.recipe
        else:
            # The test assumes that if a recipe is not provided, a compatible
            # preset scheme is used from:
            # https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py
            self.oneshot_kwargs["recipe"] = QuantizationModifier(
                targets="Linear", scheme=self.scheme, ignore=["lm_head"]
            )

        # Apply quantization.
        print("ONESHOT KWARGS", self.oneshot_kwargs)
        oneshot(
            **self.oneshot_kwargs,
            output_dir=self.save_dir,
            clear_sparse_session=True,
            oneshot_device=self.device,
        )
        tokenizer.save_pretrained(self.save_dir)
        # Run vLLM with the saved model
        print("================= RUNNING vLLM =========================")
        sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
        llm = LLM(model=self.save_dir)
Collaborator: Having a test for tp>1 is also a good idea if we can.

Collaborator (Author): Yeah, I think that'll be a follow-up test, since the structure will change a bit to handle tp>1 with the same process.

I do think that's more of a vLLM test. If anything, we could extend this to publish test models which are then pulled down for all vLLM tests.

        outputs = llm.generate(self.prompts, sampling_params)
        print("================= vLLM GENERATION ======================")
        for output in outputs:
            assert output
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print("PROMPT", prompt)
            print("GENERATED TEXT", generated_text)

    def tearDown(self):
        shutil.rmtree(self.save_dir)
68 changes: 39 additions & 29 deletions tests/testing_utils.py
@@ -68,39 +68,49 @@ def _validate_test_config(config: dict):
 # Set cadence in the config. The environment must set if nightly, weekly or commit
 # tests are running
 def parse_params(
-    configs_directory: str, type: Optional[str] = None
+    configs_directory: Union[list, str], type: Optional[str] = None
 ) -> List[Union[dict, CustomTestConfig]]:
-    # parses the config file provided
-    assert os.path.isdir(
-        configs_directory
-    ), f"Config_directory {configs_directory} is not a directory"
+    # parses the config files provided
 
     config_dicts = []
-    for file in os.listdir(configs_directory):
-        config = _load_yaml(configs_directory, file)
-        if not config:
-            continue
 
-        cadence = os.environ.get("CADENCE", "commit")
-        expected_cadence = config.get("cadence")
+    def _parse_configs_dir(current_config_dir):
+        assert os.path.isdir(
+            current_config_dir
+        ), f"Config_directory {current_config_dir} is not a directory"
 
-        if not isinstance(expected_cadence, list):
-            expected_cadence = [expected_cadence]
-        if cadence in expected_cadence:
-            if type == "custom":
-                config = CustomTestConfig(**config)
-            else:
-                if not _validate_test_config(config):
-                    raise ValueError(
-                        "The config provided does not comply with the expected "
-                        "structure. See tests.data.TestConfig for the expected "
-                        "fields."
-                    )
-            config_dicts.append(config)
-        else:
-            logging.info(
-                f"Skipping testing model: {file} for cadence: {config['cadence']}"
-            )
+        for file in os.listdir(current_config_dir):
+            config = _load_yaml(current_config_dir, file)
+            if not config:
+                continue
+
+            cadence = os.environ.get("CADENCE", "commit")
+            expected_cadence = config.get("cadence")
+
+            if not isinstance(expected_cadence, list):
+                expected_cadence = [expected_cadence]
+            if cadence in expected_cadence:
+                if type == "custom":
+                    config = CustomTestConfig(**config)
+                else:
+                    if not _validate_test_config(config):
+                        raise ValueError(
+                            "The config provided does not comply with the expected "
+                            "structure. See tests.data.TestConfig for the expected "
+                            "fields."
+                        )
+                config_dicts.append(config)
+            else:
+                logging.info(
+                    f"Skipping testing model: {file} for cadence: {config['cadence']}"
+                )
+
+    if isinstance(configs_directory, list):
+        for config in configs_directory:
+            _parse_configs_dir(config)
+    else:
+        _parse_configs_dir(configs_directory)
 
     return config_dicts
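
A short usage sketch (hypothetical invocation; the directory paths match the constants defined in test_vllm.py above) of how the reworked parse_params accepts a list of config directories and is gated by the CADENCE environment variable:

    # Illustrative only: with CADENCE=nightly, the nightly configs added in this PR
    # are returned; with the default "commit" cadence they are skipped.
    import os
    from tests.testing_utils import parse_params

    os.environ["CADENCE"] = "nightly"
    configs = parse_params([
        "tests/e2e/vLLM/configs/WNA16",
        "tests/e2e/vLLM/configs/FP8",
        "tests/e2e/vLLM/configs/INT8",
    ])
    print(len(configs))  # one dict per config file matching the cadence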

