diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py
new file mode 100644
index 000000000..bd1f95ead
--- /dev/null
+++ b/examples/quantizing_moe/deepseek_moe_w4a16.py
@@ -0,0 +1,121 @@
+import torch
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
+from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
+
+# select a Mixture of Experts model for quantization
+MODEL_ID = "deepseek-ai/DeepSeek-V2.5"
+
+# adjust based on the number of desired GPUs
+# if not enough memory is available, some layers will automatically be offloaded to cpu
+device_map = calculate_offload_device_map(
+    MODEL_ID,
+    reserve_for_hessians=True,
+    num_gpus=2,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+)
+
+model = SparseAutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# define a llmcompressor recipe for W4A16 quantization
+# since the MoE gate layers are sensitive to quantization, we add them to the ignore
+# list so they remain at full precision
+recipe = "deepseek_recipe_w4a16.yaml"
+
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
+
+
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    save_compressed=True,
+    output_dir=SAVE_DIR,
+)
+
+# Confirm generations of the quantized model look sane.
+print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) +print(tokenizer.decode(output[0])) +print("==========================================") + + +# Run the model on vLLM +try: + from vllm import LLM, SamplingParams + + vllm_installed = True +except ImportError: + vllm_installed = False + +if vllm_installed: + print("vLLM installed, running using vLLM") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) + llm = LLM( + model=SAVE_DIR, + tensor_parallel_size=2, + trust_remote_code=True, + max_model_len=1042, + dtype=torch.half, + ) + prompts = [ + "The capital of France is", + "The president of the US is", + "My name is", + ] + + outputs = llm.generate(prompts, sampling_params) + print("================= vLLM GENERATION ======================") + for output in outputs: + assert output + prompt = output.prompt + generated_text = output.outputs[0].text + print("PROMPT", prompt) + print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml new file mode 100644 index 000000000..05e294365 --- /dev/null +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -0,0 +1,9 @@ +quant_stage: + quant_modifiers: + GPTQModifier: + sequential_update: true + ignore: [lm_head, "re:.*mlp.gate$"] + config_groups: + group_0: + weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} + targets: [Linear]