Running with the following script is not working properly.
Running with the recipe is working properly (a sketch of a possible recipe-file equivalent is included after the script below).
```python
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }

ds = ds.map(preprocess)
# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )

ds = ds.map(tokenize, remove_columns=ds.column_names)
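# After this map, each sample contains only the tokenizer outputs
# (input_ids and attention_mask), since remove_columns drops the
# original text/messages columns.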
# Configure algorithms. In this case, we:
# * apply SmoothQuant to make the activations easier to quantize
# * quantize the weights to int8 with GPTQ (static per channel)
# * quantize the activations to int8 (dynamic per token)
# Note: set sequential_update: true in the recipe to reduce memory
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]
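# Per the note above, sequential updates can reduce memory; with the
# Python modifier objects this would presumably be a keyword argument
# (a sketch, not verified against this llmcompressor version):
#
# recipe = [
#     SmoothQuantModifier(smoothing_strength=0.8),
#     GPTQModifier(
#         targets="Linear",
#         scheme="W8A8",
#         ignore=["lm_head"],
#         sequential_update=True,
#     ),
# ]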
# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")
# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
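For reference, here is a minimal sketch of the recipe-based run that is reported to work. The YAML contents, stage/group names, and the `recipe.yaml` filename are assumptions reconstructed from the modifier list in the script above, not copied from the issue; `oneshot` should also accept a recipe file path in place of modifier objects.

```python
# Hypothetical YAML recipe mirroring the Python modifier list above.
# Stage/group names and the filename are assumptions, not from the issue.
recipe_yaml = """
quant_stage:
    quant_modifiers:
        SmoothQuantModifier:
            smoothing_strength: 0.8
        GPTQModifier:
            sequential_update: true
            targets: ["Linear"]
            scheme: "W8A8"
            ignore: ["lm_head"]
"""

with open("recipe.yaml", "w") as f:
    f.write(recipe_yaml)

# Pass the recipe path instead of the in-Python modifier list.
oneshot(
    model=model,
    dataset=ds,
    recipe="recipe.yaml",
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
```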