From af0be23b2f273dbc1a94660253ab7ab466af69a2 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Tue, 25 Jun 2024 00:52:53 +0000
Subject: [PATCH] tweak W8A8

---
 examples/quantization/example-w8a8-int8.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/quantization/example-w8a8-int8.py b/examples/quantization/example-w8a8-int8.py
index 85d20d0e0..97a723062 100644
--- a/examples/quantization/example-w8a8-int8.py
+++ b/examples/quantization/example-w8a8-int8.py
@@ -3,7 +3,7 @@
 from transformers import AutoTokenizer
 
 # Select model and load it.
-MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = SparseAutoModelForCausalLM.from_pretrained(
     MODEL_ID, device_map="auto", torch_dtype='auto',
 )
@@ -77,6 +77,6 @@ def preprocess(example):
 print("==========================================\n\n")
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-DYNAMIC-PER-TOKEN"
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
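
For reference, a minimal usage sketch (not part of the patch) of how the compressed checkpoint this example writes might be served afterwards. The directory name assumes the SAVE_DIR the script now derives from the new MODEL_ID, and serving W8A8 compressed-tensors checkpoints with the installed vLLM build is an assumption here, not something the patch itself guarantees.

# Hypothetical follow-up, not part of the patch: load the compressed
# output directory and run a quick generation with vLLM.
# "Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token" is the SAVE_DIR the
# example computes for the new MODEL_ID.
from vllm import LLM, SamplingParams

llm = LLM(model="Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["What is W8A8 quantization?"], params)
print(outputs[0].outputs[0].text)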