Fix quantization tests #29914
Dockerfile for the GPU CI image (installs the quantization libraries)
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).
 
-ARG PYTORCH='2.2.0'
+ARG PYTORCH='2.2.1'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu118'
 
@@ -30,6 +30,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
 
+# needed in bnb and awq
+RUN python3 -m pip install --no-cache-dir einops
+
 # Add bitsandbytes for mixed int8 testing
 RUN python3 -m pip install --no-cache-dir bitsandbytes
 
@@ -43,7 +46,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
 RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
 
 # Add autoawq for quantization testing
-RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.0/autoawq-0.2.0+cu118-cp38-cp38-linux_x86_64.whl
+# >=v0.2.3 needed for compatibility with torch 2.2.1
+RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl
Review comment: With torch 2.2.1, we need a newer version of autoawq. Otherwise, we have issues importing the kernels =(
 
 # Add quanto for quantization testing
 RUN python3 -m pip install --no-cache-dir quanto
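The review comment above points at kernels that fail to import when the autoawq wheel does not match the installed torch. A quick sanity check that could be run inside the built image, as a sketch — assuming the wheel's top-level package is `awq` and that its compiled kernels import as `awq_ext` (that module name is an assumption for illustration):

```python
# Sanity check for the torch / autoawq pairing described in the comment above.
# A wheel built against a different torch typically fails at import time
# (e.g. undefined symbols) rather than later in the tests.
import torch

print("torch:", torch.__version__)

try:
    import awq  # noqa: F401  top-level package installed by the autoawq wheel
    import awq_ext  # noqa: F401  compiled CUDA kernels; module name assumed
except ImportError as exc:
    raise SystemExit(f"AWQ kernel import failed: {exc}")

print("autoawq and its kernels imported cleanly")
```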
AWQ quantization test file (AwqTest / AwqFusedTest)
@@ -101,7 +101,7 @@ class AwqTest(unittest.TestCase):
 
     EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
     EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a"
 
+    EXPECTED_OUTPUT_EXLLAMA = "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out"
     device_map = "cuda"
 
     # called only once for all test in this class
@@ -200,11 +200,11 @@ def test_quantized_model_exllama(self):
 
         quantization_config = AwqConfig(version="exllama")
         quantized_model = AutoModelForCausalLM.from_pretrained(
-            self.model_name, quantization_config=quantization_config
-        ).to(torch_device)
+            self.model_name, quantization_config=quantization_config, device_map=torch_device
+        )
 
         output = quantized_model.generate(**input_ids, max_new_tokens=40)
-        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA)
 
     def test_quantized_model_no_device_map(self):
         """
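The hunk above replaces `.to(torch_device)` on the quantized model with a `device_map` argument passed at load time. A minimal sketch of that loading pattern outside the test harness (the checkpoint id is an assumed example, not necessarily the one the test uses):

```python
# Load an AWQ checkpoint with the exllama kernels, placing it on a device via
# `device_map` at load time instead of calling `.to()` on the quantized model.
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

model_id = "TheBloke/Mistral-7B-OpenOrca-AWQ"  # assumption: any AWQ checkpoint
quantization_config = AwqConfig(version="exllama")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="cuda",  # replaces the previous `.to(torch_device)` call
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```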
@@ -239,7 +239,7 @@ def test_quantized_model_multi_gpu(self):
 
         quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
 
-        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1, 2, 3})
+        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
 
         output = quantized_model.generate(**input_ids, max_new_tokens=40)
 
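The expected placement set shrinks from {0, 1, 2, 3} to {0, 1}, presumably because the multi-GPU runners now expose two GPUs rather than four. For illustration, this is roughly what the assertion inspects (the checkpoint id is an assumed example):

```python
# With device_map="auto", accelerate shards the model across the visible GPUs
# and records each module's placement in `model.hf_device_map`; on a two-GPU
# machine the set of device indices is {0, 1}.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-OpenOrca-AWQ",  # assumption: any AWQ checkpoint
    device_map="auto",
)

print(set(model.hf_device_map.values()))  # e.g. {0, 1} with two visible GPUs
```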
@@ -272,8 +272,8 @@ class AwqFusedTest(unittest.TestCase):
     model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
     model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
 
-    custom_mapping_model_id = "TheBloke/Yi-34B-AWQ"
-    custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589"
+    custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
+    custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"
Review comment on lines +275 to +276: I changed to a Mistral model since the Yi model is based on remote code and we had breaking changes (Dynamic Cache). This should be fine since we want to test if we can pass a custom fuse mapping.
 
     mixtral_model_name = "casperhansen/mixtral-instruct-awq"
     mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
@@ -287,8 +287,8 @@ class AwqFusedTest(unittest.TestCase):
         "You end up exactly where you started. Where are you?"
     )
 
-    EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for"
-    EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org"
+    EXPECTED_GENERATION = prompt + "\n\nYou are at the starting point.\n\nIf"
+    EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
     EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
 
     def tearDown(self):
@@ -423,35 +423,33 @@ def test_generation_custom_model(self):
             fuse_max_seq_len=512,
             modules_to_fuse={
                 "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
-                "layernorm": ["ln1", "ln2", "norm"],
                 "mlp": ["gate_proj", "up_proj", "down_proj"],
+                "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
                 "use_alibi": False,
-                "num_attention_heads": 56,
-                "hidden_size": 4096,
+                "num_attention_heads": 32,
+                "num_key_value_heads": 8,
+                "hidden_size": 7168,
             },
         )
 
         model = AutoModelForCausalLM.from_pretrained(
             self.custom_mapping_model_id,
             quantization_config=quantization_config,
-            trust_remote_code=True,
             device_map="balanced",
             revision=self.custom_model_revision,
         )
 
         self._check_fused_modules(model)
 
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True
-        )
+        tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)
 
         prompt = "Hello"
         inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
 
         outputs = model.generate(**inputs, max_new_tokens=12)
         self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
 
     @unittest.skip("Not enough GPU memory on CI runners")
     @require_torch_multi_gpu
     def test_generation_mixtral_fused(self):
         """
Review comment: seems a lot simpler, no?

Review comment: $(find "$(pwd)/quantization" -mindepth 1 -maxdepth 1 -type d | sort)

Review comment: Yes indeed! I'll switch to your solution =)

Review comment: Make sure we don't have the prefix (here /home/arthur/transformers/, but whatever it is on a given system) in the outputs.
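A rough Python equivalent of the `find` one-liner above, which also addresses the prefix concern in the last comment by emitting paths relative to the working directory (the `tests/quantization` layout is an assumption for illustration):

```python
# Rough Python equivalent of:
#   $(find "$(pwd)/quantization" -mindepth 1 -maxdepth 1 -type d | sort)
# but printing paths relative to the working directory, so no machine-specific
# prefix (like the /home/arthur/transformers/ mentioned above) leaks into the
# output used by the CI matrix.
from pathlib import Path

root = Path("tests/quantization")  # assumed layout for illustration
folders = sorted(str(p) for p in root.iterdir() if p.is_dir())
print(folders)  # relative paths only, no absolute prefix
```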