From c7f35667b6071e8b724f7b9c5c70fd25c3c3a329 Mon Sep 17 00:00:00 2001 From: Nikolay Date: Wed, 13 Nov 2024 12:38:47 +0100 Subject: [PATCH 1/2] enabled dynamic quantization and kv cache precision u8 --- examples/llm_compression/openvino/tiny_llama/main.py | 2 +- .../openvino/tiny_llama_find_hyperparams/main.py | 2 +- tests/post_training/pipelines/lm_weight_compression.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llm_compression/openvino/tiny_llama/main.py b/examples/llm_compression/openvino/tiny_llama/main.py index d6d52f0578d..af0b8e611c8 100644 --- a/examples/llm_compression/openvino/tiny_llama/main.py +++ b/examples/llm_compression/openvino/tiny_llama/main.py @@ -67,7 +67,7 @@ def transform_fn(data, model, tokenizer): ) model.save_pretrained(OUTPUT_DIR) - model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"}) + model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"KV_CACHE_PRECISION": "u8"}) input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device) start_t = time.time() diff --git a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py index 081e99125b4..a92bfaf6d92 100644 --- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py +++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py @@ -246,8 +246,8 @@ def main(): ov_config = { "PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", + "KV_CACHE_PRECISION": "u8", "CACHE_DIR": "", - "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", } model = OVModelForCausalLM.from_pretrained( model_id, diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 3c45d9de636..1c486dbd4b4 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -290,7 +290,7 @@ def _validate(self): load_in_8bit=False, compile=False, stateful=is_stateful, - ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"}, + ov_config={"KV_CACHE_PRECISION": "u8"}, ) print("Evaluation of the target model") _, all_metrics = evaluator.score(compressed_model_hf) From 4fb83fe16fd86138d1f6689f8ef46e55127a682e Mon Sep 17 00:00:00 2001 From: Nikolay Date: Thu, 14 Nov 2024 16:13:45 +0100 Subject: [PATCH 2/2] updated references for u8 kv cache compression --- .../post_training/data/wc_reference_data.yaml | 60 +++++++++---------- .../data/wc_reference_data_2024.5.yaml | 34 +++++++---- 2 files changed, 53 insertions(+), 41 deletions(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 8fbc14a4396..3629938393c 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -1,46 +1,46 @@ -tinyllama_data_free_backend_OV: - metric_value: 0.73873 - num_int4: 114 - num_int8: 84 +tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV: + metric_value: 0.88264 + num_int4: 11 + num_int8: 290 + metrics_xfail_reason: "Issue-148819" +tinyllama_awq_backup_mode_none_backend_OV: + metric_value: 0.84781 + num_int4: 208 + num_int8: 0 tinyllama_data_aware_backend_OV: - metric_value: 0.85767 - num_int4: 94 - num_int8: 124 -tinyllama_data_aware_awq_stateful_backend_OV: - metric_value: 0.85571 + metric_value: 0.86141 num_int4: 94 num_int8: 124 tinyllama_data_aware_awq_scale_estimation_backend_OV: - metric_value: 0.86355 + metric_value: 0.87054 num_int4: 94 num_int8: 124 tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV: - metric_value: 0.86355 + metric_value: 0.85467 + num_int4: 94 + num_int8: 124 +tinyllama_data_aware_awq_stateful_backend_OV: + metric_value: 0.84118 num_int4: 94 num_int8: 124 -tinyllama_int8_data_free_backend_TORCH: - metric_value: 0.95624 - num_int4: 0 - num_int8: 312 tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV: - metric_value: 0.86697 + metric_value: 0.85716 num_int4: 94 num_int8: 124 metrics_xfail_reason: "Issue-148819" -tinyllama_scale_estimation_per_channel_backend_OV: - metric_value: 0.80798 - num_int4: 188 - num_int8: 124 tinyllama_data_aware_lora_stateful_backend_OV: - metric_value: 0.83446 + metric_value: 0.82979 num_int4: 94 num_int8: 500 -tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV: - metric_value: 0.87132 - num_int4: 11 - num_int8: 290 - metrics_xfail_reason: "Issue-148819" -tinyllama_awq_backup_mode_none_backend_OV: - metric_value: 0.85679 - num_int4: 208 - num_int8: 0 +tinyllama_data_free_backend_OV: + metric_value: 0.73795 + num_int4: 114 + num_int8: 84 +tinyllama_int8_data_free_backend_TORCH: + metric_value: 0.94014 + num_int4: 0 + num_int8: 312 +tinyllama_scale_estimation_per_channel_backend_OV: + metric_value: 0.82413 + num_int4: 188 + num_int8: 124 diff --git a/tests/post_training/data/wc_reference_data_2024.5.yaml b/tests/post_training/data/wc_reference_data_2024.5.yaml index e55a9f03dcd..d2c85eefad0 100644 --- a/tests/post_training/data/wc_reference_data_2024.5.yaml +++ b/tests/post_training/data/wc_reference_data_2024.5.yaml @@ -1,34 +1,46 @@ tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV: - metric_value: 0.88663 + metric_value: 0.86466 num_int4: 11 num_int8: 290 metrics_xfail_reason: "Issue-148819" -tinyllama_int4_data_free_backend_TORCH: - metric_value: 0.73873 - num_int4: 114 - num_int8: 84 tinyllama_awq_backup_mode_none_backend_OV: - metric_value: 0.84783 + metric_value: 0.84809 num_int4: 208 num_int8: 0 +tinyllama_data_aware_backend_OV: + metric_value: 0.85273 + num_int4: 94 + num_int8: 124 tinyllama_data_aware_awq_scale_estimation_backend_OV: - metric_value: 0.85502 + metric_value: 0.85473 num_int4: 94 num_int8: 124 tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV: - metric_value: 0.85502 + metric_value: 0.85882 num_int4: 94 num_int8: 124 tinyllama_data_aware_awq_stateful_backend_OV: - metric_value: 0.85616 + metric_value: 0.8433 num_int4: 94 num_int8: 124 tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV: - metric_value: 0.86503 + metric_value: 0.83747 num_int4: 94 num_int8: 124 metrics_xfail_reason: "Issue-148819" +tinyllama_data_aware_lora_stateful_backend_OV: + metric_value: 0.83219 + num_int4: 94 + num_int8: 500 +tinyllama_data_free_backend_OV: + metric_value: 0.70809 + num_int4: 114 + num_int8: 84 +tinyllama_int8_data_free_backend_TORCH: + metric_value: 0.94589 + num_int4: 0 + num_int8: 312 tinyllama_scale_estimation_per_channel_backend_OV: - metric_value: 0.81389 + metric_value: 0.81748 num_int4: 188 num_int8: 124 \ No newline at end of file