Add kv_cache e2e test inputs under tests/e2e/vLLM: three configs
(Phi-3 mini, TinyLlama, TinyLlama with the GPTQ recipe) and the two
recipes they reference (kv_cache/default.yaml, kv_cache/gptq.yaml).

Review note: kv_cache_tinyllama.yaml and kv_cache_tinyllama_gptq.yaml both
declare scheme "kv_cache_default_tinyllama" -- TODO confirm the shared
name is intentional and does not collide in whatever consumes "scheme".

diff --git a/tests/e2e/vLLM/configs/kv_cache_phi3.yaml b/tests/e2e/vLLM/configs/kv_cache_phi3.yaml
new file mode 100644
index 000000000..9637e83e1
--- /dev/null
+++ b/tests/e2e/vLLM/configs/kv_cache_phi3.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: microsoft/Phi-3-mini-4k-instruct
+recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+scheme: kv_cache_default_phi3
\ No newline at end of file
diff --git a/tests/e2e/vLLM/configs/kv_cache_tinyllama.yaml b/tests/e2e/vLLM/configs/kv_cache_tinyllama.yaml
new file mode 100644
index 000000000..8711d2a4d
--- /dev/null
+++ b/tests/e2e/vLLM/configs/kv_cache_tinyllama.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+scheme: kv_cache_default_tinyllama
\ No newline at end of file
diff --git a/tests/e2e/vLLM/configs/kv_cache_tinyllama_gptq.yaml b/tests/e2e/vLLM/configs/kv_cache_tinyllama_gptq.yaml
new file mode 100644
index 000000000..6dd112d3c
--- /dev/null
+++ b/tests/e2e/vLLM/configs/kv_cache_tinyllama_gptq.yaml
@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/kv_cache/gptq.yaml
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+scheme: kv_cache_default_tinyllama
\ No newline at end of file
diff --git a/tests/e2e/vLLM/recipes/kv_cache/default.yaml b/tests/e2e/vLLM/recipes/kv_cache/default.yaml
new file mode 100644
index 000000000..57e4dc285
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/kv_cache/default.yaml
@@ -0,0 +1,6 @@
+quant_stage:
+  quant_modifiers:
+    QuantizationModifier:
+      kv_cache_scheme:
+        {num_bits: 8, type: float, symmetric: true, strategy: tensor}
+    
\ No newline at end of file
diff --git a/tests/e2e/vLLM/recipes/kv_cache/gptq.yaml b/tests/e2e/vLLM/recipes/kv_cache/gptq.yaml
new file mode 100644
index 000000000..8c76de33a
--- /dev/null
+++ b/tests/e2e/vLLM/recipes/kv_cache/gptq.yaml
@@ -0,0 +1,17 @@
+quant_stage:
+  quant_modifiers:
+    QuantizationModifier:
+      kv_cache_scheme:
+        {num_bits: 8, type: float, symmetric: true, strategy: tensor}
+    GPTQModifier:
+      sequential_update: false
+      ignore: ["lm_head"]
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 4
+            type: "int"
+            symmetric: true
+            strategy: "channel"
+            actorder: False
+          targets: ["Linear"]