From 7937c2fd52716c668bb9ed433813ca623d59d6d9 Mon Sep 17 00:00:00 2001 From: sunyicode0012 <116338547+sunyicode0012@users.noreply.github.com> Date: Tue, 20 May 2025 00:49:57 +0800 Subject: [PATCH 001/192] Add files via uploadAdd fused MoE kernel tuning configs (fp8_w8a8) for DeepSeek V3/R1 on a single-node 8x NVIDIA H20 96GB setup (#18337) --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..3e0ad0d5a989 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 
1, + "num_warps": 4, + "num_stages": 3 + } +} From 81712218341ce09d555579829e8903e7a9aa4880 Mon Sep 17 00:00:00 2001 From: Gong Shufan <2624542821@qq.com> Date: Tue, 20 May 2025 00:51:01 +0800 Subject: [PATCH 002/192] [Misc] Fix typo (#18330) --- .../lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh index 831ef0bb574b..5719fa821292 100644 --- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh +++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh @@ -54,6 +54,6 @@ elif [[ $1 == "decoder" ]]; then else echo "Invalid role: $1" - echo "Should be either prefill, decode" + echo "Should be either prefiller, decoder" exit 1 fi From dc1440cf9f8f6233a3c464e1a01daa12207f8680 Mon Sep 17 00:00:00 2001 From: Satyajith Chilappagari Date: Mon, 19 May 2025 09:54:47 -0700 Subject: [PATCH 003/192] Neuron up mistral (#18222) Signed-off-by: Satyajith Chilappagari --- tests/neuron/2_core/test_mistral.py | 32 +++++++++++++++++++ .../model_loader/neuronx_distributed.py | 3 ++ vllm/platforms/neuron.py | 3 +- 3 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 tests/neuron/2_core/test_mistral.py diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py new file mode 100644 index 000000000000..8acd082f2ded --- /dev/null +++ b/tests/neuron/2_core/test_mistral.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams + + +def test_mistral(): + llm = LLM(model="mistralai/Mistral-7B-v0.1", + tensor_parallel_size=2, + max_num_seqs=4, + max_model_len=512, + use_v2_block_manager=True, + override_neuron_config={ + "sequence_parallel_enabled": False, + "skip_warmup": True + }, + device="neuron") + + prompts = [ + "The president of the United States is", + "The capital of France is", + ] + outputs = llm.generate(prompts, SamplingParams(top_k=1)) + + expected_outputs = [ + " the most powerful person in the world. He is the head of state " + "and head", + " a city of many faces. It is a city of history, culture, art" + ] + + for expected_output, output in zip(expected_outputs, outputs): + generated_text = output.outputs[0].text + assert (expected_output == generated_text) diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index fee8c10b6c2f..b98cea7fe6e1 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -48,6 +48,9 @@ # Models supported by Neuronx distributed for inference. _NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = { "LlamaForCausalLM": + ("neuronx_distributed_inference.models.llama.modeling_llama", + "NeuronLlamaForCausalLM"), + "MistralForCausalLM": ("neuronx_distributed_inference.models.llama.modeling_llama", "NeuronLlamaForCausalLM"), "DbrxForCausalLM": diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 71f7c718cdf9..e08337b8391d 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -51,8 +51,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: assert (vllm_config.lora_config is None), "LoRA is not supported for Neuron backend." 
- cache_config = vllm_config.cache_config - if cache_config: + if vllm_config.cache_config and vllm_config.model_config: # neuron needs block_size = max_model_len vllm_config.cache_config.block_size = \ vllm_config.model_config.max_model_len # type: ignore From 258bf621d5e533b01026b73fbfb31b746f68684f Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 19 May 2025 13:42:35 -0700 Subject: [PATCH 004/192] fix CUDA_check redefinition in #17918 (#18287) Signed-off-by: Lucia Fang Co-authored-by: Lucia (Lu) Fang --- csrc/cutlass_extensions/common.hpp | 9 --------- csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh | 8 +++++--- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 0877da52435e..195872e8edd3 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -15,15 +15,6 @@ cutlassGetStatusString(error)); \ } -/** - * Panic wrapper for unwinding CUDA runtime errors - */ -#define CUDA_CHECK(status) \ - { \ - cudaError_t error = status; \ - TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \ - } - inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { int max_shared_mem_per_block_opt_in = 0; cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh index 9c8a50332ad0..c22523da4e43 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh @@ -8,6 +8,8 @@ #include +#include "cuda_utils.h" + #include "cutlass/cutlass.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" @@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm { // clang-format off using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, - ElementAB, cutlass::layout::RowMajor, AlignmentAB, - ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, + cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, + ElementAB, cutlass::layout::RowMajor, AlignmentAB, + ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, ElementAcc, TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp; From d565e0976fb5ffd353727066ac8aa98272e318af Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Mon, 19 May 2025 16:30:32 -0700 Subject: [PATCH 005/192] [neuron] fix authorization issue (#18364) Signed-off-by: Liangfu Chen --- .buildkite/scripts/hardware_ci/run-neuron-test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh index ec6a080eb499..c0b9dd8dadba 100644 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh @@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" HF_CACHE="$(realpath ~)/huggingface" mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" +HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN) NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" mkdir -p "${NEURON_COMPILE_CACHE_URL}" NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" # Try building the docker image -aws ecr get-login-password --region us-west-2 | docker login --username AWS 
--password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws # prune old image and containers to save disk space, and only once a day # by using a timestamp file in tmp. @@ -47,6 +48,7 @@ trap remove_docker_container EXIT docker run --rm -it --device=/dev/neuron0 --network bridge \ -v "${HF_CACHE}:${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \ + -e "HF_TOKEN=${HF_TOKEN}" \ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ From f07a673eb2fc4eb6f4e18eadb3512702877f5c3a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 20 May 2025 11:20:12 +0800 Subject: [PATCH 006/192] [Misc] Allow `AutoWeightsLoader` to skip loading weights with specific substr in name (#18358) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/test_utils.py | 70 +++++++++++++++++++++ vllm/model_executor/models/granite.py | 16 ++--- vllm/model_executor/models/grok1.py | 10 +-- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/models/mixtral_quant.py | 5 +- vllm/model_executor/models/nemotron.py | 11 +--- vllm/model_executor/models/olmo.py | 16 +---- vllm/model_executor/models/olmo2.py | 16 +---- vllm/model_executor/models/olmoe.py | 5 +- vllm/model_executor/models/orion.py | 11 +--- vllm/model_executor/models/phi4mm.py | 4 +- vllm/model_executor/models/phimoe.py | 5 +- vllm/model_executor/models/qwen2_moe.py | 5 +- vllm/model_executor/models/qwen3_moe.py | 5 +- vllm/model_executor/models/solar.py | 11 +--- vllm/model_executor/models/stablelm.py | 10 +-- vllm/model_executor/models/starcoder2.py | 5 +- vllm/model_executor/models/utils.py | 18 +++++- 18 files changed, 116 insertions(+), 109 deletions(-) diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py index d61c7d2d5000..a16384efe195 100644 --- a/tests/models/test_utils.py +++ b/tests/models/test_utils.py @@ -77,3 +77,73 @@ def weight_generator(): assert torch.all( new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var) assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1 + + +def test_module_skip_prefix(): + """Ensure the auto weight loader can skip prefix.""" + mod = ModuleWithNestedBatchNorm() + # Run some data through the module with batchnorm + mod(torch.Tensor([[1, 2], [3, 4]])) + + # Try to load the weights to a new instance + def weight_generator(): + # weights needed to be filtered out + redundant_weights = { + "prefix.bn.weight": torch.Tensor([1, 2]), + "prefix.bn.bias": torch.Tensor([3, 4]), + } + yield from (mod.state_dict() | redundant_weights).items() + + new_mod = ModuleWithNestedBatchNorm() + + assert not torch.all( + new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean) + assert not torch.all( + new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var) + assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0 + + loader = AutoWeightsLoader(new_mod, skip_prefixes=["prefix."]) + loader.load_weights(weight_generator()) + + # Ensure the stats are updated + assert torch.all( + new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean) + assert torch.all( + new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var) + assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1 + + +def test_module_skip_substr(): + """Ensure the auto weight loader can skip prefix.""" + mod = ModuleWithNestedBatchNorm() + # Run some data through the module 
with batchnorm + mod(torch.Tensor([[1, 2], [3, 4]])) + + # Try to load the weights to a new instance + def weight_generator(): + # weights needed to be filtered out + redundant_weights = { + "nested_mod.0.substr.weight": torch.Tensor([1, 2]), + "nested_mod.0.substr.bias": torch.Tensor([3, 4]), + "nested_mod.substr.weight": torch.Tensor([1, 2]), + "nested_mod.substr.bias": torch.Tensor([3, 4]), + } + yield from (mod.state_dict() | redundant_weights).items() + + new_mod = ModuleWithNestedBatchNorm() + + assert not torch.all( + new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean) + assert not torch.all( + new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var) + assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0 + + loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."]) + loader.load_weights(weight_generator()) + + # Ensure the stats are updated + assert torch.all( + new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean) + assert torch.all( + new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var) + assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1 diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index eed0820a5779..c49db653f735 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -478,18 +478,14 @@ def make_empty_intermediate_tensors( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - skip_prefixes = [ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - "rotary_emb.cos_cached", - "rotary_emb.sin_cached", - ] # With tie_word_embeddings, we can skip lm_head.weight # The weight might appear unnecessarily in the files if the model is # processed with quantization, LoRA, fine-tuning, etc. 
- if self.config.tie_word_embeddings: - skip_prefixes.append("lm_head.weight") + skip_prefixes = (["lm_head."] + if self.config.tie_word_embeddings else None) - loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + loader = AutoWeightsLoader( + self, + skip_prefixes=skip_prefixes, + ) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 6d2d16d098d4..578d31a851a9 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -550,10 +550,12 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - skip_prefixes = ["rotary_emb.inv_freq"] # Skip lm_head when tie_word_embeddings is True - if self.config.tie_word_embeddings: - skip_prefixes.append("lm_head") + skip_prefixes = (["lm_head"] + if self.config.tie_word_embeddings else None) - loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + loader = AutoWeightsLoader( + self, + skip_prefixes=skip_prefixes, + ) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1968bf9e68af..4823808e8906 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -482,5 +482,5 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"]) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index b6a0c9ec6fc1..f096f6a7996d 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -447,8 +447,5 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["rotary_emb.inv_freq"]), - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 0b5a102ea1f2..c5c5155a2df5 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -502,14 +502,5 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=([ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - "rotary_emb.cos_cached", - "rotary_emb.sin_cached" - ]), - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 26ca770d8493..fcb7c619a102 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -382,19 +382,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, - skip_prefixes=([ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - "rotary_emb.cos_cached", - "rotary_emb.sin_cached", - "lm_head.weight" - ] if self.config.tie_word_embeddings else [ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. 
- "rotary_emb.cos_cached", - "rotary_emb.sin_cached" - ]), + skip_prefixes=(["lm_head.weight"] + if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index e4dc0e0cc411..0a1fb10c186e 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -403,19 +403,7 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, - skip_prefixes=([ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - "rotary_emb.cos_cached", - "rotary_emb.sin_cached", - "lm_head.weight" - ] if self.config.tie_word_embeddings else [ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - "rotary_emb.cos_cached", - "rotary_emb.sin_cached" - ]), + skip_prefixes=(["lm_head.weight"] + if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 9a07f57fd999..6364b89fb837 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -442,8 +442,5 @@ def compute_logits(self, hidden_states: torch.Tensor, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=["rotary_emb.inv_freq"], - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 1ccd1fe1f741..da2a194e6bdf 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -344,14 +344,5 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=([ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. 
- "rotary_emb.cos_cached", - "rotary_emb.sin_cached" - ]), - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index b7bb3c45c633..418ff900ffd5 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1228,9 +1228,7 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: - weights = ((name, data) for name, data in weights - if "lora" not in name) - loader = AutoWeightsLoader(self) + loader = AutoWeightsLoader(self, skip_substrs=["lora"]) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 7f2e9fdf7c4e..d9917c26d1b1 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -660,8 +660,5 @@ def compute_logits(self, hidden_states: torch.Tensor, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["rotary_emb.inv_freq"]), - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 7cf98dc7a4ea..143b9f98b029 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -535,8 +535,5 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["rotary_emb.inv_freq"]), - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index aae5401721df..8a4c2850dda3 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -530,8 +530,5 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["rotary_emb.inv_freq"]), - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 8c78c846302a..53e5274aa574 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -500,14 +500,5 @@ def compute_logits(self, hidden_states: torch.Tensor, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=([ - "rotary_emb.inv_freq", - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - "rotary_emb.cos_cached", - "rotary_emb.sin_cached" - ]), - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 8c2ad6f19251..86ce813ddf3d 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -338,13 +338,5 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. 
- skip_prefixes=[ - "rotary_emb.inv_freq", "rotary_emb.cos_cached", - "rotary_emb.sin_cached" - ], - ) + loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 5927afa91f49..f4ba5a8030e5 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -349,8 +349,7 @@ def load_weights(self, weights: Iterable[tuple[str, self, # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. - skip_prefixes=([ - "rotary_emb.inv_freq", "lm_head.weight" - ] if self.config.tie_word_embeddings else ["rotary_emb.inv_freq"]), + skip_prefixes=(["lm_head.weight"] + if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 5cc501622891..027cd748e9de 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -80,18 +80,30 @@ class AutoWeightsLoader: environment variable ``VLLM_LOGGING_LEVEL=DEBUG``. """ + # Models trained using early version ColossalAI + # may include these tensors in checkpoint. Skip them. + ROTARY_EMBEDS_UNUSED_WEIGHTS = [ + "rotary_emb.inv_freq", + "rotary_emb.cos_cached", + "rotary_emb.sin_cached", + ] + def __init__( self, module: nn.Module, *, skip_prefixes: Optional[list[str]] = None, + skip_substrs: Optional[list[str]] = None, ignore_unexpected_prefixes: Optional[list[str]] = None, ) -> None: super().__init__() self.module = module self.skip_prefixes = skip_prefixes or [] + self.skip_substrs = skip_substrs or [] self.ignore_unexpected_prefixes = ignore_unexpected_prefixes or [] + # update default skip_substrs + self.skip_substrs += self.ROTARY_EMBEDS_UNUSED_WEIGHTS def _groupby_prefix( self, @@ -119,7 +131,8 @@ def _get_qualname(self, prefix: str, rest: str) -> str: return ".".join((prefix, rest)) def _can_skip(self, qualname: str) -> bool: - return any(qualname.startswith(p) for p in self.skip_prefixes) + return (any(qualname.startswith(p) for p in self.skip_prefixes) + or any(substr in qualname for substr in self.skip_substrs)) def _can_ignore_unexpected(self, qualname: str) -> bool: return any( @@ -257,6 +270,9 @@ def load_weights( ) -> set[str]: if mapper is not None: weights = mapper.apply(weights) + # filter out weights with first-prefix/substr to skip in name + weights = ((name, weight) for name, weight in weights + if not self._can_skip(name)) autoloaded_weights = set(self._load_module("", self.module, weights)) return autoloaded_weights From 9609327fa436dac549de75daad42bf48d2be354d Mon Sep 17 00:00:00 2001 From: Nan Qin Date: Mon, 19 May 2025 22:21:27 -0500 Subject: [PATCH 007/192] [Core] [Bugfix]: tensor parallel with prompt embeds (#18171) Signed-off-by: Nan2018 Co-authored-by: Andrew Sansom --- .../test_basic_correctness.py | 75 +++++++++++-- tests/conftest.py | 9 ++ vllm/sequence.py | 14 +-- vllm/worker/model_runner.py | 100 ++++++++++-------- 4 files changed, 136 insertions(+), 62 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 9f3b0e8ae079..86b5e1e0ab7c 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -8,12 +8,13 @@ from unittest.mock import Mock import pytest +import torch -from vllm import LLM +from vllm import LLM, envs from vllm.platforms import current_platform from 
vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1 -from ..conftest import VllmRunner +from ..conftest import HfRunner, VllmRunner from ..models.utils import check_outputs_equal from ..utils import multi_gpu_test @@ -43,11 +44,26 @@ def test_vllm_gc_ed(): assert weak_llm() is None +def _fix_prompt_embed_outputs( + vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner, + example_prompts: list[str]) -> list[tuple[list[int], str]]: + fixed_vllm_outputs = [] + for vllm_output, hf_input, prompt in zip( + vllm_outputs, hf_model.get_inputs(example_prompts), + example_prompts): + hf_input_ids = hf_input["input_ids"].tolist()[0] + fixed_vllm_outputs.append( + (hf_input_ids + vllm_output[0][len(hf_input_ids):], + prompt + vllm_output[1])) + return fixed_vllm_outputs + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) def test_models( monkeypatch: pytest.MonkeyPatch, hf_runner, @@ -56,8 +72,13 @@ def test_models( dtype: str, max_tokens: int, enforce_eager: bool, + enable_prompt_embeds: bool, ) -> None: + if enable_prompt_embeds and envs.is_set( + "VLLM_USE_V1") and envs.VLLM_USE_V1: + pytest.skip("enable_prompt_embeds is not supported in v1.") + if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") @@ -78,14 +99,25 @@ def test_models( with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + if enable_prompt_embeds: + with torch.no_grad(): + prompt_embeds = hf_model.get_prompt_embeddings( + example_prompts) with VllmRunner(model, max_model_len=8192, dtype=dtype, enforce_eager=enforce_eager, + enable_prompt_embeds=enable_prompt_embeds, gpu_memory_utilization=0.7) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) + if enable_prompt_embeds: + vllm_outputs = vllm_model.generate_greedy( + prompt_embeds, max_tokens) + vllm_outputs = _fix_prompt_embed_outputs( + vllm_outputs, hf_model, example_prompts) + else: + vllm_outputs = vllm_model.generate_greedy( + example_prompts, max_tokens) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -108,6 +140,7 @@ def test_models( ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"), ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ]) +@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) def test_models_distributed( monkeypatch: pytest.MonkeyPatch, hf_runner, @@ -117,14 +150,22 @@ def test_models_distributed( distributed_executor_backend: str, attention_backend: str, test_suite: str, + enable_prompt_embeds: bool, ) -> None: + if enable_prompt_embeds and envs.is_set( + "VLLM_USE_V1") and envs.VLLM_USE_V1: + pytest.skip("enable_prompt_embeds is not supported in v1.") + if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") with monkeypatch.context() as monkeypatch_context: if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa - # test Ray Compiled Graph + if enable_prompt_embeds: + pytest.skip( + "enable_prompt_embeds does not work with ray compiled dag." 
+ ) monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") @@ -147,12 +188,26 @@ def test_models_distributed( dtype=dtype, tensor_parallel_size=2, distributed_executor_backend=distributed_executor_backend, + enable_prompt_embeds=enable_prompt_embeds, + gpu_memory_utilization=0.7, ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + if enable_prompt_embeds: + with hf_runner(model, dtype=dtype) as hf_model: + with torch.no_grad(): + prompt_embeds = hf_model.get_prompt_embeddings( + example_prompts) + vllm_outputs = vllm_model.generate_greedy( + prompt_embeds, max_tokens) + vllm_outputs = _fix_prompt_embed_outputs( + vllm_outputs, hf_model, example_prompts) + hf_outputs = hf_model.generate_greedy( + example_prompts, max_tokens) + else: + vllm_outputs = vllm_model.generate_greedy( + example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy( + example_prompts, max_tokens) check_outputs_equal( outputs_0_lst=hf_outputs, diff --git a/tests/conftest.py b/tests/conftest.py index c5700179c228..19c2c6247129 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -430,6 +430,15 @@ def get_inputs( return all_inputs + def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]: + all_inputs = self.get_inputs(prompts) + embeddings = [] + for inputs in all_inputs: + input_ids = self.wrap_device(inputs)["input_ids"] + embedding = self.model.get_input_embeddings()(input_ids).squeeze(0) + embeddings.append(embedding) + return embeddings + def classify(self, prompts: list[str]) -> list[str]: # output is final logits all_inputs = self.get_inputs(prompts) diff --git a/vllm/sequence.py b/vllm/sequence.py index 5aa9ae62f542..f5f9c56a7db2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -112,12 +112,12 @@ class RequestMetrics: will include model forward, block/sync across workers, cpu-gpu sync time and sampling time. spec_token_acceptance_counts: number of accepted speculative tokens at - each position; the first token is from + each position; the first token is from the target model and is always accepted; - e.g., when it's [10, 8, 4, 2] for a req, + e.g., when it's [10, 8, 4, 2] for a req, it means there were 10 forward passes in - total, and there were 8, 4, 2 accepted - tokens at 1st, 2nd, 3rd speculation step. + total, and there were 8, 4, 2 accepted + tokens at 1st, 2nd, 3rd speculation step. """ arrival_time: float last_token_time: float @@ -714,9 +714,9 @@ class SequenceGroup: trace_headers: OpenTelemetry trace headers. prompt_adapter_request: Prompt Adapter request. priority: User-defined priority of the request. - draft_size: The number of speculative tokens plus one from the target + draft_size: The number of speculative tokens plus one from the target model; equal to max number of tokens a step can generate - for single-draft speculative decoding but larger than + for single-draft speculative decoding but larger than that for multi-draft SD (currently not supported). 
""" @@ -1123,7 +1123,7 @@ def __repr__(self) -> str: self.output_embed.shape if self.output_embed is not None else None return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " f"output_token={self.output_token}, " - f"output_embed.shape={output_embed_shape}" + f"output_embed.shape={output_embed_shape}, " f"logprobs={self.logprobs})") def __eq__(self, other: object) -> bool: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8a294de45c81..12025617e512 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -23,7 +23,7 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_pp_group +from vllm.distributed import broadcast_tensor_dict, get_pp_group from vllm.distributed.kv_transfer import get_kv_transfer_group from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, graph_capture) @@ -872,7 +872,7 @@ def build(self) -> ModelInputForGPU: """ # Combine and flatten intermediate data. input_tokens = list[int]() - inputs_embeds_lst = list[torch.Tensor]() + inputs_embeds_list = list[torch.Tensor]() token_types = list[int]() for inter_data in self.inter_data_list: for cur_input_tokens in inter_data.input_tokens: @@ -880,15 +880,15 @@ def build(self) -> ModelInputForGPU: for cur_token_types in inter_data.token_types: token_types.extend(cur_token_types) if inter_data.inputs_embeds is not None: - inputs_embeds_lst.append( + inputs_embeds_list.append( inter_data.inputs_embeds.to( dtype=self.runner.model_config.dtype, device=self.runner.device)) inputs_embeds: Optional[torch.Tensor] - if len(inputs_embeds_lst) == 0: + if len(inputs_embeds_list) == 0: inputs_embeds = None else: - inputs_embeds = torch.cat(inputs_embeds_lst, dim=0).to( + inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to( dtype=self.runner.model_config.dtype, device=self.runner.device) assert len(inputs_embeds) == len(input_tokens) @@ -1893,50 +1893,60 @@ def execute_model( logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) - if not self.is_driver_worker: - return [] + if self.is_driver_worker: + if model_input.async_callback is not None: + model_input.async_callback() - if model_input.async_callback is not None: - model_input.async_callback() + # Sample the next token. + assert isinstance(self.sampler, Sampler) + orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor + if model_input.inputs_embeds is not None: + self.sampler.include_gpu_probs_tensor = True - # Sample the next token. 
- assert isinstance(self.sampler, Sampler) - orig_include_gpu_probs_tensor = self.sampler.include_gpu_probs_tensor - if model_input.inputs_embeds is not None: - self.sampler.include_gpu_probs_tensor = True - - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the latency - # from the start time of the driver worker to the end time of the - # driver worker. The model forward time will then end up covering - # the communication time as well. - output.model_forward_time = (orig_model_forward_time + - model_forward_time) + output: SamplerOutput = self.sampler( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time + and output is not None): + model_forward_end.synchronize() + model_forward_time = model_forward_start.elapsed_time( + model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() + # If there are multiple workers, we are still tracking the + # latency from the start time of the driver worker to the end + # time of the driver worker. The model forward time will then + # end up covering the communication time as well. + output.model_forward_time = (orig_model_forward_time + + model_forward_time) if model_input.inputs_embeds is not None: - self.sampler.include_gpu_probs_tensor = \ - orig_include_gpu_probs_tensor - if output.sampled_token_ids is not None: - output.sampled_token_embeds = self.model.get_input_embeddings( - output.sampled_token_ids.squeeze(1)) - - for token_embed, sequence_group_output in zip( - output.sampled_token_embeds, output.outputs): - assert len(sequence_group_output.samples) == 1 - sequence_group_output.samples[0].output_embed = token_embed + if self.is_driver_worker: + sampled = broadcast_tensor_dict( + {"token_ids": output.sampled_token_ids}) + else: + sampled = broadcast_tensor_dict() + if sampled["token_ids"] is not None: + sampled_token_embeds = self.model.get_input_embeddings( + sampled["token_ids"].squeeze(1)) + if self.is_driver_worker: + self.sampler.include_gpu_probs_tensor = \ + orig_include_gpu_probs + + output.sampled_token_embeds = sampled_token_embeds + + for token_embed, sequence_group_output in zip( + output.sampled_token_embeds, output.outputs): + assert len(sequence_group_output.samples) == 1 + sequence_group_output.samples[ + 0].output_embed = token_embed + + if not self.is_driver_worker: + return [] if self.return_hidden_states: # we only need to pass hidden states of most recent token From d981396778d70726d7f9311ac54a18844b06d1a2 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Mon, 19 May 2025 23:49:23 -0700 Subject: [PATCH 008/192] [release] Change dockerhub username for TPU release (#18389) --- .buildkite/release-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 2118cf4595eb..b3c27e2c99c2 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -64,7 +64,7 @@ steps: - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" plugins: - docker-login#v3.0.0: - username: vllm + username: vllmbot password-env: DOCKERHUB_TOKEN env: DOCKER_BUILDKIT: "1" From bca55b556f6a0cc7f8d75d5e12205be44eacd381 Mon Sep 17 00:00:00 2001 From: Random Fly Date: Tue, 20 May 2025 15:54:33 +0800 Subject: [PATCH 009/192] [Bugfix] fix adding bias twice in ipex GPTQ quantization (#18363) Signed-off-by: rand-fly --- vllm/model_executor/layers/quantization/ipex_quant.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 8bce6bba460a..b7baa3d3363b 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -181,8 +181,6 @@ def apply(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: reshaped_x = x.reshape(-1, x.shape[-1]) out = layer.ipex_qlinear(reshaped_x) - if bias is not None: - out.add_(bias) return out.reshape(x.shape[:-1] + (layer.ipex_output_size, )) From 1b1e8e05ff3c26b98e4161bd3c8671e86fb145f4 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 20 May 2025 16:53:27 +0800 Subject: [PATCH 010/192] [doc] update env variable export (#18391) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/source/getting_started/quickstart.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 298ba59f7d8b..42468ff73c2c 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -82,6 +82,11 @@ llm = LLM(model="facebook/opt-125m") :::{note} By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. + +```shell +export VLLM_USE_MODELSCOPE=True +``` + ::: Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. 
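For reference alongside the quickstart passage above: `llm.generate` returns one `RequestOutput` per input prompt, each carrying the original prompt and the generated candidates. A minimal sketch of consuming those objects (the model name, prompts, and sampling settings here are illustrative placeholders, not part of the patch):

```python
from vllm import LLM, SamplingParams

# Illustrative setup; any generative model supported by vLLM works here.
llm = LLM(model="facebook/opt-125m")
prompts = ["Hello, my name is", "The capital of France is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

# generate() enqueues all prompts, runs the engine, and returns a list of
# RequestOutput objects in the same order as the inputs.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
```
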
From 6b35cb10a0127b7fd9cd23346ac0a4ced72fed3c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 20 May 2025 18:27:30 +0800 Subject: [PATCH 011/192] [Misc] Add LoRA code owner (#18387) Signed-off-by: Jee Jee Li --- .github/CODEOWNERS | 2 ++ tests/quantization/test_bitsandbytes.py | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 76aa5f7a35d5..a37bdb0f4d9e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -13,6 +13,7 @@ /vllm/model_executor/guided_decoding @mgoin @russellb /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson +/vllm/lora @jeejeelee CMakeLists.txt @tlrmchlsmth # vLLM V1 @@ -40,3 +41,4 @@ CMakeLists.txt @tlrmchlsmth /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb /tests/v1/structured_output @mgoin @russellb /tests/weight_loading @mgoin @youkaichao +/tests/lora @jeejeelee diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 0f20f42d8650..e8ddfd7fc779 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -37,12 +37,6 @@ ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"), ] -models_pre_quant_8bit_to_test = [ - ('meta-llama/Llama-Guard-3-8B-INT8', - 'read pre-quantized llama 8-bit model'), - ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"), -] - @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), reason='bitsandbytes is not supported on this GPU type.') From d6c86d09aecb910fd336ba83ede70265ee81149a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 20 May 2025 18:53:23 +0800 Subject: [PATCH 012/192] Update cpu.txt (#18398) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- requirements/cpu.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 752931158a05..d89847fe71fd 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -2,6 +2,8 @@ -r common.txt # Dependencies for CPUs +packaging>=24.2 +setuptools>=77.0.3,<80.0.0 --extra-index-url https://download.pytorch.org/whl/cpu torch==2.7.0+cpu; platform_machine == "x86_64" torch==2.7.0; platform_system == "Darwin" From 86847700d74b227b3717aeb9a8bcfcdc352697bf Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 20 May 2025 21:51:12 +0800 Subject: [PATCH 013/192] [CI] Add mteb testing to test the accuracy of the embedding model (#17175) --- requirements/test.in | 1 + requirements/test.txt | 22 +++++++++- .../openai/correctness/test_mteb.py | 42 +++++++++++++++++++ tests/models/language/pooling/test_gte.py | 2 - tests/models/language/pooling/test_nomic.py | 1 - .../pooling/test_snowflake_arctic_embed.py | 1 - 6 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 tests/entrypoints/openai/correctness/test_mteb.py diff --git a/requirements/test.in b/requirements/test.in index cdc7c563f087..87af61769038 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -33,6 +33,7 @@ num2words # required for smolvlm test opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test +mteb>=1.38.11, <2 # required for mteb test transformers==4.51.3 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. 
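The mteb test added later in this patch (`tests/entrypoints/openai/correctness/test_mteb.py`, shown below) imports helpers such as `OpenAIClientMtebEncoder` and `run_mteb_embed_task` from `tests/models/language/pooling/mteb_utils.py`, which is not part of this diff. A rough sketch of the pattern those names suggest, assuming the upstream `mteb` package and the standard OpenAI Python client; this is illustrative, not the actual contents of that helper module:

```python
import mteb
import numpy as np


class OpenAIClientEncoder:
    """Adapts an OpenAI-compatible embeddings endpoint to mteb's encode() API."""

    def __init__(self, model_name: str, client):
        self.model_name = model_name
        self.client = client

    def encode(self, sentences, **kwargs) -> np.ndarray:
        # The OpenAI client returns one embedding per input sentence.
        resp = self.client.embeddings.create(input=list(sentences),
                                             model=self.model_name)
        return np.array([item.embedding for item in resp.data])


def run_embed_tasks(encoder, task_names: list[str]):
    # Run the selected MTEB tasks against the encoder; each returned task
    # result carries that task's aggregated main score.
    tasks = mteb.get_tasks(tasks=task_names)
    return mteb.MTEB(tasks=tasks).run(encoder)
```
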
diff --git a/requirements/test.txt b/requirements/test.txt index 9a15d9a0d824..89d477017342 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -99,6 +99,7 @@ datasets==3.0.2 # via # evaluate # lm-eval + # mteb decorator==5.1.1 # via librosa dill==0.3.8 @@ -124,6 +125,8 @@ email-validator==2.2.0 # via pydantic encodec==0.1.1 # via vocos +eval-type-backport==0.2.2 + # via mteb evaluate==0.4.3 # via lm-eval fastparquet==2024.11.0 @@ -291,6 +294,8 @@ msgpack==1.1.0 # via # librosa # ray +mteb==1.38.11 + # via -r requirements/test.in multidict==6.1.0 # via # aiohttp @@ -331,6 +336,7 @@ numpy==1.26.4 # librosa # matplotlib # mistral-common + # mteb # numba # numexpr # opencv-python-headless @@ -443,6 +449,8 @@ plotly==5.24.1 # via genai-perf pluggy==1.5.0 # via pytest +polars==1.29.0 + # via mteb pooch==1.8.2 # via librosa portalocker==2.10.1 @@ -476,6 +484,7 @@ pydantic==2.9.2 # via # datamodel-code-generator # mistral-common + # mteb pydantic-core==2.23.4 # via pydantic pygments==2.18.0 @@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0 # typepy python-rapidjson==1.20 # via tritonclient +pytrec-eval-terrier==0.5.7 + # via mteb pytz==2024.2 # via # pandas @@ -564,6 +575,7 @@ requests==2.32.3 # huggingface-hub # lm-eval # mistral-common + # mteb # pooch # ray # responses @@ -580,6 +592,7 @@ rfc3987==1.3.8 rich==13.9.4 # via # genai-perf + # mteb # typer rouge-score==0.1.2 # via lm-eval @@ -607,16 +620,20 @@ scikit-learn==1.5.2 # via # librosa # lm-eval + # mteb # sentence-transformers scipy==1.13.1 # via # librosa + # mteb # scikit-learn # sentence-transformers # statsmodels # vocos sentence-transformers==3.2.1 - # via -r requirements/test.in + # via + # -r requirements/test.in + # mteb sentencepiece==0.2.0 # via mistral-common setuptools==77.0.3 @@ -696,6 +713,7 @@ torch==2.7.0+cu128 # fastsafetensors # lm-eval # mamba-ssm + # mteb # peft # runai-model-streamer # sentence-transformers @@ -720,6 +738,7 @@ tqdm==4.66.6 # evaluate # huggingface-hub # lm-eval + # mteb # nltk # peft # pqdm @@ -759,6 +778,7 @@ typing-extensions==4.12.2 # huggingface-hub # librosa # mistral-common + # mteb # pqdm # pydantic # pydantic-core diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py new file mode 100644 index 000000000000..b702e0acd38b --- /dev/null +++ b/tests/entrypoints/openai/correctness/test_mteb.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +import math +import os + +import pytest + +from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, + OpenAIClientMtebEncoder, + run_mteb_embed_task, + run_mteb_embed_task_st) +from tests.utils import RemoteOpenAIServer + +os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" + +MODEL_NAME = "BAAI/bge-m3" +DTYPE = "float16" +MAIN_SCORE = 0.7873427091972599 + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", "embed", "--dtype", DTYPE, "--enforce-eager", + "--max-model-len", "512" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +def test_mteb(server): + client = server.get_client() + encoder = OpenAIClientMtebEncoder(MODEL_NAME, client) + vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS) + st_main_score = MAIN_SCORE or run_mteb_embed_task_st( + MODEL_NAME, MTEB_EMBED_TASKS) + + print("VLLM main score: ", vllm_main_score) + print("SentenceTransformer main score: ", st_main_score) + print("Difference: ", st_main_score - vllm_main_score) + + assert math.isclose(st_main_score, vllm_main_score, 
rel_tol=1e-4) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 3ccf2999664c..b60d27aaa72b 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -58,8 +58,6 @@ @pytest.mark.parametrize("model_info", MODELS) def test_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - pytest.skip("Skipping mteb test.") - from .mteb_utils import mteb_test_embed_models vllm_extra_kwargs: dict[str, Any] = {} diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 6e9de30f977d..28df32e0c230 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -23,7 +23,6 @@ @pytest.mark.parametrize("model_info", MODELS) def test_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - pytest.skip("Skipping mteb test.") from .mteb_utils import mteb_test_embed_models mteb_test_embed_models(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index 7d9c3c73d852..5679e0e1ce00 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -46,7 +46,6 @@ def test_models_mteb( vllm_runner, model_info: EmbedModelInfo, ) -> None: - pytest.skip("Skipping mteb test.") from .mteb_utils import mteb_test_embed_models mteb_test_embed_models(hf_runner, vllm_runner, model_info) From be48360c1fb9284804f9e1cae23b58e23e762877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=87=83?= Date: Tue, 20 May 2025 21:59:48 +0800 Subject: [PATCH 014/192] [Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18407) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 松灵 --- vllm/worker/model_runner.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 12025617e512..15f40bcef896 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -729,8 +729,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( seq_group_metadata, range(positions[0], positions[0] + len(positions))) - if not mm_kwargs: - return inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps @@ -741,12 +739,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, video_grid_thw = mm_kwargs.get("video_grid_thw", None) audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", None) - assert ( - image_grid_thw is not None or video_grid_thw is not None - or audio_feature_lengths is not None), ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw' or " - "'audio_feature_lengths'.") second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) From 8f55962a7ffcf310ecb462f3a547e593e4fd77bd Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 20 May 2025 23:26:12 +0800 Subject: [PATCH 015/192] [Misc] refactor prompt embedding examples (#18405) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/source/features/prompt_embeds.md | 104 +----------------- 
.../prompt_embed_inference.py | 103 +++++++++++++++++ ...ompt_embed_inference_with_openai_client.py | 86 +++++++++++++++ 3 files changed, 191 insertions(+), 102 deletions(-) create mode 100644 examples/offline_inference/prompt_embed_inference.py create mode 100644 examples/online_serving/prompt_embed_inference_with_openai_client.py diff --git a/docs/source/features/prompt_embeds.md b/docs/source/features/prompt_embeds.md index 4e4648d171d5..9d7b242bbe51 100644 --- a/docs/source/features/prompt_embeds.md +++ b/docs/source/features/prompt_embeds.md @@ -20,59 +20,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPromp You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples: -```python -from vllm import LLM -import transformers - -model_name = "meta-llama/Llama-3.2-1B-Instruct" - -# Transformers -tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) -transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name) - -llm = LLM(model=model_name, enable_prompt_embeds=True) - -# Refer to the HuggingFace repo for the correct format to use -chat = [{"role": "user", "content": "Please tell me about the capital of France."}] -token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') - -embedding_layer = transformers_model.get_input_embeddings() -prompt_embeds = embedding_layer(token_ids).squeeze(0) - -# Single prompt inference -outputs = llm.generate({ - "prompt_embeds": prompt_embeds, -}) - -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -# Batch inference - -chats = [ - [{"role": "user", "content": "Please tell me about the capital of France."}], - [{"role": "user", "content": "When is the day longest during the year?"}], - [{"role": "user", "content": "Where is bigger, the moon or the sun?"}] -] - -token_ids_list = [ - tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats -] -prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list] - -outputs = llm.generate( - [ - { - "prompt_embeds": prompt_embeds, - } for prompt_embeds in prompt_embeds_list - ] -) - -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) -``` + ## Online Serving @@ -93,52 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \ Then, you can use the OpenAI client as follows: -```python -from openai import OpenAI -import transformers -import torch - -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -model_name = "meta-llama/Llama-3.2-1B-Instruct" - -# Transformers -tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) -transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name) - - -# Refer to the HuggingFace repo for the correct format to use -chat = [{"role": "user", "content": "Please tell me about the capital of France."}] -token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') - -embedding_layer = transformers_model.get_input_embeddings() -prompt_embeds = embedding_layer(token_ids).squeeze(0) - -# Prompt embeddings -buffer = io.BytesIO() -torch.save(prompt_embeds, buffer) -buffer.seek(0) -binary_data = buffer.read() -encoded_embeds = base64.b64encode(binary_data).decode('utf-8') - - 
-completion = client_with_prompt_embeds.completions.create( - model=model_name, - # NOTE: The OpenAI client does not allow `None` as an input to - # `prompt`. Use an empty string if you have no text prompts. - prompt="", - max_tokens=5, - temperature=0.0, - # NOTE: The OpenAI client allows passing in extra JSON body via the - # `extra_body` argument. - extra_body={"prompt_embeds": encoded_embeds} -) - -print(completion.choices[0].text) -``` + diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py new file mode 100644 index 000000000000..99c5a682fb27 --- /dev/null +++ b/examples/offline_inference/prompt_embed_inference.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Demonstrates how to generate prompt embeddings using +Hugging Face Transformers and use them as input to vLLM +for both single and batch inference. + +Model: meta-llama/Llama-3.2-1B-Instruct +Note: This model is gated on Hugging Face Hub. + You must request access to use it: + https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct + +Requirements: +- vLLM +- transformers + +Run: + python examples/offline_inference/prompt_embed_inference.py +""" + +import torch +from transformers import (AutoModelForCausalLM, AutoTokenizer, + PreTrainedTokenizer) + +from vllm import LLM + + +def init_tokenizer_and_llm(model_name: str): + tokenizer = AutoTokenizer.from_pretrained(model_name) + transformers_model = AutoModelForCausalLM.from_pretrained(model_name) + embedding_layer = transformers_model.get_input_embeddings() + llm = LLM(model=model_name, enable_prompt_embeds=True) + return tokenizer, embedding_layer, llm + + +def get_prompt_embeds(chat: list[dict[str, + str]], tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module): + token_ids = tokenizer.apply_chat_template(chat, + add_generation_prompt=True, + return_tensors='pt') + prompt_embeds = embedding_layer(token_ids).squeeze(0) + return prompt_embeds + + +def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module): + chat = [{ + "role": "user", + "content": "Please tell me about the capital of France." + }] + prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer) + + outputs = llm.generate({ + "prompt_embeds": prompt_embeds, + }) + + print("\n[Single Inference Output]") + print("-" * 30) + for o in outputs: + print(o.outputs[0].text) + print("-" * 30) + + +def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module): + chats = [[{ + "role": "user", + "content": "Please tell me about the capital of France." + }], + [{ + "role": "user", + "content": "When is the day longest during the year?" + }], + [{ + "role": "user", + "content": "Where is bigger, the moon or the sun?" 
+ }]] + + prompt_embeds_list = [ + get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats + ] + + outputs = llm.generate([{ + "prompt_embeds": embeds + } for embeds in prompt_embeds_list]) + + print("\n[Batch Inference Outputs]") + print("-" * 30) + for i, o in enumerate(outputs): + print(f"Q{i+1}: {chats[i][0]['content']}") + print(f"A{i+1}: {o.outputs[0].text}\n") + print("-" * 30) + + +def main(): + model_name = "meta-llama/Llama-3.2-1B-Instruct" + tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name) + single_prompt_inference(llm, tokenizer, embedding_layer) + batch_prompt_inference(llm, tokenizer, embedding_layer) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py new file mode 100644 index 000000000000..ea580f1b432b --- /dev/null +++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +vLLM OpenAI-Compatible Client with Prompt Embeddings + +This script demonstrates how to: +1. Generate prompt embeddings using Hugging Face Transformers +2. Encode them in base64 format +3. Send them to a vLLM server via the OpenAI-compatible Completions API + +Run the vLLM server first: +vllm serve meta-llama/Llama-3.2-1B-Instruct \ + --task generate \ + --max-model-len 4096 \ + --enable-prompt-embeds + +Run the client: +python examples/online_serving/prompt_embed_inference_with_openai_client.py + +Model: meta-llama/Llama-3.2-1B-Instruct +Note: This model is gated on Hugging Face Hub. + You must request access to use it: + https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct + +Dependencies: +- transformers +- torch +- openai +""" +import base64 +import io + +import torch +import transformers +from openai import OpenAI + + +def main(): + client = OpenAI( + api_key="EMPTY", + base_url="http://localhost:8000/v1", + ) + + model_name = "meta-llama/Llama-3.2-1B-Instruct" + + # Transformers + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + transformers_model = transformers.AutoModelForCausalLM.from_pretrained( + model_name) + + # Refer to the HuggingFace repo for the correct format to use + chat = [{ + "role": "user", + "content": "Please tell me about the capital of France." + }] + token_ids = tokenizer.apply_chat_template(chat, + add_generation_prompt=True, + return_tensors='pt') + + embedding_layer = transformers_model.get_input_embeddings() + prompt_embeds = embedding_layer(token_ids).squeeze(0) + + # Prompt embeddings + buffer = io.BytesIO() + torch.save(prompt_embeds, buffer) + buffer.seek(0) + binary_data = buffer.read() + encoded_embeds = base64.b64encode(binary_data).decode('utf-8') + + completion = client.completions.create( + model=model_name, + # NOTE: The OpenAI client does not allow `None` as an input to + # `prompt`. Use an empty string if you have no text prompts. + prompt="", + max_tokens=5, + temperature=0.0, + # NOTE: The OpenAI client allows passing in extra JSON body via the + # `extra_body` argument. 
+ extra_body={"prompt_embeds": encoded_embeds}) + + print("-" * 30) + print(completion.choices[0].text) + print("-" * 30) + + +if __name__ == "__main__": + main() From f4a8a3746575ea62fd3e0b4bd29ebd782582836d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 20 May 2025 12:08:37 -0400 Subject: [PATCH 016/192] [Minor] Rename quantization nvfp4 to modelopt_fp4 (#18356) Signed-off-by: mgoin --- tests/models/quantization/test_nvfp4.py | 6 +++--- vllm/config.py | 2 +- vllm/model_executor/layers/quantization/__init__.py | 4 ++-- vllm/model_executor/layers/quantization/modelopt.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index f94f3457c377..510858c2d7ef 100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -41,8 +41,8 @@ reason= "Prevent unstable test based on golden strings from breaking the build " " and test input model being too large and hanging the system.") -@pytest.mark.skipif(not is_quant_method_supported("nvfp4"), - reason="nvfp4 is not supported on this GPU type.") +@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"), + reason="modelopt_fp4 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: model = LLM( @@ -50,7 +50,7 @@ def test_models(example_prompts, model_name) -> None: max_model_len=MAX_MODEL_LEN, trust_remote_code=True, enforce_eager=True, - quantization="nvfp4", + quantization="modelopt_fp4", ) tokenizer = AutoTokenizer.from_pretrained(model_name) diff --git a/vllm/config.py b/vllm/config.py index a185a75c6bf3..5382e9a16829 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -824,7 +824,7 @@ def _verify_quantization(self) -> None: optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8", - "quark", "nvfp4", "bitblas", "gptq_bitblas" + "quark", "modelopt_fp4", "bitblas", "gptq_bitblas" ] if self.quantization is not None: self.quantization = cast(QuantizationMethods, diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index a22f8103e8fd..407b9c72f41d 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -14,7 +14,7 @@ "ptpc_fp8", "fbgemm_fp8", "modelopt", - "nvfp4", + "modelopt_fp4", "marlin", "bitblas", "gguf", @@ -120,7 +120,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "fp8": Fp8Config, "fbgemm_fp8": FBGEMMFp8Config, "modelopt": ModelOptFp8Config, - "nvfp4": ModelOptNvFp4Config, + "modelopt_fp4": ModelOptNvFp4Config, "marlin": MarlinConfig, "bitblas": BitBLASConfig, "gguf": GGUFConfig, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 13957a96deca..b108b02a43e2 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -192,7 +192,7 @@ def __init__( @classmethod def get_name(cls) -> QuantizationMethods: - return "nvfp4" + return "modelopt_fp4" @classmethod def get_supported_act_dtypes(cls) -> list[torch.dtype]: From e1f5a71ed74b0e232e237070c14b50542ad15298 Mon Sep 17 00:00:00 2001 From: Calvin Chen <45745657+calvin0327@users.noreply.github.com> Date: Wed, 21 May 2025 00:40:05 +0800 Subject: [PATCH 017/192] 
[Model] use AutoWeightsLoader for bloom (#18300) Signed-off-by: calvin chen <120380290@qq.com> --- vllm/model_executor/models/bloom.py | 79 +++++++++++++++++------------ 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index eb1085d6b40d..10424e218fbc 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -43,7 +43,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP, SupportsQuant, SupportsV0Only -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -229,6 +229,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + self.config = config self.embed_dim = config.hidden_size @@ -278,6 +279,38 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + + if "query_key_value" in name: + # NOTE: BLOOM's fused QKV's output_dim has the shape of + # (num_heads * 3 * head_size), while the + # required shape is (3 * num_heads * head_size). + # Thus, we need weight conversion. + output_dim = getattr(param, "output_dim", None) + num_heads = self.config.num_attention_heads + if output_dim is not None: + loaded_weight_shape = loaded_weight.shape + loaded_weight = loaded_weight.view( + loaded_weight_shape[:output_dim] + (num_heads, 3, -1) + + loaded_weight_shape[output_dim + 1:]) + loaded_weight = loaded_weight.transpose( + output_dim, output_dim + 1) + loaded_weight = loaded_weight.reshape(loaded_weight_shape) + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant): @@ -325,35 +358,15 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if name == "lm_head.weight": - continue - if not name.startswith("transformer."): - name = "transformer." + name - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - - if "query_key_value" in name: - # NOTE: BLOOM's fused QKV's output_dim has the shape of - # (num_heads * 3 * head_size), while the - # required shape is (3 * num_heads * head_size). - # Thus, we need weight conversion. 
- output_dim = getattr(param, "output_dim", None) - num_heads = self.config.num_attention_heads - if output_dim is not None: - loaded_weight_shape = loaded_weight.shape - loaded_weight = loaded_weight.view( - loaded_weight_shape[:output_dim] + (num_heads, 3, -1) + - loaded_weight_shape[output_dim + 1:]) - loaded_weight = loaded_weight.transpose( - output_dim, output_dim + 1) - loaded_weight = loaded_weight.reshape(loaded_weight_shape) - - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self, skip_prefixes=["lm_head.weight"]) + weights = _add_transformer_prefix(weights) + return loader.load_weights(weights) + + +def _add_transformer_prefix( + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: + for name, tensor in weights: + if not name.startswith('transformer.'): + name = 'transformer.' + name + yield name, tensor From 980a172474fa0f32433dda87ae1fa4aadba24c51 Mon Sep 17 00:00:00 2001 From: Percy Date: Tue, 20 May 2025 13:19:34 -0500 Subject: [PATCH 018/192] [Kernel] update comment for KV shape in unified triton attn (#18099) Signed-off-by: haochengxia --- vllm/attention/ops/triton_unified_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 241e84ca669d..4bced779785a 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -31,8 +31,8 @@ def apply_softcap(S, x): def kernel_unified_attention_2d( output_ptr, # [num_tokens, num_query_heads, head_size] query_ptr, # [num_tokens, num_query_heads, head_size] - key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] - value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] seq_lens_ptr, # [num_seqs] alibi_slopes_ptr, # [num_query_heads] From 23baa2180b0ebba5ae94073ba9b8e93f88b75486 Mon Sep 17 00:00:00 2001 From: Dilip Gowda Bhagavan <110233170+dilipgb@users.noreply.github.com> Date: Wed, 21 May 2025 03:52:24 +0530 Subject: [PATCH 019/192] fix:Build torch wheel inline rather than picking from nightly (#18351) Signed-off-by: Dilip Gowda Bhagavan --- docker/Dockerfile.s390x | 32 +++++++++++++++++++++++++++++--- requirements/cpu.txt | 1 - 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x index 9c10cd56b594..4e89bb3057c5 100644 --- a/docker/Dockerfile.s390x +++ b/docker/Dockerfile.s390x @@ -84,16 +84,40 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ rustup default stable && \ rustup show +FROM python-install AS torch +ARG TORCH_VERSION=2.7.0 +ENV export _GLIBCXX_USE_CXX11_ABI=1 +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +WORKDIR /tmp + +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + git clone https://github.com/pytorch/pytorch.git && \ + cd pytorch && \ + git checkout v2.7.0 && \ + git submodule sync && \ + git submodule update --init --recursive && \ + uv pip install cmake ninja && \ + uv pip 
install -r requirements.txt && \ + python setup.py bdist_wheel + + FROM python-install AS torch-vision # Install torchvision -ARG TORCH_VERSION=2.7.0.dev20250304 +ARG TORCH_VERSION=2.7.0 ARG TORCH_VISION_VERSION=v0.20.1 WORKDIR /tmp RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \ git clone https://github.com/pytorch/vision.git && \ cd vision && \ git checkout $TORCH_VISION_VERSION && \ - uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ + TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \ + uv pip install -v $TORCH_WHL_FILE && \ python setup.py bdist_wheel FROM python-install AS hf-xet-builder @@ -138,15 +162,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \ + --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \ sed -i '/^torch/d' requirements/build.txt && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \ + TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \ uv pip install -v \ $ARROW_WHL_FILE \ $VISION_WHL_FILE \ $HF_XET_WHL_FILE \ - --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + $TORCH_WHL_FILE \ --index-strategy unsafe-best-match \ -r requirements/build.txt \ -r requirements/cpu.txt diff --git a/requirements/cpu.txt b/requirements/cpu.txt index d89847fe71fd..d4191888382c 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -8,7 +8,6 @@ setuptools>=77.0.3,<80.0.0 torch==2.7.0+cpu; platform_machine == "x86_64" torch==2.7.0; platform_system == "Darwin" torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" -torch==2.7.0.dev20250304; platform_machine == "s390x" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" From 3b17ea26e41b16a72935cdab7b7a771bfa1c25ef Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 20 May 2025 22:52:27 -0400 Subject: [PATCH 020/192] [TPU] Re-enable the Pallas MoE kernel (#18025) Signed-off-by: Michael Goin --- requirements/tpu.txt | 10 +++++----- vllm/model_executor/layers/fused_moe/layer.py | 3 +-- .../layers/fused_moe/moe_pallas.py | 20 +++++++++++++++++-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 11501bc5d92f..3b204a8f9905 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -18,9 +18,9 @@ setuptools==78.1.0 --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.8.0.dev20250430 -torchvision==0.22.0.dev20250430 -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.8.0.dev20250518 +torchvision==0.22.0.dev20250518 +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f1cb77f64eae..31efe16d1c27 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -50,8 +50,7 @@ else: from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk if current_platform.is_tpu(): - # the iterative moe implementation is used until the moe_pallas is fixed - from .moe_torch_iterative import fused_moe as fused_moe_pallas + from .moe_pallas import fused_moe as fused_moe_pallas else: fused_moe_pallas = None # type: ignore logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 8f28b64ed487..babeb97308a9 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -2,7 +2,23 @@ import torch import torch.nn.functional as F -from torch_xla.experimental.custom_kernel import _histogram + + +def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor: + """ + Compute the histogram of a int32 tensor. The bin edges are defined by the + min and max values, with step = 1. + """ + assert input.dtype == torch.int32, "input must be of torch.int32 dtype." + assert min <= max, "min must be less than or equal to max." 
+ + def searchsorted(sorted_sequence: torch.Tensor, + values_to_search: torch.Tensor) -> torch.Tensor: + return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1) + + bin_edges = torch.linspace(min, max, max - min + 1, + dtype=input.dtype).to(input.device) + return searchsorted(bin_edges, input).to(torch.int32) def fused_moe( @@ -61,7 +77,7 @@ def fused_moe( x = torch.ops.xla.gmm(x, w2, group_sizes) x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) - x = x * topk_weights.unsqueeze_(dim=-1) + x = x * topk_weights.unsqueeze(dim=-1) x = x.sum(dim=-2) x = x.reshape(orig_shape) return x From 0c15c2e4868173642cec766c9819a210aef5e518 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 21 May 2025 00:04:33 -0400 Subject: [PATCH 021/192] [Bugfix] config.head_dim is now explicitly set to None (#18432) Signed-off-by: Gregory Shtrasberg --- vllm/distributed/kv_transfer/kv_connector/utils.py | 5 +++-- vllm/model_executor/models/exaone.py | 5 +++-- vllm/model_executor/models/granite.py | 5 +++-- vllm/model_executor/models/minimax_text_01.py | 10 ++++++---- vllm/model_executor/models/mixtral.py | 5 +++-- vllm/model_executor/models/mixtral_quant.py | 5 +++-- vllm/model_executor/models/nemotron.py | 5 +++-- vllm/model_executor/models/solar.py | 5 +++-- 8 files changed, 27 insertions(+), 18 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 0b0ce9828a74..b1c9c9af6e23 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -44,8 +44,9 @@ def get_model_args(self, model_executable: torch.nn.Module): head_size = model_config.qk_nope_head_dim + \ model_config.qk_rope_head_dim else: - head_size = getattr(model_config, "head_dim", - int(hidden_size // num_attention_heads)) + head_size = getattr(model_config, "head_dim", None) + if head_size is None: + head_size = int(hidden_size // num_attention_heads) return num_heads, head_size diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 4ffd06319684..838560692bcf 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -127,8 +127,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c49db653f735..3524d036db22 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -122,8 +122,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * 
self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.attention_multiplier diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 0285402dadf7..7724e52c1ce1 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -604,8 +604,9 @@ def __init__( rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", - config.hidden_size // config.num_attention_heads) + head_dim = getattr(config, "head_dim", None) + if head_dim is None: + head_dim = config.hidden_size // config.num_attention_heads if hasattr(config, "max_model_len") and isinstance( config.max_model_len, int): max_position_embeddings = min(config.max_position_embeddings, @@ -861,8 +862,9 @@ def layer_fn(prefix): cache_shape=self.cache_shape) rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", - config.hidden_size // config.num_attention_heads) + head_dim = getattr(config, "head_dim", None) + if head_dim is None: + head_dim = config.hidden_size // config.num_attention_heads if hasattr(config, "max_model_len") and isinstance( config.max_model_len, int): max_position_embeddings = min(config.max_position_embeddings, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4823808e8906..9bc7a16153e1 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -138,8 +138,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MixtralConfig has an optional head_dim argument - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index f096f6a7996d..8220200d270c 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -193,8 +193,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MixtralConfig has an optional head_dim argument - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index c5c5155a2df5..d0999e30e1ba 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -158,8 +158,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * 
self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 53e5274aa574..fcd17cc1c2ba 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -126,8 +126,9 @@ def __init__( assert tp_size % self.total_num_kv_heads == 0 self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) # MistralConfig has an optional head_dim introduced by Mistral-Nemo - self.head_dim = getattr(config, "head_dim", - self.hidden_size // self.total_num_heads) + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 From 92247c522e216f9d010db1c648dc783dbf141704 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 21 May 2025 01:37:08 -0400 Subject: [PATCH 022/192] [Bug] Fix moe_sum signature (#18440) Signed-off-by: Bill Nell --- csrc/moe/torch_bindings.cpp | 2 +- tests/kernels/moe/test_moe.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 810026d034c0..05f515e2e783 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { // Calculate the result of moe by summing up the partial results // from all selected experts. - m.def("moe_sum(Tensor! input, Tensor output) -> ()"); + m.def("moe_sum(Tensor input, Tensor! output) -> ()"); m.impl("moe_sum", torch::kCUDA, &moe_sum); // Aligning the number of tokens to be processed by each expert such diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 43ddc79fcb81..9a8ac242af79 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -575,3 +575,21 @@ def test_moe_align_block_size_opcheck(): opcheck(torch.ops._moe_C.moe_align_block_size, (topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad)) + + +@pytest.mark.parametrize("m", [1, 33, 222, 1024 * 128]) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("k", [128, 511, 1024]) +@pytest.mark.parametrize("dtype", + [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") +def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): + input = torch.randn((m, topk, k), device="cuda", dtype=dtype) + actual = torch.empty((m, k), device="cuda", dtype=dtype) + + expected = input.sum(dim=1) + torch.ops._moe_C.moe_sum(input, actual) + + torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0) + + opcheck(torch.ops._moe_C.moe_sum, (input, actual)) From ad0012a0ac507973656cfcf5750d603af4a5fdcc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 21 May 2025 13:39:22 +0800 Subject: [PATCH 023/192] Revert "[Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18407)" (#18456) Signed-off-by: DarkLight1337 --- vllm/worker/model_runner.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 15f40bcef896..12025617e512 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -729,6 +729,8 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mm_kwargs, 
placeholder_maps = MultiModalPlaceholderMap.from_seq_group( seq_group_metadata, range(positions[0], positions[0] + len(positions))) + if not mm_kwargs: + return inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps @@ -739,6 +741,12 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, video_grid_thw = mm_kwargs.get("video_grid_thw", None) audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", None) + assert ( + image_grid_thw is not None or video_grid_thw is not None + or audio_feature_lengths is not None), ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw' or " + "'audio_feature_lengths'.") second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) From d06dd72ba9d87794f545b08907426fe083965ade Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Wed, 21 May 2025 00:41:44 -0500 Subject: [PATCH 024/192] [Bugfix][Failing Test] Fix nixl connector test when promt size < block size (#18429) Signed-off-by: wwl2755 --- .../kv_transfer/kv_connector/v1/nixl_connector.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 9c2e82b29c76..b00f097110b0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -259,6 +259,15 @@ def build_connector_meta( # Loop through scheduled reqs and convert to ReqMeta. for req_id, (req, block_ids) in self._reqs_need_recv.items(): assert req.kv_transfer_params is not None + # For the case where there are no remote blocks to pull + # (block_ids is empty), we don't need to schedule + # an async read on the worker side. 
+ if not block_ids: + logger.debug( + "Skipping adding request %s to NixlConnectorMetadata, " + "as there are no remote blocks to pull", req_id) + continue + meta.add_new_req( request_id=req_id, local_block_ids=block_ids, From cd8dfc6dfc832fc4bc8ea0c9b01ad92d677c75bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 21 May 2025 07:48:43 +0200 Subject: [PATCH 025/192] [Misc] MultiConnector._connectors type (#18423) Signed-off-by: nicklucche --- vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index cea454a0b597..0aabb260fd3d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -40,7 +40,7 @@ class MultiConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) - self._connectors = [] + self._connectors: list[KVConnectorBase_V1] = [] ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( "connectors") assert ktcs is not None From 5d7f5452044c2a21dcd75fd8d4266a25052fe43b Mon Sep 17 00:00:00 2001 From: Kebe Date: Wed, 21 May 2025 16:21:17 +0800 Subject: [PATCH 026/192] [Frontend] deprecate `--device` arg (#18399) Signed-off-by: Kebe --- docs/source/getting_started/installation/gpu/xpu.inc.md | 1 - vllm/config.py | 6 +++++- vllm/engine/arg_utils.py | 6 ++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 4ab41a21c2a1..74937a184227 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -66,7 +66,6 @@ XPU platform supports **tensor parallel** inference/serving and also supports ** python -m vllm.entrypoints.openai.api_server \ --model=facebook/opt-13b \ --dtype=bfloat16 \ - --device=xpu \ --max_model_len=1024 \ --distributed-executor-backend=ray \ --pipeline-parallel-size=2 \ diff --git a/vllm/config.py b/vllm/config.py index 5382e9a16829..3fa1db0e8390 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2201,7 +2201,11 @@ class DeviceConfig: """Configuration for the device to use for vLLM execution.""" device: Union[Device, torch.device] = "auto" - """Device type for vLLM execution.""" + """Device type for vLLM execution. + This parameter is deprecated and will be + removed in a future release. + It will now be set automatically based + on the current platform.""" device_type: str = field(init=False) """Device type from the current platform. 
This is set in `__post_init__`.""" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f0c6b15b79da..91a34cb4dd59 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -737,7 +737,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: title="DeviceConfig", description=DeviceConfig.__doc__, ) - device_group.add_argument("--device", **device_kwargs["device"]) + device_group.add_argument("--device", + **device_kwargs["device"], + deprecated=True) # Speculative arguments speculative_group = parser.add_argument_group( @@ -977,7 +979,7 @@ def create_engine_config( from vllm.platforms import current_platform current_platform.pre_register_and_update() - device_config = DeviceConfig(device=self.device) + device_config = DeviceConfig(device=current_platform.device_type) model_config = self.create_model_config() # * If VLLM_USE_V1 is unset, we enable V1 for "supported features" From 907f935de9f3b6be9d793db32d7b92877419c402 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 21 May 2025 01:21:49 -0700 Subject: [PATCH 027/192] [V1] Fix general plugins not loaded in engine for multiproc (#18326) Signed-off-by: Yong Hoon Shin --- vllm/v1/engine/core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0cf2383af1c9..2234b069621d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -57,6 +57,10 @@ def __init__(self, executor_fail_callback: Optional[Callable] = None): assert vllm_config.model_config.runner_type != "pooling" + # plugins need to be loaded at the engine/scheduler level too + from vllm.plugins import load_general_plugins + load_general_plugins() + self.vllm_config = vllm_config logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) From 107f5fc4cb2f1adf3679b4305d9953cb90840e7b Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 21 May 2025 19:10:14 +0800 Subject: [PATCH 028/192] [Misc] refactor disaggregated-prefill-v1 example (#18474) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- .../disaggregated-prefill-v1/README.md | 1 + .../decode_example.py | 76 +++++++++------- .../prefill_example.py | 90 +++++++++++-------- 3 files changed, 96 insertions(+), 71 deletions(-) diff --git a/examples/offline_inference/disaggregated-prefill-v1/README.md b/examples/offline_inference/disaggregated-prefill-v1/README.md index f708eb253838..9cbdb19820f5 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/README.md +++ b/examples/offline_inference/disaggregated-prefill-v1/README.md @@ -5,5 +5,6 @@ This example contains scripts that demonstrate disaggregated prefill in the offl ## Files - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. + - Make sure you are in the `examples/offline_inference/disaggregated-prefill-v1` directory before running `run.sh`. - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. 
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py index 11918f72feec..531c96f176a3 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -3,35 +3,47 @@ from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -# Read prompts from output.txt -prompts = [] -try: - with open("output.txt") as f: - for line in f: - prompts.append(line.strip()) - print(f"Loaded {len(prompts)} prompts from output.txt") -except FileNotFoundError: - print("Error: output.txt file not found") - exit(-1) - -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - -llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - max_num_batched_tokens=64, - max_num_seqs=16, - kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", - kv_role="kv_both", - kv_connector_extra_config={ - "shared_storage_path": "local_storage" - })) #, max_model_len=2048, max_num_batched_tokens=2048) - -# 1ST generation (prefill instance) -outputs = llm.generate(prompts, sampling_params) - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def read_prompts(): + """Read prompts from output.txt""" + prompts = [] + try: + with open("output.txt") as f: + for line in f: + prompts.append(line.strip()) + print(f"Loaded {len(prompts)} prompts from output.txt") + return prompts + except FileNotFoundError: + print("Error: output.txt file not found") + exit(-1) + + +def main(): + prompts = read_prompts() + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + max_num_batched_tokens=64, + max_num_seqs=16, + kv_transfer_config=KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "shared_storage_path": "local_storage" + })) #, max_model_len=2048, max_num_batched_tokens=2048) + + # 1ST generation (prefill instance) + outputs = llm.generate(prompts, sampling_params) + + print("-" * 30) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 30) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py index 798128301e0f..24b7b1d8fdbe 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -3,42 +3,54 @@ from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig -context = "Hi " * 1000 -context2 = "Hey " * 500 -prompts = [ - context + "Hello, my name is", - context + "The capital of France is", - context2 + "Your name is", - context2 + "The capital of China is", -] - -sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - -llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", - kv_role="kv_both", - kv_connector_extra_config={ - 
"shared_storage_path": "local_storage" - })) #, max_model_len=2048, max_num_batched_tokens=2048) - -# 1ST generation (prefill instance) -outputs = llm.generate( - prompts, - sampling_params, -) - -new_prompts = [] -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - new_prompts.append(prompt + generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -# Write new_prompts to output.txt -with open("output.txt", "w") as f: - for prompt in new_prompts: - f.write(prompt + "\n") -print(f"Saved {len(new_prompts)} prompts to output.txt") + +def read_prompts(): + context = "Hi " * 1000 + context2 = "Hey " * 500 + return [ + context + "Hello, my name is", + context + "The capital of France is", + context2 + "Your name is", + context2 + "The capital of China is", + ] + + +def main(): + prompts = read_prompts() + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "shared_storage_path": "local_storage" + })) #, max_model_len=2048, max_num_batched_tokens=2048) + + # 1ST generation (prefill instance) + outputs = llm.generate( + prompts, + sampling_params, + ) + + new_prompts = [] + print("-" * 30) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + new_prompts.append(prompt + generated_text) + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 30) + + # Write new_prompts to output.txt + with open("output.txt", "w") as f: + for prompt in new_prompts: + f.write(prompt + "\n") + print(f"Saved {len(new_prompts)} prompts to output.txt") + + +if __name__ == "__main__": + main() From 61acfc45bcf44de6e5a14c82906bbb7a33940443 Mon Sep 17 00:00:00 2001 From: Rabi Mishra Date: Wed, 21 May 2025 17:27:28 +0530 Subject: [PATCH 029/192] [Bugfix][Failing Test] Fix test_events.py (#18460) Signed-off-by: rabi --- .buildkite/test-pipeline.yaml | 2 ++ tests/distributed/test_events.py | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 461fb6d30c45..29796184106d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -138,6 +138,7 @@ steps: - vllm/core/ - tests/distributed/test_utils - tests/distributed/test_pynccl + - tests/distributed/test_events - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py @@ -156,6 +157,7 @@ steps: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py index 15bcfdb8555f..8de1aa20eabd 100644 --- a/tests/distributed/test_events.py +++ b/tests/distributed/test_events.py @@ -119,13 +119,12 @@ def test_topic_filtering(publisher_config): """ publisher_config.replay_endpoint = None - cfg = publisher_config.model_copy() - cfg.topic = "foo" - pub = EventPublisherFactory.create(cfg) + publisher_config.topic = "foo" + pub = 
EventPublisherFactory.create(publisher_config) from .conftest import MockSubscriber - sub_foo = MockSubscriber(cfg.endpoint, None, "foo") - sub_bar = MockSubscriber(cfg.endpoint, None, "bar") + sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo") + sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar") try: time.sleep(0.1) From eca18691d2fe29c4f6c1b466709eda9f123116ea Mon Sep 17 00:00:00 2001 From: Dhia Eddine Rhaiem <163106757+dhiaEddineRhaiem@users.noreply.github.com> Date: Wed, 21 May 2025 15:59:06 +0400 Subject: [PATCH 030/192] [MODEL] FalconH1 (#18406) Signed-off-by: dhia.rhaiem Co-authored-by: younesbelkada Co-authored-by: Ilyas Chahed Co-authored-by: Jingwei Zuo --- docs/source/models/supported_models.md | 5 + tests/models/registry.py | 3 + .../layers/mamba/mamba_mixer2.py | 163 +++-- vllm/model_executor/models/falcon_h1.py | 685 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 5 files changed, 798 insertions(+), 59 deletions(-) create mode 100644 vllm/model_executor/models/falcon_h1.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 4d574216242b..6022dfb9c2c6 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -392,6 +392,11 @@ Specified using `--task generate`. * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. * ✅︎ * ✅︎ +- * `FalconH1ForCausalLM` + * Falcon-H1 + * `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. + * ✅︎ + * ✅︎ - * `GemmaForCausalLM` * Gemma * `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. diff --git a/tests/models/registry.py b/tests/models/registry.py index 84abd42e9231..a92dee3b642d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -147,6 +147,9 @@ def check_available_online( "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), + "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct", + is_available_online=False, + min_transformers_version="4.52.2"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index bc6e6fcdd0a2..d44d2c790198 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -34,7 +34,11 @@ @CustomOp.register("mixer2_gated_rms_norm") class Mixer2RMSNormGated(CustomOp): - def __init__(self, full_hidden_size, full_n_groups, eps=1e-6): + def __init__(self, + full_hidden_size: int, + full_n_groups: int, + use_rms_norm: bool = True, + eps: float = 1e-6): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() @@ -44,11 +48,17 @@ def __init__(self, full_hidden_size, full_n_groups, eps=1e-6): self.n_groups = full_hidden_size // self.group_size self.variance_epsilon = eps - self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size)) - set_weight_attrs(self.weight, - {"weight_loader": sharded_weight_loader(0)}) - assert self.full_hidden_size % self.tp_size== 0,\ - "Tensor parallel world size must divide hidden size." 
+ self.use_rms_norm = use_rms_norm + if self.use_rms_norm: + # Register norm weight only if we're actually applying RMSNorm + self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size)) + set_weight_attrs(self.weight, + {"weight_loader": sharded_weight_loader(0)}) + else: + # Avoid checkpoint mismatch by skipping unused parameter + self.register_parameter("weight", None) + assert (self.full_hidden_size % self.tp_size == 0 + ), "Tensor parallel world size must divide hidden size." def forward_native( self, @@ -66,6 +76,8 @@ def forward_native( # the input and then redundantly compute the RMSNorm. input_dtype = x.dtype x = x * nn.functional.silu(gate.to(torch.float32)) + if not self.use_rms_norm: + return x if self.n_groups == 1: if self.tp_size > 1: @@ -74,7 +86,7 @@ def forward_native( global_sums = tensor_model_parallel_all_reduce(local_sums) # Calculate the variance count = self.tp_size * x.shape[-1] - variance = (global_sums / count) + variance = global_sums / count else: variance = x.pow(2).mean(-1, keepdim=True) @@ -106,6 +118,9 @@ def forward_cuda( gate: torch.Tensor, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + if not self.use_rms_norm: + return x * nn.functional.silu(gate.to(torch.float32)) + if self.tp_size > 1 or self.n_groups != 1: return self.forward_native(x, gate) @@ -124,7 +139,7 @@ def forward_cuda( def extra_groups_for_head_shards(ngroups: int, tp_size: int): - """Compute the increase in group numbers to account for + """Compute the increase in group numbers to account for replication in order to accompany the head shards.""" # in the case ngoups % tp_size == 0, this will be zero @@ -182,13 +197,15 @@ def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: # seem to handle slices well. # https://github.com/python/mypy/issues/2410 param.data[ - boundary:(boundary + take), # type: ignore[misc] - ...] = loaded_weight[loaded_start_idx:( # type: ignore[misc] - loaded_start_idx + take)] # type: ignore[misc] + boundary:(boundary + take), + ... # type: ignore[misc] + ] = loaded_weight[loaded_start_idx:(loaded_start_idx + + take) # type: ignore[misc] + ] # type: ignore[misc] # move indexing boundaries boundary += shard_size - loaded_boundary += (full_dim - extra) + loaded_boundary += full_dim - extra return loader @@ -206,19 +223,22 @@ class MambaMixer2(CustomOp): **selective** state spaces) """ - def __init__(self, - hidden_size: int, - ssm_state_size: int, - conv_kernel_size: int, - intermediate_size: int, - use_conv_bias: bool, - use_bias: bool, - n_groups: int = 1, - num_heads: int = 128, - head_dim: int = 64, - rms_norm_eps: float = 1e-5, - activation="silu", - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + hidden_size: int, + ssm_state_size: int, + conv_kernel_size: int, + intermediate_size: int, + use_conv_bias: bool, + use_bias: bool, + n_groups: int = 1, + num_heads: int = 128, + head_dim: int = 64, + rms_norm_eps: float = 1e-5, + activation: str = "silu", + use_rms_norm: bool = True, + quant_config: Optional[QuantizationConfig] = None, + ): super().__init__() # For TP, the sharding plan is as follows: @@ -238,17 +258,16 @@ def __init__(self, self.tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() - assert num_heads % self.tp_size == 0, \ - "Tensor parallel world size must divide num heads." + assert (num_heads % self.tp_size == 0 + ), "Tensor parallel world size must divide num heads." 
- assert (n_groups % self.tp_size) == 0 or n_groups == 1, \ - ( - "If tensor parallel world size does not divide num_heads, " - "then num_groups must equal 1." - ) + assert (n_groups % self.tp_size) == 0 or n_groups == 1, ( + "If tensor parallel world size does not divide num_heads, " + "then num_groups must equal 1.") - assert self.tp_size == 1 or quant_config is None, \ - "Tensor parallel currently not supported for quantized models." + assert ( + self.tp_size == 1 or quant_config is None + ), "Tensor parallel currently not supported for quantized models." self.ssm_state_size = ssm_state_size self.activation = activation @@ -265,8 +284,7 @@ def __init__(self, self.n_groups = n_groups + extra_groups_for_head_shards( n_groups, self.tp_size) - self.conv_dim = (intermediate_size + - 2 * self.n_groups * ssm_state_size) + self.conv_dim = intermediate_size + 2 * self.n_groups * ssm_state_size self.conv1d = ColumnParallelLinear( input_size=conv_kernel_size, output_size=self.conv_dim, @@ -279,11 +297,12 @@ def __init__(self, # doesn't allow to override it self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) - self.in_proj = ColumnParallelLinear(input_size=hidden_size, - output_size=intermediate_size + - self.conv_dim + self.num_heads, - bias=use_bias, - quant_config=quant_config) + self.in_proj = ColumnParallelLinear( + input_size=hidden_size, + output_size=intermediate_size + self.conv_dim + self.num_heads, + bias=use_bias, + quant_config=quant_config, + ) # - because in_proj is a concatenation of 3 weights, we # need to interleave them before sharding @@ -305,7 +324,8 @@ def __init__(self, # - ditto for the otther two weights below delattr(self.conv1d.bias, "weight_loader") set_weight_attrs( - self.conv1d.bias, { + self.conv1d.bias, + { "weight_loader": mamba_v2_sharded_weight_loader( [ @@ -316,18 +336,25 @@ def __init__(self, self.tp_size, tp_rank, ) - }) + }, + ) delattr(self.conv1d.weight, "weight_loader") set_weight_attrs( - self.conv1d.weight, { + self.conv1d.weight, + { "weight_loader": - mamba_v2_sharded_weight_loader([ - intermediate_settings, - group_shard_settings, - group_shard_settings, - ], self.tp_size, tp_rank) - }) + mamba_v2_sharded_weight_loader( + [ + intermediate_settings, + group_shard_settings, + group_shard_settings, + ], + self.tp_size, + tp_rank, + ) + }, + ) if quant_config is None: # - quant layers do not have a weight loader @@ -345,8 +372,10 @@ def __init__(self, head_setings, # for dt ], self.tp_size, - tp_rank) - }) + tp_rank, + ) + }, + ) # - these are TPed by heads to reduce the size of the # temporal shape @@ -357,6 +386,7 @@ def __init__(self, )) self.D = nn.Parameter(torch.ones(num_heads // self.tp_size)) self.dt_bias = nn.Parameter(torch.ones(num_heads // self.tp_size)) + self.use_rms_norm = use_rms_norm set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) a_weight_loader = composed_weight_loader( @@ -365,18 +395,25 @@ def __init__(self, set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)}) - self.out_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=use_bias, - input_is_parallel=True, - quant_config=quant_config) + self.out_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=use_bias, + input_is_parallel=True, + quant_config=quant_config, + ) self.norm = Mixer2RMSNormGated(intermediate_size, n_groups, + self.use_rms_norm, eps=rms_norm_eps) - def forward_native(self, hidden_states: torch.Tensor, - conv_state: torch.Tensor, ssm_state: torch.Tensor): + def forward_native( + self, 
+ hidden_states: torch.Tensor, + conv_state: torch.Tensor, + ssm_state: torch.Tensor, + ): pass def forward_cuda( @@ -384,6 +421,7 @@ def forward_cuda( hidden_states: torch.Tensor, mamba_cache_params: MambaCacheParams, mamba2_metadata: Mamba2Metadata, + mup_vector: Optional[torch.Tensor] = None, ): # mamba2_metadata contains metadata necessary for the mamba2 triton # kernels to operate in continuous batching and in chunked prefill @@ -401,6 +439,10 @@ def forward_cuda( # 1. Gated MLP's linear projection projected_states, _ = self.in_proj(hidden_states) + + if mup_vector is not None: + projected_states = projected_states * mup_vector + gate, hidden_states_B_C, dt = torch.split( projected_states, [ @@ -561,6 +603,9 @@ def forward_cuda( hidden_states = torch.vstack(ssd_output_list) # 4. gated MLP + # GatedRMSNorm internally applying SiLU to the gate + # SiLU is applied internally before normalization, unlike standard + # norm usage hidden_states = self.norm(hidden_states, gate) # 5. Final linear projection diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py new file mode 100644 index 000000000000..591a75ffdb73 --- /dev/null +++ b/vllm/model_executor/models/falcon_h1.py @@ -0,0 +1,685 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Inference-only FalconH1 model.""" +from collections.abc import Iterable +from typing import Optional + +import torch +from torch import nn +from transformers import FalconH1Config + +from vllm.attention.layer import Attention +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.mamba2_metadata import ( + Mamba2Metadata, prepare_mamba2_metadata) +from vllm.model_executor.layers.mamba.mamba_mixer2 import ( + MambaMixer2, extra_groups_for_head_shards) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.mamba_cache import (MambaCacheManager, + MambaCacheParams) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP, + SupportsV0Only) +from .utils import (PPMissingLayer, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + + +class FalconH1MLP(nn.Module): + + def __init__( + self, + config: FalconH1Config, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=config.hidden_size, + output_sizes=[config.intermediate_size] * 2, + bias=bias, + quant_config=quant_config, + ) + self.down_proj = RowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + 
bias=bias, + quant_config=quant_config, + ) + self.tp_size = get_tensor_model_parallel_world_size() + self.intermediate_size = config.intermediate_size + self.gate_multiplier, self.down_multiplier = config.mlp_multipliers + if config.hidden_act != "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + x, _ = self.gate_up_proj(x) + x[:, :self.intermediate_size // self.tp_size] *= self.gate_multiplier + x = self.act_fn(x) + x, _ = self.down_proj(x) + x = x * self.down_multiplier + return x + + +class FalconH1SSMDecoderLayer(nn.Module): + + def __init__( + self, + config: FalconH1Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.tp_size = get_tensor_model_parallel_world_size() + + self.d_ssm = (int(config.mamba_expand * config.hidden_size) + if config.mamba_d_ssm is None else config.mamba_d_ssm) + + self.mamba = MambaMixer2( + hidden_size=config.hidden_size, + ssm_state_size=config.mamba_d_state, + conv_kernel_size=config.mamba_d_conv, + intermediate_size=self.d_ssm, + use_conv_bias=config.mamba_conv_bias, + use_bias=config.mamba_proj_bias, + n_groups=config.mamba_n_groups, + num_heads=config.mamba_n_heads, + head_dim=config.mamba_d_head, + rms_norm_eps=config.rms_norm_eps, + activation=config.hidden_act, + quant_config=quant_config, + use_rms_norm=config.mamba_rms_norm, + ) + # n_groups is overridden later by `MambaMixer2` + self.groups_time_state_size = self.mamba.n_groups * config.mamba_d_state + self.zxbcdt_multipliers = config.ssm_multipliers + self._init_mup_vector() + + def _init_mup_vector(self): + """ + Non learnable per-block scaling vector composed of element-wise + multipliersapplied to each separate contiguous block of the output + of the linear projection (in_proj) before further processing + (gating, convolution, SSM): + + - Z block: [0 : d_ssm] → zxbcdt_multipliers[0] + - X block: [d_ssm : 2 * d_ssm] → zxbcdt_multipliers[1] + - B block: [2 * d_ssm : 2 * d_ssm + G * S] → zxbcdt_multipliers[2] + - C block: [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S] + → zxbcdt_multipliers[3] + - dt block: [2 * d_ssm + 2 * G * S : end] → zxbcdt_multipliers[4] + + where: + - d_ssm: Dimension of state-space model latent + - G: Number of groups (n_groups) + - S: SSM state size per group + - All indices are divided by tp_size to support tensor parallelism + """ + vector_shape = (2 * self.d_ssm + 2 * self.groups_time_state_size + + self.config.mamba_n_heads) // self.tp_size + mup_vector = torch.ones(1, vector_shape) + # Z vector 0 -> d_ssm + mup_vector[:, :self.d_ssm // + self.tp_size] *= self.zxbcdt_multipliers[0] + # X vector d_ssm -> 2 * d_ssm + mup_vector[:, + (self.d_ssm // + self.tp_size):(2 * self.d_ssm // + self.tp_size)] *= self.zxbcdt_multipliers[1] + # B vector 2 * d_ssm -> 2 * d_ssm + (n_group * d_state) + mup_vector[ + :, + (2 * self.d_ssm) // + self.tp_size:(2 * self.d_ssm + self.groups_time_state_size) // + self.tp_size, + ] *= self.zxbcdt_multipliers[2] + # C vector 2 * d_ssm + (n_group * d_state) + # -> 2 * d_ssm + 2 * (n_group * d_state) + mup_vector[ + :, + (2 * self.d_ssm + self.groups_time_state_size) // + self.tp_size:(2 * self.d_ssm + 2 * self.groups_time_state_size) // + self.tp_size, + ] *= self.zxbcdt_multipliers[3] + # dt vector 2 * d_ssm + 2 * (n_group * d_state) + # -> 2 * d_ssm + 2 * (n_group * d_state) + n_heads + mup_vector[ + :, + (2 * 
self.d_ssm + 2 * self.groups_time_state_size) // + self.tp_size:, + ] *= self.zxbcdt_multipliers[4] + + self.register_buffer("mup_vector", mup_vector, persistent=False) + + def forward( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + mamba_cache_params: MambaCacheParams, + mamba2_metadata: Mamba2Metadata, + **kwargs, + ): + hidden_states = self.mamba( + hidden_states, + mamba_cache_params, + mamba2_metadata=mamba2_metadata, + mup_vector=self.mup_vector, + ) + return hidden_states, residual + + +class FalconH1AttentionDecoderLayer(nn.Module): + + def __init__( + self, + config: FalconH1Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + rope_theta = getattr(config, "rope_theta", 1e11) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = (config.hidden_size // self.total_num_heads if getattr( + config, "head_dim", None) is None else config.head_dim) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + if hasattr(config, "partial_rotary_factor"): + rotary_dim = self.head_dim * config.partial_rotary_factor + elif hasattr(config, "attn_rotary_emb"): + rotary_dim = config.attn_rotary_emb # for backward compatibility + else: + rotary_dim = self.head_dim # default + + self.rotary_emb = get_rope( + head_size=self.head_dim, + rotary_dim=rotary_dim, + max_position=max_position_embeddings, + rope_scaling=rope_scaling, + base=rope_theta, + is_neox_style=True, + dtype=None, # see impl of get_rope + ) + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn", + ) + self.key_multiplier = config.key_multiplier + + def self_attention( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + k = k * self.key_multiplier + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return 
output + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + **kwargs, + ): + hidden_states = self.self_attention( + positions=positions, + hidden_states=hidden_states, + ) + return hidden_states, residual + + +class FalconH1ParallelHybrid(nn.Module): + """ + A hybrid decoder layer for FalconH1 where the input is processed + in parallel through both the self-attention branch and the SSM (Mamba) + branch. Their outputs are then summed to produce the final hidden state. + + This layer uses: + - FalconH1AttentionDecoderLayer for the multi-head self-attention branch. + - FalconH1SSMDecoderLayer for the state-space (Mamba) branch. + """ + + def __init__( + self, + config: FalconH1Config, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + # Instantiate the attention branch + self.self_attn = FalconH1AttentionDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ) + # Instantiate the SSM branch + self.mamba = FalconH1SSMDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + ) + self.ssm_out_multiplier = config.ssm_out_multiplier + self.ssm_in_multiplier = config.ssm_in_multiplier + + self.attention_in_multiplier = config.attention_in_multiplier + self.attn_out_multiplier = config.attention_out_multiplier + + self.feed_forward = FalconH1MLP(config) + + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + mamba_cache_params: MambaCacheParams, + mamba2_metadata: Mamba2Metadata, + **kwargs, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Process input through the attention branch. + # FalconH1AttentionDecoderLayer expects positions, hidden_states, + # kv_cache, attn_metadata, and residual. + attn_hidden, _ = self.self_attn( + positions=positions, + hidden_states=hidden_states * self.attention_in_multiplier, + residual=residual, + **kwargs, + ) + + # Process input through the SSM branch. + # FalconH1SSMDecoderLayer expects hidden_states, attn_metadata, + # residual, mamba_cache_params, and sequence_idx. + ssm_hidden, _ = self.mamba( + hidden_states=hidden_states * self.ssm_in_multiplier, + residual=residual, + mamba_cache_params=mamba_cache_params, + mamba2_metadata=mamba2_metadata, + **kwargs, + ) + # Sum the outputs from both branches. + # We assume both branches produce outputs of the same + # dimensionality (config.hidden_size). 
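As a compact, self-contained restatement of this layer's dataflow (the branch modules, LayerNorm in place of RMSNorm, and the multiplier values below are illustrative stand-ins, not the real FalconH1 configuration):

import torch

def falcon_h1_block(x, attn_branch, ssm_branch, mlp, norm1, norm2, cfg):
    # Both branches read the same normalized input, each scaled on the way in.
    residual = x
    h = norm1(x)
    attn_h = attn_branch(h * cfg["attention_in_multiplier"])
    ssm_h = ssm_branch(h * cfg["ssm_in_multiplier"])
    # Branch outputs are scaled, summed, and the residual is added back.
    h = attn_h * cfg["attention_out_multiplier"] + ssm_h * cfg["ssm_out_multiplier"]
    h = h + residual
    # Pre-norm MLP with its own residual connection.
    residual = h
    h = mlp(norm2(h))
    return h + residual

# Toy usage with identity branches and made-up multipliers.
dim = 8
cfg = dict(attention_in_multiplier=1.0, ssm_in_multiplier=1.0,
           attention_out_multiplier=0.5, ssm_out_multiplier=0.5)
x = torch.randn(2, 4, dim)
out = falcon_h1_block(x, torch.nn.Identity(), torch.nn.Identity(),
                      torch.nn.Identity(), torch.nn.LayerNorm(dim),
                      torch.nn.LayerNorm(dim), cfg)
assert out.shape == x.shape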
+ hidden_states = (attn_hidden * self.attn_out_multiplier) + ( + ssm_hidden * self.ssm_out_multiplier) + hidden_states = hidden_states + residual + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class FalconH1Model(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: FalconH1Config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank: + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.embedding_multiplier = config.embedding_multiplier + else: + self.embed_tokens = PPMissingLayer() + self.embedding_multiplier = 1.0 + + def get_layer(prefix: str): + layer_idx = int(prefix.rsplit(".", 1)[1]) + layer_class = FalconH1ParallelHybrid + return layer_class( + config, + layer_idx, + cache_config, + quant_config=quant_config, + prefix=prefix, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + if get_pp_group().is_last_rank: + self.final_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + else: + self.final_layernorm = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + mamba_cache_params: MambaCacheParams, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + # pass a sequence index tensor, that is required for + # proper continuous batching computation including + # chunked prefill + attn_metadata = get_forward_context().attn_metadata + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.mamba_chunk_size, + input_ids=input_ids, + attn_metadata=attn_metadata, + ) + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds * self.embedding_multiplier + else: + hidden_states = (self.get_input_embeddings(input_ids) * + self.embedding_multiplier) + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i) + hidden_states = layer( + positions=positions, + hidden_states=hidden_states, + mamba_cache_params=layer_mamba_cache_params, + mamba2_metadata=mamba2_metadata, + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + }) + hidden_states = self.final_layernorm(hidden_states) + return hidden_states + + +class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, + IsHybrid, SupportsV0Only): + packed_modules_mapping = { + "qkv_proj": 
["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + } + + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config + assert (not cache_config.enable_prefix_caching + ), "FalconH1 currently does not support prefix caching" + + self.quant_config = vllm_config.quant_config + + super().__init__() + self.config = config + self.scheduler_config = scheduler_config + self.model = FalconH1Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.tie_word_embeddings = config.tie_word_embeddings + self.unpadded_vocab_size = config.vocab_size + self.mamba_cache: Optional[MambaCacheManager] = None + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=( + DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else + lora_config.lora_vocab_padding_size), + ) + self.lm_head_multiplier = config.lm_head_multiplier + if self.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights( + self.model.embed_tokens) + # Used to track and store by the Mamba cache between steps. + + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, + config.vocab_size, + scale=config.lm_head_multiplier, + ) + else: + self.lm_head = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ): + if self.mamba_cache is None: + self.mamba_cache = MambaCacheManager( + self.vllm_config, + self.lm_head.weight.dtype + if hasattr(self.lm_head, 'weight') else torch.bfloat16, + self.config.num_hidden_layers, + *self._get_mamba_cache_shape(), + ) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + hidden_states = self.model( + input_ids, + positions, + mamba_cache_params, + intermediate_tensors, + inputs_embeds, + ) + + return hidden_states + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + return self.mamba_cache.copy_inputs_before_cuda_graphs( + input_buffers, **kwargs) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) + + def _get_mamba_cache_shape( + self) -> tuple[tuple[int, int], tuple[int, int]]: + world_size = get_tensor_model_parallel_world_size() + hidden_size = self.config.hidden_size + + conv_state_shape, temporal_state_shape = None, None + + intermediate_size = (int(self.config.mamba_expand * + hidden_size) if self.config.mamba_d_ssm + is None else self.config.mamba_d_ssm) + + # if n_groups is not divisible by world_size, need to extend the shards + # to ensure all groups needed by a head is sharded 
along with it + n_groups = self.config.mamba_n_groups + extra_groups_for_head_shards( + self.config.mamba_n_groups, world_size) + + # - heads and n_groups are TP-ed + conv_dim = intermediate_size + 2 * n_groups * self.config.mamba_d_state + conv_state_shape = ( + divide(conv_dim, world_size), + self.config.mamba_d_conv - 1, + ) + + # These are not TP-ed as they depend on A, dt_bias, D + # - they are typically small + # e.g., (h_heads, d_head, d_state) = (128, 64, 128) + temporal_state_shape = ( + divide(self.config.mamba_n_heads, world_size), + self.config.mamba_d_head, + self.config.mamba_d_state, + ) + return conv_state_shape, temporal_state_shape + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if "A_log" in name: + name = name.replace("A_log", "A") + + if "mamba" in name: + name = name.replace("mamba", "mamba.mamba") + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
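For intuition, a small standalone sketch of the checkpoint-name remapping performed by the loop above (the example parameter names are hypothetical):

stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]

def remap(name: str):
    """Return (vLLM parameter name, shard id or None) for a checkpoint name."""
    if "A_log" in name:
        name = name.replace("A_log", "A")
    if "mamba" in name:
        name = name.replace("mamba", "mamba.mamba")
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in name:
            return name.replace(weight_name, param_name), shard_id
    return name, None

# Hypothetical checkpoint names:
print(remap("model.layers.0.self_attn.q_proj.weight"))
#   -> ('model.layers.0.self_attn.qkv_proj.weight', 'q')
print(remap("model.layers.0.mamba.A_log"))
#   -> ('model.layers.0.mamba.mamba.A', None)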
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + if self.tie_word_embeddings and "lm_head" in name: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + if self.tie_word_embeddings: + loaded_params.add("lm_head.weight") + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c55f7ccd344f..61115afa76d4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -79,6 +79,7 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), + "FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"), "Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), From c154d89306bf9978ff013659c5b1244af78386cc Mon Sep 17 00:00:00 2001 From: GiantCroc <1204449533@qq.com> Date: Wed, 21 May 2025 21:45:57 +0800 Subject: [PATCH 031/192] [Doc] fix arg docstring in linear layers (#18410) Signed-off-by: giantcroc <1204449533@qq.com> --- vllm/model_executor/layers/linear.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 54dd1251e59f..dd2e477f3954 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -261,6 +261,7 @@ class ReplicatedLinear(LinearBase): quant_config: Quantization configure. prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( @@ -523,6 +524,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): quant_config: Quantization configure. prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( @@ -805,6 +807,7 @@ class QKVParallelLinear(ColumnParallelLinear): quant_config: Quantization configure. prefix: The name of the layer in the state dict, including all parents (e.g. model.layers.0.qkv_proj) + return_bias: If true, return bias together with outputs in forward pass. """ def __init__( @@ -1155,7 +1158,13 @@ class RowParallelLinear(LinearBase): bias can be fused with other element-wise operations. We skip adding bias but instead return it. params_dtype: Data type for the parameters. + reduce_results: If true, call all-reduce on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y = X_iA_i quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.down_proj) + return_bias: If true, return bias together with outputs in forward pass. 
""" def __init__( From c6c10ca920f094f402f78635a234ffdcb108d78e Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 21 May 2025 09:46:39 -0400 Subject: [PATCH 032/192] [Bugfix] Reduce moe_sum test size to avoid OOM (#18484) Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 9a8ac242af79..299279390fe0 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -577,7 +577,7 @@ def test_moe_align_block_size_opcheck(): num_tokens_post_pad)) -@pytest.mark.parametrize("m", [1, 33, 222, 1024 * 128]) +@pytest.mark.parametrize("m", [1, 33, 64, 222]) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("k", [128, 511, 1024]) @pytest.mark.parametrize("dtype", From 371376f99689fc058141980fde9220dd4980aa3f Mon Sep 17 00:00:00 2001 From: Kebe Date: Wed, 21 May 2025 22:32:06 +0800 Subject: [PATCH 033/192] [Build] fix Dockerfile shell (#18402) --- docker/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 97a7879da876..a35056f78587 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -189,6 +189,8 @@ WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive ARG TARGETPLATFORM +SHELL ["/bin/bash", "-c"] + RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment From 2b161045574fc4ba23052c6578ad8f97090db884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Wed, 21 May 2025 23:33:11 +0900 Subject: [PATCH 034/192] [Misc] Update deprecation message for `--enable-reasoning` (#18404) --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 91a34cb4dd59..5650742ff972 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -577,7 +577,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action=argparse.BooleanOptionalAction, deprecated=True, help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as " - "of v0.8.6. Use `--reasoning-parser` to specify the reasoning " + "of v0.9.0. Use `--reasoning-parser` to specify the reasoning " "parser backend instead. This flag (`--enable-reasoning`) will be " "removed in v0.10.0. 
When `--reasoning-parser` is specified, " "reasoning mode is automatically enabled.") From dd5fa7e04f7544dca276701816453e8cc31fb7de Mon Sep 17 00:00:00 2001 From: Hosang <156028780+hyoon1@users.noreply.github.com> Date: Wed, 21 May 2025 11:35:00 -0400 Subject: [PATCH 035/192] [ROCm][Kernel][V1] Enable AMD Radeon GPU Custom Paged Attention on v1 (#17004) Signed-off-by: Hosang Yoon --- .../kernels/benchmark_paged_attention.py | 6 +- csrc/rocm/attention.cu | 2051 +++++++++++++++-- tests/kernels/attention/test_attention.py | 8 +- vllm/attention/backends/rocm_flash_attn.py | 3 +- .../ops/chunked_prefill_paged_decode.py | 3 +- vllm/platforms/rocm.py | 48 +- 6 files changed, 1930 insertions(+), 189 deletions(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 17432159c94e..54f05e723226 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -84,7 +84,10 @@ def main( if version == "v2": if current_platform.is_rocm(): global PARTITION_SIZE - PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM + if not args.custom_paged_attn and not current_platform.is_navi(): + PARTITION_SIZE = 1024 + else: + PARTITION_SIZE = PARTITION_SIZE_ROCM num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE tmp_output = torch.empty( size=(num_seqs, num_query_heads, num_partitions, head_size), @@ -159,6 +162,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: scale, block_tables, seq_lens, + None, block_size, max_seq_len, alibi_slopes, diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 8cc5a0f4f218..f1e7da164199 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -30,6 +30,14 @@ #define __HIP__GFX9__ #endif +#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__)) + #define __HIP__GFX11__ +#endif + +#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__)) + #define __HIP__GFX12__ +#endif + #if defined(NDEBUG) #undef NDEBUG #include @@ -43,7 +51,7 @@ #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) -#if defined(__HIP__GFX9__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) #define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32 #define GCN_MFMA_INSTR __builtin_amdgcn_mfma_f32_4x4x4f16 @@ -1482,191 +1490,1690 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( } } -#else // !defined(__HIP__GFX9__) TODO: Add NAVI support +#elif defined(__HIP__GFX11__) -// clang-format off -template -__global__ -__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] - const int* __restrict__ query_start_loc_ptr, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride, - float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] - int max_ctx_blocks, const float* k_scale, const float* v_scale) { - UNREACHABLE_CODE +using floatx8 = __attribute__((__vector_size__(8 * sizeof(float)))) float; + +using bit16_t = uint16_t; +using bit16x4 = __attribute__((__vector_size__(4 * sizeof(uint16_t)))) uint16_t; +typedef bit16x4 _B16x4; + +using bit16x8 = __attribute__((__vector_size__(8 * sizeof(uint16_t)))) uint16_t; +union b16x8_u { + bit16x8 u16x8; + _B16x4 xy[2]; +}; +typedef b16x8_u _B16x8; + +using bit16x16 = + __attribute__((__vector_size__(16 * sizeof(uint16_t)))) uint16_t; +union b16x16_u { + bit16x16 u16x16; + _B16x8 xy[2]; +}; +typedef b16x16_u _B16x16; + +using _B8x8 = uint2; +using bit8_t = uint8_t; + +typedef struct _B8x16 { + _B8x8 xy[2]; +} _B8x16; + +template +__device__ __forceinline__ floatx8 gcn_wmma16x16x16_instr(const bit16x16& inpA, + const bit16x16& inpB, + const floatx8& inpC) { + if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(inpA, inpB, inpC); + } else if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(inpA, inpB, inpC); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ float to_float(const T& inp) { + if constexpr (std::is_same::value) { + return (float)inp; + } else if constexpr (std::is_same::value) { + return __bfloat162float(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ T from_float(const float& inp) { + if constexpr (std::is_same::value) { + return (_Float16)inp; + } else if constexpr (std::is_same::value) { + return __float2bfloat16(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) { + if constexpr (std::is_same::value) { + union h2cvt { + __half2 h2[4]; + _B16x8 b16x8; + } u; + u.h2[0] = __float22half2_rn(make_float2(inp[0], inp[1])); + u.h2[1] = 
__float22half2_rn(make_float2(inp[2], inp[3])); + u.h2[2] = __float22half2_rn(make_float2(inp[4], inp[5])); + u.h2[3] = __float22half2_rn(make_float2(inp[6], inp[7])); + return u.b16x8; + } else if constexpr (std::is_same::value) { + union b2cvt { + __hip_bfloat162 b2[4]; + _B16x8 b16x8; + } u; + + u.b2[0] = __float22bfloat162_rn(make_float2(inp[0], inp[1])); + u.b2[1] = __float22bfloat162_rn(make_float2(inp[2], inp[3])); + u.b2[2] = __float22bfloat162_rn(make_float2(inp[4], inp[5])); + u.b2[3] = __float22bfloat162_rn(make_float2(inp[6], inp[7])); + + return u.b16x8; + } else { + static_assert(false, "unsupported 16b dtype"); + } } +// clang-format off template + int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO> __global__ -__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] - const int* __restrict__ query_start_loc_ptr, // [num_seqs] +__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride, - float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] int max_ctx_blocks, const float* k_scale, const float* v_scale) { - UNREACHABLE_CODE -} + // clang-format on + constexpr int NWARPS = NUM_THREADS / WARP_SIZE; // 8 warps on gfx11 + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + const int lane2id = laneid % 2; + const int lane4id = laneid % 4; + const int lane16id = laneid % 16; + const int rowid = laneid / 16; -// Grid: (num_heads, num_seqs). 
-template -__global__ -__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( - OUTT* __restrict__ out, // [num_seqs, num_heads, head_size] - const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] - const int* __restrict__ query_start_loc_ptr, // [num_seqs] - const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { - UNREACHABLE_CODE -} -// clang-format on + const int seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. + if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) { + return; + } -#endif // defined(__HIP__GFX9__) TODO: Add NAVI support + const int partition_idx = blockIdx.y; -#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ - paged_attention_ll4mi_QKV_mfma16_kernel \ - <<>>( \ - query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ - max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ - kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ - max_ctx_blocks, k_scale_ptr, v_scale_ptr); + constexpr int T_PAR_SIZE = 256; // token partition size set to 256 -#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \ - paged_attention_ll4mi_QKV_mfma4_kernel \ - <<>>( \ - query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ - max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ - kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ - max_ctx_blocks, k_scale_ptr, v_scale_ptr); + const int max_num_partitions = gridDim.y; -#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ - paged_attention_ll4mi_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \ - context_lens_ptr, query_start_loc_ptr, max_num_partitions, \ - fp8_out_scale_ptr); + const int context_len = context_lens[seq_idx]; // length of a seq -template -void paged_attention_custom_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, const int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& context_lens, - const std::optional& query_start_loc, int max_context_len, - const std::optional& alibi_slopes, torch::Tensor& k_scale, - torch::Tensor& v_scale, const std::optional& fp8_out_scale) { - int num_seqs = block_tables.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); + const int partition_start_token_idx = partition_idx * T_PAR_SIZE; + // exit if partition is out of context for seq + if (partition_start_token_idx >= context_len) { + return; + } - // NOTE: query start location is optional for V0 decode should not be used. - // If batch contains mix of prefills and decode, prefills should be skipped. - const int* query_start_loc_ptr = - query_start_loc - ? 
reinterpret_cast(query_start_loc.value().data_ptr()) - : nullptr; + constexpr int GQA_RATIO2 = DIVIDE_ROUND_UP(GQA_RATIO, 2); - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; + __shared__ float shared_qk_max[NWARPS][16 + 1]; + __shared__ float shared_exp_sum[NWARPS][16 + 1]; + // shared_logits is used for multiple purposes + __shared__ _B16x16 shared_logits[NWARPS][2][16][2]; - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* context_lens_ptr = context_lens.data_ptr(); - const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); - const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); - // NOTE: fp8_out_scale is optional. - const auto fp8_out_scale_ptr = - fp8_out_scale - ? static_cast(fp8_out_scale.value().data_ptr()) - : nullptr; - OUTT* out_ptr = reinterpret_cast(out.data_ptr()); + // for QK wmma16x16, layout is QHead/Tokenx16 across every 16 lanes, + // 32 Bytes HeadElements in each lane, 2x16B HeadElements across a row of warp + constexpr int ROWS_PER_WARP = + WARP_SIZE / 16 / 2; // rows refers to 16 lanes; refer dpp terminology + constexpr int CONTIGUOUS_KV_ELEMS_16B_LOAD = + 16 / sizeof(cache_t); // 8 for 16 bit cache type, 16 for 8 bit types + constexpr int QKHE_PER_FETCH = + CONTIGUOUS_KV_ELEMS_16B_LOAD * + ROWS_PER_WARP; // each fetch across a warp fetches these many elements + constexpr int QKHELOOP = HEAD_SIZE / QKHE_PER_FETCH; // 2xQKHE_16B across + // warp - const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + _B16x16 Qlocal[QKHELOOP / 2]; // note that 16 contiguous elements of Q should + // be fetched per lane for 16 bit cache types - // partition size is fixed at 256 since both mfma4 and mfma16 kernels support - // it mfma4 kernel also supports partition size 512 - constexpr int PARTITION_SIZE = 256; - const int max_num_partitions = - DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); - const int gqa_ratio = num_heads / num_kv_heads; - assert(num_heads % num_kv_heads == 0); - assert(head_size == HEAD_SIZE); + constexpr int CONTIGUOUS_SCALAR_ELEMS_16B = 16 / sizeof(scalar_t); - constexpr int NTHR = 256; - dim3 grid(num_seqs, max_num_partitions, num_kv_heads); - dim3 block(NTHR); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + constexpr int TOKENS_PER_WARP = + T_PAR_SIZE / + NWARPS; // sub partition of tokens per warp for qk calculation + constexpr int TLOOP = + TOKENS_PER_WARP / + 16; // each wmma16x16x16 instruction processes 16 tokens - // mfma4 kernel is faster than mfma16 for gqa_ratio <= 4 - switch (gqa_ratio) { - case 1: - LAUNCH_CUSTOM_ATTENTION_MFMA4(1); - break; - case 2: - LAUNCH_CUSTOM_ATTENTION_MFMA4(2); - break; - case 3: - LAUNCH_CUSTOM_ATTENTION_MFMA4(3); - break; - case 4: - LAUNCH_CUSTOM_ATTENTION_MFMA4(4); - break; + _B16x16 Klocal[TLOOP] + [QKHELOOP / 2]; // can be interpreted as B8x16 for 8 bit types + + const int wg_start_head_idx = blockIdx.z * GQA_RATIO; + const int wg_start_kv_head_idx = blockIdx.z; + const int total_num_heads = gridDim.z * GQA_RATIO; + + // 
for QK wmma, tokens in multiples of TOKENS_PER_WARP are spread across warps + // each wmma takes QH16xT16x16HE across warp + // repeat wmma across QKHELOOP dimension + // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens + // across 2 rows x 8 tokens per lane + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); + + if (GQA_RATIO == 1) { + const int local_qhead_idx = lane16id % GQA_RATIO; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; + if (lane16id < GQA_RATIO) { + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) { + const scalar_t* q_fetch_ptr = q_ptr + qkhe_depth * QKHE_PER_FETCH * 2; + const _B16x16* q_fetch_ptr_32B = + reinterpret_cast(q_fetch_ptr); + Qlocal[qkhe_depth] = *q_fetch_ptr_32B; + } + } + } else { + // fetch Q in shared across warps and then write to registers + const int local_qhead_idx = 2 * warpid + rowid; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; + + const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; + if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) { + const scalar_t* q_fetch_ptr = q_ptr + qhead_element; + const _B16x8* q_fetch_ptr_16B = + reinterpret_cast(q_fetch_ptr); + _B16x8 tmp = *q_fetch_ptr_16B; + + const int offset1 = + lane16id / + 2; // 16 contiguous chunks of head elems are spread across 8x2lanes + shared_logits[offset1][lane2id][local_qhead_idx][0].xy[0] = tmp; + } + + __syncthreads(); + + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) { + Qlocal[qkhe_depth].xy[0] = + shared_logits[qkhe_depth][0][lane16id % GQA_RATIO][0].xy[0]; + Qlocal[qkhe_depth].xy[1] = + shared_logits[qkhe_depth][1][lane16id % GQA_RATIO][0].xy[0]; + } + } + + const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); + const int last_ctx_block = num_context_blocks - 1; + + const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; + + int kphysical_block_number[TLOOP]; + + // fetch k physical block numbers + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kblock_idx = (kglobal_token_idx < context_len) + ? 
kglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; + } + + constexpr int KX = 16 / sizeof(cache_t); + const cache_t* k_ptr = k_cache + wg_start_kv_head_idx * kv_head_stride; + + const int row_head_elem = 0; + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int64_t kblock_number = + static_cast(kphysical_block_number[token_depth]); + const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; + const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; + const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; + + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + const int head_elem = row_head_elem + qkhe_depth * QKHE_PER_FETCH; + const int offset1 = head_elem / KX; + const int offset2 = head_elem % KX; + const cache_t* k_fetch_ptr = k_ptr3 + offset1 * BLOCK_SIZE * KX + offset2; + const _B16x8* k_fetch_ptr_16B = + reinterpret_cast(k_fetch_ptr); + Klocal[token_depth][qkhe_depth / 2].xy[qkhe_depth % 2] = *k_fetch_ptr_16B; + } + } + + constexpr int VTOKENS_PER_LANE = + TOKENS_PER_WARP / ROWS_PER_WARP; // 32/1 = 32 vtokens per lane + constexpr int VBLOCKS_PER_LANE = 2; // assumes block size >=16 + constexpr int VTLOOP = NWARPS; // corresponds to tokens across warps + constexpr int VTLANELOOP = DIVIDE_ROUND_UP( + VTOKENS_PER_LANE, + CONTIGUOUS_KV_ELEMS_16B_LOAD); // optimized for 16B fetches; assumes + // minimum block size is 16 + constexpr int VHELOOP = DIVIDE_ROUND_UP( + (HEAD_SIZE / 16), NWARPS); // head_size distributed across warps; each + // wmma instr works on 16 head elements + + int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE]; + + // fetch v physical block numbers + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE; + vblock_depth++) { + const int vlocal_token_idx = + vtoken_depth * VTOKENS_PER_LANE * ROWS_PER_WARP + + vblock_depth * BLOCK_SIZE; + const int vglobal_token_idx = + partition_start_token_idx + vlocal_token_idx; + const int vblock_idx = (vglobal_token_idx < context_len) + ? 
vglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + vphysical_block_number[vtoken_depth][vblock_depth] = + block_table_seq[vblock_idx]; + } + } + + _B16x16 Vlocal[VTLOOP][VHELOOP] + [VTLANELOOP / 2]; // this can be interpreted as B8x16 too + + const cache_t* v_ptr = v_cache + wg_start_kv_head_idx * kv_head_stride; + // v fetches are 16head elems across lanes x (16x2) tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + const int vhead_elem = vhe_depth * NWARPS * 16 + warpid * 16 + lane16id; + const cache_t* v_ptr2 = v_ptr + vhead_elem * BLOCK_SIZE; + + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) { + const int64_t vblock_number = static_cast( + vphysical_block_number[vtoken_depth] + [vfetch_depth / VBLOCKS_PER_LANE]); + const cache_t* v_ptr3 = v_ptr2 + (vblock_number * kv_block_stride); + + const cache_t* v_fetch_ptr = + v_ptr3 + + (vfetch_depth % VBLOCKS_PER_LANE) * CONTIGUOUS_KV_ELEMS_16B_LOAD; + const _B16x8* v_fetch_ptr_16B = + reinterpret_cast(v_fetch_ptr); + Vlocal[vtoken_depth][vhe_depth][vfetch_depth / 2].xy[vfetch_depth % 2] = + *v_fetch_ptr_16B; + } + } + } + + floatx8 dout[TLOOP]; + // qk wmma + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + dout[token_depth] = {0}; + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) { + dout[token_depth] = gcn_wmma16x16x16_instr( + Klocal[token_depth][qkhe_depth].u16x16, Qlocal[qkhe_depth].u16x16, + dout[token_depth]); + } + dout[token_depth] *= scale; + } + + // calculate qk_max and exp_sum per warp and write to shared memory + float qk_max = -FLT_MAX; + float exp_sum = 0.0f; + const int qkout_token_idx = + partition_start_token_idx + TOKENS_PER_WARP * warpid + rowid; + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = (local_token_idx + 2 * i < context_len) + ? dout[token_depth][i] + : -FLT_MAX; + qk_max = fmaxf(qk_max, tmp); + } + } + + qk_max = fmaxf(qk_max, __shfl_xor(qk_max, 16)); + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = (local_token_idx + 2 * i < context_len) + ? 
__expf(dout[token_depth][i] - qk_max) + : 0.0f; + dout[token_depth][i] = tmp; + exp_sum += tmp; + } + } + + exp_sum += __shfl_xor(exp_sum, 16); + + __syncthreads(); + + if (laneid < 16) { + shared_qk_max[warpid][lane16id] = qk_max; + shared_exp_sum[warpid][lane16id] = exp_sum; + } + + __syncthreads(); + + // calculate partition qk_max and exp_sum + float partition_qk_max = -FLT_MAX; + float warp_qk_max_exp[NWARPS]; + float partition_exp_sum = 0.0f; + + #pragma unroll + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = shared_qk_max[w][lane16id]; + partition_qk_max = fmaxf(partition_qk_max, warp_qk_max_exp[w]); + } + + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = __expf(warp_qk_max_exp[w] - partition_qk_max); + partition_exp_sum += shared_exp_sum[w][lane16id] * warp_qk_max_exp[w]; + } + + const float inv_sum_scale = + __fdividef(1.f, partition_exp_sum + 1e-6f) * warp_qk_max_exp[warpid]; + + __syncthreads(); + + // write logits to shared mem + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + dout[token_depth] *= inv_sum_scale; + shared_logits[warpid][token_depth][lane16id][0].xy[rowid] = + from_floatx8(dout[token_depth]); + } + __syncthreads(); + + _B16x8 swp_buf[TLOOP][2]; + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + swp_buf[token_depth][0] = + shared_logits[warpid][token_depth][lane16id][0].xy[0]; + swp_buf[token_depth][1] = + shared_logits[warpid][token_depth][lane16id][0].xy[1]; + } + + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + #pragma unroll + for (int i = 0; i < 8; i++) { + shared_logits[warpid][token_depth][lane16id][0].xy[rowid].u16x8[i] = + swp_buf[token_depth][i % 2].u16x8[4 * rowid + (i / 2)]; + } + } + + // write out partition max_logits and exp_sum + if (threadIdx.x < GQA_RATIO) { + const int qhead_idx = lane16id; + const int offset = seq_idx * total_num_heads * max_num_partitions + + (wg_start_head_idx + qhead_idx) * max_num_partitions + + partition_idx; + max_logits[offset] = partition_qk_max; + exp_sums[offset] = partition_exp_sum; + } + + __syncthreads(); + + _B16x8 outelems[VHELOOP]; + // Softmax V wmma + // v layout: 16he across lanes x (16x2) tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + floatx8 tmp_out = {0}; + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP / 2; + vfetch_depth++) { + const int offset = vfetch_depth; + // if output format is 16 qheads across 16 lanes, 16 head elems spread + // across rows + tmp_out = gcn_wmma16x16x16_instr( + Vlocal[vtoken_depth][vhe_depth][vfetch_depth].u16x16, + shared_logits[vtoken_depth][offset][lane16id][0].u16x16, tmp_out); + } + } + outelems[vhe_depth] = from_floatx8(tmp_out); + } + + __syncthreads(); + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + shared_logits[warpid][vhe_depth][lane16id][0].xy[rowid] = + outelems[vhe_depth]; // lane16 id head dimension; rowid head element + // dimension + } + + __syncthreads(); + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + swp_buf[vhe_depth][0] = shared_logits[warpid][vhe_depth][lane16id][0].xy[0]; + swp_buf[vhe_depth][1] = shared_logits[warpid][vhe_depth][lane16id][0].xy[1]; + } + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + #pragma unroll + for (int i = 0; i < 8; i++) { + shared_logits[warpid][vhe_depth][lane16id][0].xy[rowid].u16x8[i] = + 
swp_buf[vhe_depth][i % 2].u16x8[4 * rowid + (i / 2)]; + } + } + + __syncthreads(); + + // write to tmp_out with coalesced writes after reading from shared mem + if (warpid == 0) { + _B16x8 vout[GQA_RATIO2]; + // each lane writes out 16Bytes of tmp_out along head elem dimension + const int head_elem_idx = lane16id * 8; + if (head_elem_idx < HEAD_SIZE) { + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + const int offset1 = (head_elem_idx / 16) % NWARPS; + const int offset2 = head_elem_idx / 16 / NWARPS; + const int offset3 = (head_elem_idx / 8) % 2; // num_he % num_row + vout[h] = + shared_logits[offset1][offset2][local_head_idx][0].xy[offset3]; + } + + const int hsz_maxp_mult = HEAD_SIZE * max_num_partitions; + scalar_t* out_ptr = out + seq_idx * total_num_heads * hsz_maxp_mult + + partition_idx * HEAD_SIZE; + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + if (local_head_idx < GQA_RATIO) { + const int out_head_idx = wg_start_head_idx + local_head_idx; + scalar_t* out_ptr2 = out_ptr + out_head_idx * hsz_maxp_mult; + scalar_t* out_ptr3 = out_ptr2 + head_elem_idx; + _B16x8* out_ptr_B16x8 = reinterpret_cast<_B16x8*>(out_ptr3); + *out_ptr_B16x8 = vout[h]; + } + } + } + } +} + +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + UNREACHABLE_CODE +} + +// Grid: (num_heads, num_seqs). +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( + OUTT* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { + const auto num_heads = gridDim.x; + const auto head_idx = blockIdx.x; + const auto seq_idx = blockIdx.y; + + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. 
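The reduction below recombines per-partition softmax statistics (max logit, exp-sum, and a partition-normalized partial output) into the final attention output. A NumPy sketch of the same math, with illustrative shapes (the kernel additionally guards the division with a 1e-6 epsilon):

import numpy as np

# One query head, P partitions of T tokens each, head size H (illustrative).
P, T, H = 4, 16, 8
rng = np.random.default_rng(0)
logits = rng.standard_normal((P, T))   # scaled QK^T, split into partitions
V = rng.standard_normal((P, T, H))

# What each partition writes out: its own max, exp-sum, and an output that is
# already normalized by that partition's exp-sum.
max_logits = logits.max(axis=1)
exp_sums = np.exp(logits - max_logits[:, None]).sum(axis=1)
tmp_out = np.einsum("pt,pth->ph",
                    np.exp(logits - max_logits[:, None]) / exp_sums[:, None], V)

# What the reduce kernel computes: rescale every partition to the global max,
# then take the exp-sum-weighted average of the partial outputs.
global_max = max_logits.max()
rescaled = exp_sums * np.exp(max_logits - global_max)
out = (tmp_out * rescaled[:, None]).sum(axis=0) / rescaled.sum()

# Matches one softmax taken over all P*T tokens at once.
w = np.exp(logits.reshape(-1) - global_max)
ref = np.einsum("t,th->h", w / w.sum(), V.reshape(-1, H))
assert np.allclose(out, ref)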
+ if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + + const int context_len = context_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warpid = threadIdx.x / WARP_SIZE; + [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; + + __shared__ float shared_global_exp_sum; + // max num partitions supported is warp_size * NPAR_LOOPS + __shared__ float shared_exp_sums[NPAR_LOOPS * WARP_SIZE]; + + if (warpid == 0) { + const float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + // valid partition is the last valid partition in case threadid > num + // partitions + int valid_partition[NPAR_LOOPS]; + float reg_max_logit[NPAR_LOOPS]; + const int last_valid_partition = num_partitions - 1; + + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + valid_partition[i] = + (partition_no < num_partitions) ? partition_no : last_valid_partition; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + reg_max_logit[i] = max_logits_ptr[valid_partition[i]]; + } + float max_logit = reg_max_logit[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + max_logit = fmaxf(max_logit, reg_max_logit[i]); + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + max_logit = fmaxf(max_logit, __shfl_xor(max_logit, mask)); + } + + const float* exp_sums_ptr = exp_sums + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + float rescaled_exp_sum[NPAR_LOOPS]; + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + rescaled_exp_sum[i] = exp_sums_ptr[valid_partition[i]]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + rescaled_exp_sum[i] *= (partition_no < num_partitions) + ? expf(reg_max_logit[i] - max_logit) + : 0.0f; + } + float global_exp_sum = rescaled_exp_sum[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + global_exp_sum += rescaled_exp_sum[i]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + shared_exp_sums[partition_no] = rescaled_exp_sum[i]; + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + global_exp_sum += __shfl_xor(global_exp_sum, mask); + } + if (threadIdx.x == 0) { + shared_global_exp_sum = global_exp_sum; + } + } // warpid == 0 + const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + threadIdx.x; + constexpr int MAX_NPAR = 32; + scalar_t tmps[MAX_NPAR]; + const float dzero = 0.0f; + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + tmps[j] = from_float(dzero); + } + const int last_partition_offset = (num_partitions - 1) * HEAD_SIZE; + const int num_partition_offset = (num_partitions)*HEAD_SIZE; + int idx = 0; + + constexpr int JCHUNK = 16; + + #pragma unroll + for (int j = 0; j < JCHUNK * HEAD_SIZE; j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? 
j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + __syncthreads(); + + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK * HEAD_SIZE; j < 2 * JCHUNK * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK * HEAD_SIZE; j < MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + } + } // num_partitions > JCHUNK + + // Aggregate tmp_out to out. + float acc = 0.0f; + #pragma unroll + for (int j = 0; j < JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK; j < 2 * JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + } + } + + for (int p = 1; p < NPAR_LOOPS; p++) { + if (num_partitions > p * MAX_NPAR) { + idx = 0; + #pragma unroll + for (int j = p * MAX_NPAR * HEAD_SIZE; j < (p + 1) * MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j + p * MAX_NPAR]; + } + } + } + + const float inv_global_exp_sum = + __fdividef(1.0f, shared_global_exp_sum + 1e-6f); + acc *= inv_global_exp_sum; + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? 
query_start_loc_ptr[seq_idx] : seq_idx); + OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE + + static_cast(head_idx) * HEAD_SIZE; + out_ptr[threadIdx.x] = from_float(acc); +} + +#elif defined(__HIP__GFX12__) + +using floatx8 = __attribute__((__vector_size__(8 * sizeof(float)))) float; + +using bit16_t = uint16_t; +using bit16x4 = __attribute__((__vector_size__(4 * sizeof(uint16_t)))) uint16_t; +typedef bit16x4 _B16x4; + +using bit16x8 = __attribute__((__vector_size__(8 * sizeof(uint16_t)))) uint16_t; +union b16x8_u { + bit16x8 u16x8; + _B16x4 xy[2]; +}; +typedef b16x8_u _B16x8; + +using _B8x8 = uint2; +using bit8_t = uint8_t; + +typedef struct _B8x16 { + _B8x8 xy[2]; +} _B8x16; + +template +__device__ __forceinline__ floatx8 gcn_wmma16x16x16_instr(const bit16x8& inpA, + const bit16x8& inpB, + const floatx8& inpC) { + if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(inpA, inpB, inpC); + } else if constexpr (std::is_same::value) { + return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(inpA, inpB, inpC); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ float to_float(const T& inp) { + if constexpr (std::is_same::value) { + return (float)inp; + } else if constexpr (std::is_same::value) { + return __bfloat162float(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ float to_float_b16(const bit16_t& inp) { + union tmpcvt { + bit16_t u; + _Float16 f; + __hip_bfloat16 b; + } t16; + t16.u = inp; + if constexpr (std::is_same::value) { + return (float)t16.f; + } else if constexpr (std::is_same::value) { + return __bfloat162float(t16.b); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ T from_float(const float& inp) { + if constexpr (std::is_same::value) { + return (_Float16)inp; + } else if constexpr (std::is_same::value) { + return __float2bfloat16(inp); + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +template +__device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) { + if constexpr (std::is_same::value) { + union h2cvt { + __half2 h2[4]; + _B16x8 b16x8; + } u; + u.h2[0] = __float22half2_rn(make_float2(inp[0], inp[1])); + u.h2[1] = __float22half2_rn(make_float2(inp[2], inp[3])); + u.h2[2] = __float22half2_rn(make_float2(inp[4], inp[5])); + u.h2[3] = __float22half2_rn(make_float2(inp[6], inp[7])); + return u.b16x8; + } else if constexpr (std::is_same::value) { + union b2cvt { + __hip_bfloat162 b2[4]; + _B16x8 b16x8; + } u; + + u.b2[0] = __float22bfloat162_rn(make_float2(inp[0], inp[1])); + u.b2[1] = __float22bfloat162_rn(make_float2(inp[2], inp[3])); + u.b2[2] = __float22bfloat162_rn(make_float2(inp[4], inp[5])); + u.b2[3] = __float22bfloat162_rn(make_float2(inp[6], inp[7])); + + return u.b16x8; + } else { + static_assert(false, "unsupported 16b dtype"); + } +} + +// clang-format off +template +__global__ +__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const 
int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + // clang-format on + constexpr int NWARPS = NUM_THREADS / WARP_SIZE; // 8 warps on gfx11 + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + const int lane2id = laneid % 2; + const int lane4id = laneid % 4; + const int lane16id = laneid % 16; + const int rowid = laneid / 16; + + const int seq_idx = blockIdx.x; + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. + if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + const int partition_idx = blockIdx.y; + + constexpr int T_PAR_SIZE = 256; // token partition size set to 256 + + const int max_num_partitions = gridDim.y; + + const int context_len = context_lens[seq_idx]; // length of a seq + + const int partition_start_token_idx = partition_idx * T_PAR_SIZE; + // exit if partition is out of context for seq + if (partition_start_token_idx >= context_len) { + return; + } + + constexpr int GQA_RATIO2 = DIVIDE_ROUND_UP(GQA_RATIO, 2); + + __shared__ float shared_qk_max[NWARPS][16 + 1]; + __shared__ float shared_exp_sum[NWARPS][16 + 1]; + // shared_logits is used for multiple purposes + __shared__ _B16x8 shared_logits[NWARPS][2][16][2]; + + // for QK wmma16x16_gfx12, layout is QHead/Tokenx16 across every 16 lanes, + // 16 Bytes HeadElements in each lane, 2x16B HeadElements across 2 rows of + // warp + constexpr int ROWS_PER_WARP = + WARP_SIZE / 16; // rows refers to 16 lanes; refer dpp terminology + constexpr int CONTIGUOUS_KV_ELEMS_16B_LOAD = + 16 / sizeof(cache_t); // 8 for 16 bit cache type, 16 for 8 bit types + constexpr int QKHE_PER_FETCH = + CONTIGUOUS_KV_ELEMS_16B_LOAD * + ROWS_PER_WARP; // each fetch across a warp fetches these many elements + constexpr int QKHELOOP = HEAD_SIZE / QKHE_PER_FETCH; // 2xQKHE_16B across + // warp + + _B16x8 Qlocal[QKHELOOP]; // note that 16 contiguous elements of Q should + // be fetched per lane for 16 bit cache types + + constexpr int CONTIGUOUS_SCALAR_ELEMS_16B = 16 / sizeof(scalar_t); + + constexpr int TOKENS_PER_WARP = + T_PAR_SIZE / + NWARPS; // sub partition of tokens per warp for qk calculation + constexpr int TLOOP = + TOKENS_PER_WARP / + 16; // each wmma16x16x16 instruction processes 16 tokens + + _B16x8 Klocal[TLOOP] + [QKHELOOP]; // can be interpreted as B8x16 for 8 bit types + + const int wg_start_head_idx = blockIdx.z * GQA_RATIO; + const int wg_start_kv_head_idx = blockIdx.z; + const int total_num_heads = gridDim.z * GQA_RATIO; + + // for QK wmma, tokens in multiples of TOKENS_PER_WARP are spread across warps + // each wmma takes QH16xT16x16HE across warp + // repeat wmma across QKHELOOP dimension + // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens + // across 2 rows x 8 tokens per lane + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? 
query_start_loc_ptr[seq_idx] : seq_idx); + + if (GQA_RATIO == 1) { + const int local_qhead_idx = lane16id % GQA_RATIO; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = q + query_start_off * q_stride + + global_qhead_idx * HEAD_SIZE + + rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD; + if (lane16id < GQA_RATIO) { + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + const scalar_t* q_fetch_ptr = q_ptr + qkhe_depth * QKHE_PER_FETCH; + const _B16x8* q_fetch_ptr_16B = + reinterpret_cast(q_fetch_ptr); + Qlocal[qkhe_depth] = *q_fetch_ptr_16B; + } + } + } else { + // fetch Q in shared across warps and then write to registers + const int local_qhead_idx = 2 * warpid + rowid; + const int global_qhead_idx = wg_start_head_idx + local_qhead_idx; + const scalar_t* q_ptr = + q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE; + + const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B; + if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) { + const scalar_t* q_fetch_ptr = q_ptr + qhead_element; + const _B16x8* q_fetch_ptr_16B = + reinterpret_cast(q_fetch_ptr); + _B16x8 tmp = *q_fetch_ptr_16B; + + const int offset1 = + lane16id / + 2; // 16 contiguous chunks of head elems are spread across 8x2lanes + shared_logits[offset1][lane2id][local_qhead_idx][0] = tmp; + } + + __syncthreads(); + + #pragma unroll + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + Qlocal[qkhe_depth] = + shared_logits[qkhe_depth][rowid][lane16id % GQA_RATIO][0]; + } + } + + const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); + const int last_ctx_block = num_context_blocks - 1; + + const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq; + + int kphysical_block_number[TLOOP]; + + // fetch k physical block numbers + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kblock_idx = (kglobal_token_idx < context_len) + ? 
kglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + kphysical_block_number[token_depth] = block_table_seq[kblock_idx]; + } + + constexpr int KX = 16 / sizeof(cache_t); + const cache_t* k_ptr = k_cache + wg_start_kv_head_idx * kv_head_stride; + + const int row_head_elem = rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD; + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int64_t kblock_number = + static_cast(kphysical_block_number[token_depth]); + const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; + const int klocal_token_idx = + TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; + const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx; + const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; + const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; + + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + const int head_elem = row_head_elem + qkhe_depth * QKHE_PER_FETCH; + const int offset1 = head_elem / KX; + const int offset2 = head_elem % KX; + const cache_t* k_fetch_ptr = k_ptr3 + offset1 * BLOCK_SIZE * KX + offset2; + const _B16x8* k_fetch_ptr_16B = + reinterpret_cast(k_fetch_ptr); + Klocal[token_depth][qkhe_depth] = *k_fetch_ptr_16B; + } + } + + constexpr int VTOKENS_PER_LANE = + TOKENS_PER_WARP / ROWS_PER_WARP; // 32/2 = 16 vtokens per lane + constexpr int VBLOCKS_PER_LANE = 1; // assumes block size >=16 + constexpr int VTLOOP = NWARPS; // corresponds to tokens across warps + constexpr int VTLANELOOP = DIVIDE_ROUND_UP( + VTOKENS_PER_LANE, + CONTIGUOUS_KV_ELEMS_16B_LOAD); // optimized for 16B fetches; assumes + // minimum block size is 16 + constexpr int VHELOOP = DIVIDE_ROUND_UP( + (HEAD_SIZE / 16), NWARPS); // head_size distributed across warps; each + // wmma instr works on 16 head elements + + int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE]; + + // fetch v physical block numbers + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE; + vblock_depth++) { + const int vlocal_token_idx = + vtoken_depth * VTOKENS_PER_LANE * ROWS_PER_WARP + + rowid * VTOKENS_PER_LANE + vblock_depth * BLOCK_SIZE; + const int vglobal_token_idx = + partition_start_token_idx + vlocal_token_idx; + const int vblock_idx = (vglobal_token_idx < context_len) + ? 
vglobal_token_idx / BLOCK_SIZE + : last_ctx_block; + vphysical_block_number[vtoken_depth][vblock_depth] = + block_table_seq[vblock_idx]; + } + } + + _B16x8 Vlocal[VTLOOP][VHELOOP] + [VTLANELOOP]; // this can be interpreted as B8x16 too + + const cache_t* v_ptr = v_cache + wg_start_kv_head_idx * kv_head_stride + + ((rowid * VTOKENS_PER_LANE) % BLOCK_SIZE); + + // v fetches are 16head elems across lanes x 16 tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + const int vhead_elem = vhe_depth * NWARPS * 16 + warpid * 16 + lane16id; + const cache_t* v_ptr2 = v_ptr + vhead_elem * BLOCK_SIZE; + + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) { + const int vblock_depth = 0; + const int64_t vblock_number = static_cast( + vphysical_block_number[vtoken_depth][vblock_depth]); + const cache_t* v_ptr3 = v_ptr2 + (vblock_number * kv_block_stride); + + const cache_t* v_fetch_ptr = + v_ptr3 + vfetch_depth * CONTIGUOUS_KV_ELEMS_16B_LOAD; + const _B16x8* v_fetch_ptr_16B = + reinterpret_cast(v_fetch_ptr); + Vlocal[vtoken_depth][vhe_depth][vfetch_depth] = *v_fetch_ptr_16B; + } + } + } + + floatx8 dout[TLOOP]; + // qk wmma + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + dout[token_depth] = {0}; + for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) { + dout[token_depth] = gcn_wmma16x16x16_instr( + Klocal[token_depth][qkhe_depth].u16x8, Qlocal[qkhe_depth].u16x8, + dout[token_depth]); + } + dout[token_depth] *= scale; + } + + // calculate qk_max and exp_sum per warp and write to shared memory + float qk_max = -FLT_MAX; + float exp_sum = 0.0f; + const int qkout_token_idx = + partition_start_token_idx + TOKENS_PER_WARP * warpid + rowid * 8; + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = + (local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX; + qk_max = fmaxf(qk_max, tmp); + } + } + + qk_max = fmaxf(qk_max, __shfl_xor(qk_max, 16)); + + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + const int local_token_idx = qkout_token_idx + token_depth * 16; + for (int i = 0; i < 8; i++) { + const float tmp = (local_token_idx + i < context_len) + ? 
__expf(dout[token_depth][i] - qk_max) + : 0.0f; + dout[token_depth][i] = tmp; + exp_sum += tmp; + } + } + + exp_sum += __shfl_xor(exp_sum, 16); + + __syncthreads(); + + if (laneid < 16) { + shared_qk_max[warpid][lane16id] = qk_max; + shared_exp_sum[warpid][lane16id] = exp_sum; + } + + __syncthreads(); + + // calculate partition qk_max and exp_sum + float partition_qk_max = -FLT_MAX; + float warp_qk_max_exp[NWARPS]; + float partition_exp_sum = 0.0f; + + #pragma unroll + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = shared_qk_max[w][lane16id]; + partition_qk_max = fmaxf(partition_qk_max, warp_qk_max_exp[w]); + } + + for (int w = 0; w < NWARPS; w++) { + warp_qk_max_exp[w] = __expf(warp_qk_max_exp[w] - partition_qk_max); + partition_exp_sum += shared_exp_sum[w][lane16id] * warp_qk_max_exp[w]; + } + + const float inv_sum_scale = + __fdividef(1.f, partition_exp_sum + 1e-6f) * warp_qk_max_exp[warpid]; + + __syncthreads(); + + // write logits to shared mem + #pragma unroll + for (int token_depth = 0; token_depth < TLOOP; token_depth++) { + dout[token_depth] *= inv_sum_scale; + shared_logits[warpid][token_depth][lane16id][rowid] = + from_floatx8(dout[token_depth]); + } + + // write out partition max_logits and exp_sum + if (threadIdx.x < GQA_RATIO) { + const int qhead_idx = lane16id; + const int offset = seq_idx * total_num_heads * max_num_partitions + + (wg_start_head_idx + qhead_idx) * max_num_partitions + + partition_idx; + max_logits[offset] = partition_qk_max; + exp_sums[offset] = partition_exp_sum; + } + + __syncthreads(); + + _B16x8 outelems[VHELOOP]; + // Softmax V wmma + // v layout: 16he across lanes x 16 tokens per lane + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + floatx8 tmp_out = {0}; + + for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) { + for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) { + const int offset = rowid * VTLANELOOP + vfetch_depth; + const int offset1 = offset % ROWS_PER_WARP; + const int offset2 = offset / ROWS_PER_WARP; + // if output format is 16 qheads across 16 lanes, 16 head elems spread + // across rows + tmp_out = gcn_wmma16x16x16_instr( + Vlocal[vtoken_depth][vhe_depth][vfetch_depth].u16x8, + shared_logits[vtoken_depth][offset2][lane16id][offset1].u16x8, + tmp_out); + } + } + outelems[vhe_depth] = from_floatx8(tmp_out); + } + + __syncthreads(); + + #pragma unroll + for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) { + shared_logits[warpid][vhe_depth][lane16id][rowid] = + outelems[vhe_depth]; // lane16 id head dimension; rowid head element + // dimension + } + + __syncthreads(); + + // write to tmp_out with coalesced writes after reading from shared mem + if (warpid == 0) { + _B16x8 vout[GQA_RATIO2]; + // each lane writes out 16Bytes of tmp_out along head elem dimension + const int head_elem_idx = lane16id * 8; + if (head_elem_idx < HEAD_SIZE) { + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + const int offset1 = (head_elem_idx / 16) % NWARPS; + const int offset2 = head_elem_idx / 16 / NWARPS; + const int offset3 = (head_elem_idx / 8) % 2; // num_he % num_row + vout[h] = shared_logits[offset1][offset2][local_head_idx][offset3]; + } + + const int hsz_maxp_mult = HEAD_SIZE * max_num_partitions; + scalar_t* out_ptr = out + seq_idx * total_num_heads * hsz_maxp_mult + + partition_idx * HEAD_SIZE; + for (int h = 0; h < GQA_RATIO2; h++) { + const int local_head_idx = 2 * h + rowid; + if (local_head_idx < GQA_RATIO) { + const int out_head_idx = 
wg_start_head_idx + local_head_idx; + scalar_t* out_ptr2 = out_ptr + out_head_idx * hsz_maxp_mult; + scalar_t* out_ptr3 = out_ptr2 + head_elem_idx; + _B16x8* out_ptr_B16x8 = reinterpret_cast<_B16x8*>(out_ptr3); + *out_ptr_B16x8 = vout[h]; + } + } + } + } +} + +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, + // head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, + // head_size, block_size] + const int num_kv_heads, const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, const int kv_block_stride, const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, + // head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + UNREACHABLE_CODE +} + +// Grid: (num_heads, num_seqs). +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( + OUTT* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, + // max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, + // max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, + // max_num_partitions, head_size] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { + const auto num_heads = gridDim.x; + const auto head_idx = blockIdx.x; + const auto seq_idx = blockIdx.y; + + // NOTE queries with sequence len > 1 are prefills and taken care by another + // kernel. + if (query_start_loc_ptr != nullptr && + (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) { + return; + } + + const int context_len = context_lens[seq_idx]; + const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); + [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int warpid = threadIdx.x / WARP_SIZE; + [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; + + __shared__ float shared_global_exp_sum; + // max num partitions supported is warp_size * NPAR_LOOPS + __shared__ float shared_exp_sums[NPAR_LOOPS * WARP_SIZE]; + + if (warpid == 0) { + const float* max_logits_ptr = max_logits + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + // valid partition is the last valid partition in case threadid > num + // partitions + int valid_partition[NPAR_LOOPS]; + float reg_max_logit[NPAR_LOOPS]; + const int last_valid_partition = num_partitions - 1; + + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + valid_partition[i] = + (partition_no < num_partitions) ? 
partition_no : last_valid_partition; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + reg_max_logit[i] = max_logits_ptr[valid_partition[i]]; + } + float max_logit = reg_max_logit[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + max_logit = fmaxf(max_logit, reg_max_logit[i]); + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + max_logit = fmaxf(max_logit, __shfl_xor(max_logit, mask)); + } + + const float* exp_sums_ptr = exp_sums + + seq_idx * num_heads * max_num_partitions + + head_idx * max_num_partitions; + + float rescaled_exp_sum[NPAR_LOOPS]; + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + rescaled_exp_sum[i] = exp_sums_ptr[valid_partition[i]]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + rescaled_exp_sum[i] *= (partition_no < num_partitions) + ? expf(reg_max_logit[i] - max_logit) + : 0.0f; + } + float global_exp_sum = rescaled_exp_sum[0]; + #pragma unroll + for (int i = 1; i < NPAR_LOOPS; i++) { + global_exp_sum += rescaled_exp_sum[i]; + } + #pragma unroll + for (int i = 0; i < NPAR_LOOPS; i++) { + const int partition_no = i * WARP_SIZE + threadIdx.x; + shared_exp_sums[partition_no] = rescaled_exp_sum[i]; + } + + #pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + global_exp_sum += __shfl_xor(global_exp_sum, mask); + } + if (threadIdx.x == 0) { + shared_global_exp_sum = global_exp_sum; + } + } // warpid == 0 + const scalar_t* tmp_out_ptr = + tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + threadIdx.x; + constexpr int MAX_NPAR = 32; + scalar_t tmps[MAX_NPAR]; + const float dzero = 0.0f; + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + tmps[j] = from_float(dzero); + } + const int last_partition_offset = (num_partitions - 1) * HEAD_SIZE; + const int num_partition_offset = (num_partitions)*HEAD_SIZE; + int idx = 0; + + constexpr int JCHUNK = 16; + + #pragma unroll + for (int j = 0; j < JCHUNK * HEAD_SIZE; j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + __syncthreads(); + + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK * HEAD_SIZE; j < 2 * JCHUNK * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK * HEAD_SIZE; j < MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + } + } // num_partitions > JCHUNK + + // Aggregate tmp_out to out. 
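The per-thread reduction here is staged: the loads above bring at most MAX_NPAR (32) partial outputs into registers in JCHUNK (16) strides guarded by num_partitions, the accumulation below weights each of them by its rescaled exp-sum from shared memory, and the outer p loop repeats the pass up to NPAR_LOOPS times for longer contexts. The supported context length is therefore bounded by NPAR_LOOPS, the lanes per warp, and the 256-token partition size. Below is a rough sanity check of that arithmetic, assuming the 32-wide wavefronts this gfx11/gfx12 path targets; the constants and helper names are illustrative, not vLLM APIs.

import math

PARTITION_SIZE = 256   # tokens handled per partition by the QKV kernels
WARP_SIZE = 32         # lanes per wavefront assumed for the gfx11/gfx12 path
MAX_NPAR_LOOPS = 16    # largest NPAR_LOOPS the launcher instantiates

def partitions_needed(context_len: int) -> int:
    return math.ceil(context_len / PARTITION_SIZE)

def npar_loops_needed(context_len: int) -> int:
    # Each reduction pass covers WARP_SIZE partitions, one per lane.
    return math.ceil(partitions_needed(context_len) / WARP_SIZE)

max_context = MAX_NPAR_LOOPS * WARP_SIZE * PARTITION_SIZE
assert max_context == 131072          # 128K tokens
assert npar_loops_needed(32768) == 4  # the launcher's max_seq_len gate
assert npar_loops_needed(131072) == 16
print(max_context, npar_loops_needed(32768))

This matches the "16 NPAR_loops * 32 (warp_size) * 256 (partition size) = 128K context length" note in the Navi launcher further down.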
+ float acc = 0.0f; + #pragma unroll + for (int j = 0; j < JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > JCHUNK) { + #pragma unroll + for (int j = JCHUNK; j < 2 * JCHUNK; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + if (num_partitions > 2 * JCHUNK) { + #pragma unroll + for (int j = 2 * JCHUNK; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j]; + } + } + } + + for (int p = 1; p < NPAR_LOOPS; p++) { + if (num_partitions > p * MAX_NPAR) { + idx = 0; + #pragma unroll + for (int j = p * MAX_NPAR * HEAD_SIZE; j < (p + 1) * MAX_NPAR * HEAD_SIZE; + j += HEAD_SIZE) { + // lastj is last valid partition + const int lastj_offset = + (j < num_partition_offset) ? j : last_partition_offset; + tmps[idx] = tmp_out_ptr[lastj_offset]; + idx++; + } + + #pragma unroll + for (int j = 0; j < MAX_NPAR; j++) { + acc += to_float(tmps[j]) * shared_exp_sums[j + p * MAX_NPAR]; + } + } + } + + const float inv_global_exp_sum = + __fdividef(1.0f, shared_global_exp_sum + 1e-6f); + acc *= inv_global_exp_sum; + + const int64_t query_start_off = static_cast( + query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx); + OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE + + static_cast(head_idx) * HEAD_SIZE; + out_ptr[threadIdx.x] = from_float(acc); +} + +#else + +// clang-format off +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + const int num_kv_heads, + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, + const int kv_block_stride, + const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + UNREACHABLE_CODE +} + +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel( + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + const int num_kv_heads, + const float scale, + const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_blocks_per_seq, + const float* __restrict__ alibi_slopes, // [num_heads] + const int q_stride, + const int kv_block_stride, + const int kv_head_stride, + float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] + scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + 
OUTT* __restrict__ final_out, // [num_seqs, num_heads, head_size] + int max_ctx_blocks, const float* k_scale, const float* v_scale) { + UNREACHABLE_CODE +} + +// Grid: (num_heads, num_seqs). +template +__global__ +__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( + OUTT* __restrict__ out, // [num_seqs, num_heads, head_size] + const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] + const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] + const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] + const int* __restrict__ context_lens, // [num_seqs] + const int* __restrict__ query_start_loc_ptr, // [num_seqs] + const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) { + UNREACHABLE_CODE +} +// clang-format on + +#endif + +#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \ + paged_attention_ll4mi_QKV_mfma16_kernel \ + <<>>( \ + query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ + block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ + kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ + max_ctx_blocks, k_scale_ptr, v_scale_ptr); + +#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \ + paged_attention_ll4mi_QKV_mfma4_kernel \ + <<>>( \ + query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ + block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \ + max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \ + kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \ + max_ctx_blocks, k_scale_ptr, v_scale_ptr); + +#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \ + paged_attention_ll4mi_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \ + context_lens_ptr, query_start_loc_ptr, max_num_partitions, \ + fp8_out_scale_ptr); + +template +void paged_attention_custom_launcher( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, const int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& context_lens, + const std::optional& query_start_loc, int max_context_len, + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale, const std::optional& fp8_out_scale) { + int num_seqs = block_tables.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + // NOTE: query start location is optional for V0 decode should not be used. + // If batch contains mix of prefills and decode, prefills should be skipped. + const int* query_start_loc_ptr = + query_start_loc + ? reinterpret_cast(query_start_loc.value().data_ptr()) + : nullptr; + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* context_lens_ptr = context_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); + // NOTE: fp8_out_scale is optional. + const auto fp8_out_scale_ptr = + fp8_out_scale + ? static_cast(fp8_out_scale.value().data_ptr()) + : nullptr; + OUTT* out_ptr = reinterpret_cast(out.data_ptr()); + + const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + + // partition size is fixed at 256 since both mfma4 and mfma16 kernels support + // it mfma4 kernel also supports partition size 512 + constexpr int PARTITION_SIZE = 256; + const int max_num_partitions = + DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); + const int gqa_ratio = num_heads / num_kv_heads; + assert(num_heads % num_kv_heads == 0); + assert(head_size == HEAD_SIZE); + + constexpr int NTHR = 256; + dim3 grid(num_seqs, max_num_partitions, num_kv_heads); + dim3 block(NTHR); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // mfma4 kernel is faster than mfma16 for gqa_ratio <= 4 + switch (gqa_ratio) { + case 1: + LAUNCH_CUSTOM_ATTENTION_MFMA4(1); + break; + case 2: + LAUNCH_CUSTOM_ATTENTION_MFMA4(2); + break; + case 3: + LAUNCH_CUSTOM_ATTENTION_MFMA4(3); + break; + case 4: + LAUNCH_CUSTOM_ATTENTION_MFMA4(4); + break; case 5: LAUNCH_CUSTOM_ATTENTION_MFMA16(5); break; @@ -1744,13 +3251,195 @@ void paged_attention_custom_launcher( } } -#define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, \ - PSIZE, ALIBI_ENABLED) \ - paged_attention_custom_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ - max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); +template +void paged_attention_custom_launcher_navi( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, const int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& context_lens, + const std::optional& query_start_loc, int max_context_len, + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale) { + int num_seqs = block_tables.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + // NOTE: query start location is optional for V0 decode should not be used. + // If batch contains mix of prefills and decode, prefills should be skipped. + const int* query_start_loc_ptr = + query_start_loc + ? reinterpret_cast(query_start_loc.value().data_ptr()) + : nullptr; + + // NOTE: Navi does not support alibi_slopes. 
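Kernel selection differs between the two launchers: the gfx9 launcher above dispatches the mfma4 kernel for GQA ratios 1-4, where it is noted as faster, and the mfma16 kernel for ratios 5-16, while paged_attention_custom_launcher_navi below always instantiates the wmma16-based mfma16 kernel and, as the note above says, also pins the ALiBi slopes pointer (and the fp8 output scale) to nullptr in the code that follows; the Python-side gate in use_rocm_custom_paged_attention, updated further down in this patch, mirrors those restrictions for gfx11/gfx12 by requiring no ALiBi and kv_cache_dtype == "auto". Below is a small sketch of that dispatch decision, for illustration only; pick_kernel and the Kernel enum are not vLLM APIs.

from enum import Enum

class Kernel(Enum):
    MFMA4 = "paged_attention_ll4mi_QKV_mfma4_kernel"
    MFMA16 = "paged_attention_ll4mi_QKV_mfma16_kernel"

def pick_kernel(gqa_ratio: int, is_navi: bool) -> Kernel:
    if not 1 <= gqa_ratio <= 16:
        raise ValueError(f"Unsupported gqa ratio: {gqa_ratio}")
    if is_navi:
        # The Navi launcher only instantiates the wmma16x16x16 kernel,
        # for every supported GQA ratio.
        return Kernel.MFMA16
    # On gfx9 the mfma4 kernel is faster for gqa_ratio <= 4.
    return Kernel.MFMA4 if gqa_ratio <= 4 else Kernel.MFMA16

assert pick_kernel(4, is_navi=False) is Kernel.MFMA4
assert pick_kernel(8, is_navi=False) is Kernel.MFMA16
assert pick_kernel(3, is_navi=True) is Kernel.MFMA16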
+ const float* alibi_slopes_ptr = nullptr; + + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + KVT* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* context_lens_ptr = context_lens.data_ptr(); + + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); + // NOTE: Navi does not support fp8. + const auto fp8_out_scale_ptr = nullptr; + OUTT* out_ptr = reinterpret_cast(out.data_ptr()); + + const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); + + constexpr int PARTITION_SIZE = 256; + const int max_num_partitions = + DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); + const int gqa_ratio = num_heads / num_kv_heads; + assert(num_heads % num_kv_heads == 0); + assert(head_size == HEAD_SIZE); + + constexpr int NTHR = 256; + dim3 grid(num_seqs, max_num_partitions, num_kv_heads); + dim3 block(NTHR); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (gqa_ratio) { + case 1: + LAUNCH_CUSTOM_ATTENTION_MFMA16(1); + break; + case 2: + LAUNCH_CUSTOM_ATTENTION_MFMA16(2); + break; + case 3: + LAUNCH_CUSTOM_ATTENTION_MFMA16(3); + break; + case 4: + LAUNCH_CUSTOM_ATTENTION_MFMA16(4); + break; + case 5: + LAUNCH_CUSTOM_ATTENTION_MFMA16(5); + break; + case 6: + LAUNCH_CUSTOM_ATTENTION_MFMA16(6); + break; + case 7: + LAUNCH_CUSTOM_ATTENTION_MFMA16(7); + break; + case 8: + LAUNCH_CUSTOM_ATTENTION_MFMA16(8); + break; + case 9: + LAUNCH_CUSTOM_ATTENTION_MFMA16(9); + break; + case 10: + LAUNCH_CUSTOM_ATTENTION_MFMA16(10); + break; + case 11: + LAUNCH_CUSTOM_ATTENTION_MFMA16(11); + break; + case 12: + LAUNCH_CUSTOM_ATTENTION_MFMA16(12); + break; + case 13: + LAUNCH_CUSTOM_ATTENTION_MFMA16(13); + break; + case 14: + LAUNCH_CUSTOM_ATTENTION_MFMA16(14); + break; + case 15: + LAUNCH_CUSTOM_ATTENTION_MFMA16(15); + break; + case 16: + LAUNCH_CUSTOM_ATTENTION_MFMA16(16); + break; + default: + TORCH_CHECK(false, "Unsupported gqa ratio: ", gqa_ratio); + break; + } + + dim3 reduce_grid(num_heads, num_seqs); + dim3 reduce_block(head_size); + const int warp_size = 32; + const int npar_loops = DIVIDE_ROUND_UP(max_num_partitions, warp_size); + // reduction kernel supports upto 16 NPAR_loops * 32 (warp_size) * 256 + // (partition size) = 128K context length + switch (npar_loops) { + case 1: + LAUNCH_CUSTOM_REDUCTION(1); + break; + case 2: + LAUNCH_CUSTOM_REDUCTION(2); + break; + case 3: + LAUNCH_CUSTOM_REDUCTION(3); + break; + case 4: + LAUNCH_CUSTOM_REDUCTION(4); + break; + case 5: + LAUNCH_CUSTOM_REDUCTION(5); + break; + case 6: + LAUNCH_CUSTOM_REDUCTION(6); + break; + case 7: + LAUNCH_CUSTOM_REDUCTION(7); + break; + case 8: + LAUNCH_CUSTOM_REDUCTION(8); + break; + case 9: + LAUNCH_CUSTOM_REDUCTION(9); + break; + case 10: + LAUNCH_CUSTOM_REDUCTION(10); + break; + case 11: + LAUNCH_CUSTOM_REDUCTION(11); + break; + case 12: + LAUNCH_CUSTOM_REDUCTION(12); + break; + case 13: + LAUNCH_CUSTOM_REDUCTION(13); + break; + case 14: + LAUNCH_CUSTOM_REDUCTION(14); + break; + case 15: + LAUNCH_CUSTOM_REDUCTION(15); + break; + case 16: + LAUNCH_CUSTOM_REDUCTION(16); + break; + default: + TORCH_CHECK(false, "Unsupported npar_loops: ", 
npar_loops); + break; + } +} + +#define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, \ + PSIZE, ALIBI_ENABLED) \ + if (!is_navi) { \ + paged_attention_custom_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ + max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \ + } else { \ + paged_attention_custom_launcher_navi< \ + T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, context_lens, query_start_loc, \ + max_context_len, alibi_slopes, k_scale, v_scale); \ + } #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \ OUTT, PSIZE) \ @@ -1807,6 +3496,24 @@ void paged_attention_custom_launcher( break; \ } +bool is_navi_gpu() { + static bool is_cached = false; + static bool result; + + if (!is_cached) { + int device_id; + hipDeviceProp_t deviceProp; + hipGetDevice(&device_id); + hipGetDeviceProperties(&deviceProp, device_id); + + std::string arch = deviceProp.gcnArchName; + result = arch.find("gfx11") == 0 || arch.find("gfx12") == 0; + is_cached = true; + } + + return result; +} + // clang-format off void paged_attention( torch::Tensor& out, // [num_seqs, num_heads, head_size] @@ -1827,6 +3534,8 @@ void paged_attention( torch::Tensor& v_scale, const std::optional& fp8_out_scale) { // clang-format on + bool is_navi = is_navi_gpu(); + const int head_size = query.size(2); if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Half) { diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index e5650136f258..d9f956fbc7c0 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -148,6 +148,11 @@ def test_paged_attention( or (version == "rocm" and head_size not in (64, 128))): pytest.skip() + if (version == "rocm" and current_platform.is_navi() + and (kv_cache_dtype == "fp8" or head_size != 128 + or block_size != 16 or use_alibi)): + pytest.skip() + global PARTITION_SIZE current_platform.seed_everything(seed) @@ -275,6 +280,7 @@ def test_paged_attention( scale, block_tables, seq_lens, + None, block_size, max_seq_len, alibi_slopes, @@ -286,7 +292,7 @@ def test_paged_attention( opcheck(torch.ops._rocm_C.paged_attention, (output, exp_sums, max_logits, tmp_output, query, key_cache, value_cache, num_kv_heads, scale, block_tables, - seq_lens, block_size, max_seq_len, alibi_slopes, + seq_lens, None, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, k_scale, v_scale), cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0])) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 8076c4791d3c..abcb68911a8b 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -861,7 +861,8 @@ def forward( gqa_ratio = num_heads // self.num_kv_heads use_custom = use_rocm_custom_paged_attention( decode_query.dtype, head_size, block_size, gqa_ratio, - decode_meta.max_decode_seq_len, self.sliding_window) + decode_meta.max_decode_seq_len, self.sliding_window, + self.kv_cache_dtype, self.alibi_slopes) if use_custom: max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type != AttentionType.ENCODER_DECODER else diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py 
index 217db3bf965d..785799b6bf68 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -283,7 +283,8 @@ def chunked_prefill_paged_decode( use_custom = use_rocm_custom_paged_attention(query.dtype, head_size, block_size, num_queries_per_kv, - max_seq_len, sliding_window) + max_seq_len, sliding_window, + kv_cache_dtype, alibi_slopes) if use_custom: _PARTITION_SIZE_ROCM = 256 max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) // diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index c8b86087578d..3c73843c3416 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -102,26 +102,42 @@ def on_mi250_mi300() -> bool: @cache -def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, - block_size: int, gqa_ratio: int, - max_seq_len: int, - sliding_window: int) -> bool: +def use_rocm_custom_paged_attention( + qtype: torch.dtype, + head_size: int, + block_size: int, + gqa_ratio: int, + max_seq_len: int, + sliding_window: int, + kv_cache_dtype: str, + alibi_slopes: Optional[torch.Tensor] = None) -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) + ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"]) - # rocm custom page attention not support on gfx1* # custom paged attn always supported on V0. On V1, requires sliding window # disabled due to observed numerical discrepancy. - return (ON_GFX9 and (not envs.VLLM_USE_V1 or sliding_window == 0 - or sliding_window == (-1, -1)) - and (qtype == torch.half or qtype == torch.bfloat16) - and (head_size == 64 or head_size == 128) - and (block_size == 16 or block_size == 32) - and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 - and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) - and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN - and envs.VLLM_ROCM_USE_AITER)) + if ON_GFX9: + return ((not envs.VLLM_USE_V1 or sliding_window == 0 + or sliding_window == (-1, -1)) + and (qtype == torch.half or qtype == torch.bfloat16) + and (head_size == 64 or head_size == 128) + and (block_size == 16 or block_size == 32) + and (gqa_ratio >= 1 and gqa_ratio <= 16) + and max_seq_len <= 32768 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) + and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN + and envs.VLLM_ROCM_USE_AITER)) + + else: + return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0 + or sliding_window == (-1, -1)) + and (qtype == torch.half or qtype == torch.bfloat16) + and head_size == 128 and block_size == 16 + and (gqa_ratio >= 3 and gqa_ratio <= 16) + and max_seq_len <= 32768 and alibi_slopes is None + and kv_cache_dtype == "auto" + and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) class RocmPlatform(Platform): @@ -362,3 +378,7 @@ def use_custom_allreduce(cls) -> bool: def get_cu_count(cls, device_id: int = 0) -> int: return torch.cuda.get_device_properties( device_id).multi_processor_count + + @classmethod + def is_navi(cls) -> bool: + return 'gfx1' in torch.cuda.get_device_properties(0).gcnArchName From 7c1213e848bc53008710cb715b4b514c081c9897 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 21 May 2025 16:40:32 +0000 Subject: [PATCH 036/192] Remove incorrect env value --- docker/Dockerfile.rocm | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 39b4dd9640d9..fe8d9cf23d7b 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -114,12 +114,6 @@ ENV 
TOKENIZERS_PARALLELISM=false # ENV that can improve safe tensor loading, and end-to-end time ENV SAFETENSORS_FAST_GPU=1 -# User-friendly environment setting for multi-processing to avoid below RuntimeError. -# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, -# you must use the 'spawn' start method -# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn - # Performance environment variable. ENV HIP_FORCE_DEV_KERNARG=1 From bb0a3112130a077e557b36632de3f1b5836b4c40 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 21 May 2025 18:25:23 +0100 Subject: [PATCH 037/192] Revert "[v1] Support multiple KV cache groups in GPU model runner (#17945) (#18459) Signed-off-by: Mark McLoughlin --- tests/v1/core/test_kv_cache_utils.py | 71 +---- tests/v1/core/test_prefix_caching.py | 36 +-- tests/v1/worker/test_gpu_input_batch.py | 39 +-- tests/v1/worker/test_gpu_model_runner.py | 57 ++-- .../v1/shared_storage_connector.py | 6 +- .../attention/backends/mla/rocm_aiter_mla.py | 4 +- vllm/v1/core/kv_cache_manager.py | 34 +-- vllm/v1/core/kv_cache_utils.py | 13 +- vllm/v1/core/sched/output.py | 12 +- vllm/v1/core/sched/scheduler.py | 16 +- vllm/v1/kv_cache_interface.py | 42 --- vllm/v1/worker/block_table.py | 47 --- vllm/v1/worker/gpu_input_batch.py | 13 +- vllm/v1/worker/gpu_model_runner.py | 270 ++++++++---------- vllm/v1/worker/tpu_model_runner.py | 35 ++- 15 files changed, 214 insertions(+), 481 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 43a27da2dbe4..1e2767e2d198 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -19,8 +19,7 @@ hash_request_tokens, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor, - SlidingWindowSpec) + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -55,14 +54,12 @@ def new_kv_cache_spec(block_size=16, num_kv_heads=2, head_size=64, dtype=torch.float32, - use_mla=False, - sliding_window=None): + use_mla=False): return FullAttentionSpec(block_size=block_size, num_kv_heads=num_kv_heads, head_size=head_size, dtype=dtype, - use_mla=use_mla, - sliding_window=sliding_window) + use_mla=use_mla) def test_none_hash(monkeypatch): @@ -495,68 +492,6 @@ def test_unify_kv_cache_configs(): unify_kv_cache_configs(diff_kv_cache_config) -def test_merge_kv_cache_spec(): - same_layer_specs = [ - new_kv_cache_spec(num_kv_heads=32), - new_kv_cache_spec(num_kv_heads=32), - ] - merged_layer_spec = same_layer_specs[0].merge(same_layer_specs) - assert merged_layer_spec.block_size == 16 - assert merged_layer_spec.num_kv_heads == 32 - assert merged_layer_spec.head_size == 64 - assert merged_layer_spec.dtype == torch.float32 - assert merged_layer_spec.sliding_window is None - - different_layer_specs = [ - new_kv_cache_spec(num_kv_heads=32), - new_kv_cache_spec(num_kv_heads=16), - ] - with pytest.raises(AssertionError): - different_layer_specs[0].merge(different_layer_specs) - - full_spec = new_kv_cache_spec(num_kv_heads=32) - different_type_layer_specs = [ - full_spec, - SlidingWindowSpec( - block_size=full_spec.block_size, - num_kv_heads=full_spec.num_kv_heads, - head_size=full_spec.head_size, - dtype=full_spec.dtype, - use_mla=full_spec.use_mla, - sliding_window=1, - ), - ] - with pytest.raises(AssertionError): - 
different_type_layer_specs[0].merge(different_type_layer_specs) - with pytest.raises(AssertionError): - different_type_layer_specs[1].merge(different_type_layer_specs) - - different_sliding_window_layer_specs = [ - new_kv_cache_spec(num_kv_heads=32), - new_kv_cache_spec(num_kv_heads=32, sliding_window=1), - new_kv_cache_spec(num_kv_heads=32, sliding_window=2), - ] - with pytest.raises(ValueError): - different_sliding_window_layer_specs[0].merge( - different_sliding_window_layer_specs) - - same_sliding_window_layer_specs = [ - new_kv_cache_spec(num_kv_heads=32, sliding_window=1), - new_kv_cache_spec(num_kv_heads=32, sliding_window=1), - ] - merged_layer_spec = same_sliding_window_layer_specs[0].merge( - same_sliding_window_layer_specs) - assert merged_layer_spec.sliding_window == 1 - - same_sliding_window_layer_spec_with_none = [ - new_kv_cache_spec(num_kv_heads=32, sliding_window=1), - new_kv_cache_spec(num_kv_heads=32, sliding_window=None), - ] - merged_layer_spec = same_sliding_window_layer_spec_with_none[0].merge( - same_sliding_window_layer_spec_with_none) - assert merged_layer_spec.sliding_window == 1 - - @pytest.mark.parametrize( ("model_id", "max_model_len", "want_estimated_max_len"), [ ("Qwen/Qwen1.5-7B", 16385, 16384), diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 3da27786b1f2..2d7411381e16 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -84,7 +84,7 @@ def test_prefill(hash_algo): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == [1, 2, 3, 4] # Check full block metadata parent_block_hash = None @@ -107,13 +107,13 @@ def test_prefill(hash_algo): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [[1, 2, 3]] + assert computed_blocks.get_block_ids() == [1, 2, 3] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[5]] + assert blocks.get_block_ids() == [5] for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -141,13 +141,13 @@ def test_prefill(hash_algo): req2 = make_request("2", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(manager.req_to_block_hashes[req2.request_id]) == 3 - assert computed_blocks.get_block_ids() == [[1, 2, 3]] + assert computed_blocks.get_block_ids() == [1, 2, 3] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req2, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[6]] + assert blocks.get_block_ids() == [6] # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. @@ -171,7 +171,7 @@ def test_prefill(hash_algo): len(computed_blocks.blocks) * 16, computed_blocks) # This block ID order also checks the eviction order. 
- assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]] + assert blocks.get_block_ids() == [7, 8, 9, 10, 4, 5, 6, 3, 2, 1] assert manager.block_pool.free_block_queue.num_free_blocks == 0 assert manager.block_pool.free_block_queue.free_list_head is None assert manager.block_pool.free_block_queue.free_list_tail is None @@ -208,7 +208,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == [1, 2, 3, 4] req0_block_hashes = [b.block_hash for b in blocks.blocks] # Check full block metadata @@ -233,13 +233,13 @@ def test_prefill_plp(): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [[1, 2, 3]] + assert computed_blocks.get_block_ids() == [1, 2, 3] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[5]] + assert blocks.get_block_ids() == [5] for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -277,11 +277,11 @@ def test_prefill_plp(): block_ids = blocks.get_block_ids() # Duplicate cached blocks have different ids but same hashes vs request #0 assert [b.block_hash for b in blocks.blocks] == req0_block_hashes - assert block_ids != [[1, 2, 3, 4]] + assert block_ids != [1, 2, 3, 4] # Request #2 block hashes are valid since request #0 hashes are. # Check block reference counts. - for block_id in block_ids[0]: + for block_id in block_ids: assert manager.block_pool.blocks[block_id].ref_cnt == 1 manager.free(req2) @@ -307,7 +307,7 @@ def test_decode(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == [1, 2, 3, 4] # Append slots without allocating a new block. req0.num_computed_tokens = 55 @@ -379,12 +379,12 @@ def test_evict(): # Touch the first 2 blocks. req2 = make_request("2", list(range(2 * 16 + 3))) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert computed_blocks.get_block_ids() == [[1, 2]] + assert computed_blocks.get_block_ids() == [1, 2] assert num_computed_tokens == 2 * 16 blocks = manager.allocate_slots(req2, 3, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[10]] + assert blocks.get_block_ids() == [10] assert manager.block_pool.free_block_queue.num_free_blocks == 7 @@ -625,7 +625,7 @@ def test_mm_prefix_caching(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == [1, 2, 3, 4] req0.num_computed_tokens = 59 # Append slots without allocating a new block. @@ -686,7 +686,7 @@ def test_cache_key_salting(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == [1, 2, 3, 4] req0.num_computed_tokens = 59 # Append slots without allocating a new block. 
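The test churn in this hunk and the surrounding ones comes down to one data-shape change being reverted: get_block_ids() on the blocks returned by allocate_slots() goes back to a single flat list of block IDs instead of one list per KV cache group, so expectations like [[1, 2, 3, 4]] become [1, 2, 3, 4]. Below is a minimal sketch of the two shapes, for illustration only; flatten_single_group is not a vLLM helper, it just shows that the grouped form collapses to the flat form while there is exactly one KV cache group, which is what this revert assumes.

# 'grouped' mirrors the multi-group shape being reverted (one list of block
# IDs per KV cache group); 'flat' is the shape the tests assert again here.
grouped: list[list[int]] = [[1, 2, 3, 4]]   # one KV cache group
flat: list[int] = [1, 2, 3, 4]

def flatten_single_group(block_ids: list[list[int]]) -> list[int]:
    """Collapse the grouped form to the flat form; only valid while there is
    exactly one KV cache group."""
    assert len(block_ids) == 1
    return block_ids[0]

assert flatten_single_group(grouped) == flat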
@@ -797,7 +797,7 @@ def test_reset_prefix_cache(): all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) blocks = manager.allocate_slots(req0, 55) - assert blocks.get_block_ids() == [[1, 2, 3, 4]] + assert blocks.get_block_ids() == [1, 2, 3, 4] unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids @@ -808,7 +808,7 @@ def test_reset_prefix_cache(): blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [[5]] + assert blocks.get_block_ids() == [5] # Failed to reset prefix cache because some blocks are not freed yet. assert not manager.reset_prefix_cache() diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 638f5bedcfca..7b1359c8576f 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -9,11 +9,9 @@ from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable -from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.worker.gpu_input_batch import (BlockTable, CachedRequestState, + InputBatch) VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 @@ -24,27 +22,6 @@ MAX_NUM_PROMPT_TOKENS = 64 -def get_kv_cache_config() -> KVCacheConfig: - return KVCacheConfig( - num_blocks=10, - tensors={ - "layer.0": KVCacheTensor(size=1024), - }, - kv_cache_groups=[ - KVCacheGroupSpec( - layer_names=["layer.0"], - kv_cache_spec=FullAttentionSpec( - block_size=1, - num_kv_heads=1, - head_size=16, - dtype=torch.float16, - use_mla=False, - ), - ), - ], - ) - - def _compare_objs(obj1, obj2): attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a))) attr_names = set([ @@ -64,10 +41,6 @@ def _compare_objs(obj1, obj2): elif isinstance(a, np.ndarray): if np.allclose(a, b): is_same = True - elif isinstance(a, MultiGroupBlockTable): - for a_i, b_i in zip(a.block_tables, b.block_tables): - _compare_objs(a_i, b_i) - is_same = True elif isinstance(a, (BlockTable, SamplingMetadata)): _compare_objs(a, b) is_same = True # if we make it here must be same @@ -225,7 +198,7 @@ def _construct_cached_request_state(req_id_suffix: int): sampling_params=_create_sampling_params(), mm_inputs=[], mm_positions=[], - block_ids=[[]], + block_ids=[], generator=None, num_computed_tokens=len(output_token_ids), output_token_ids=output_token_ids, @@ -247,11 +220,11 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, + max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - kv_cache_config=get_kv_cache_config(), ) reqs: list[CachedRequestState] = [] req_id_reqs = {} @@ -337,20 +310,20 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, + max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - kv_cache_config=get_kv_cache_config(), ) ref_input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, + 
max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - kv_cache_config=get_kv_cache_config(), ) reqs: list[CachedRequestState] = [] diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index e44660525763..725747294fd8 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,16 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +import weakref import pytest +import torch -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VllmConfig) +from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.sampling_params import SamplingParams from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) -from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor) +from vllm.v1.kv_cache_interface import FullAttentionSpec from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -18,34 +17,13 @@ def initialize_kv_cache(runner: GPUModelRunner): """ Only perform necessary steps in GPUModelRunner.initialize_kv_cache() """ - kv_cache_config = KVCacheConfig( - num_blocks=10, - tensors={ - "layer.0": KVCacheTensor(size=1024), - }, - kv_cache_groups=[ - KVCacheGroupSpec( - layer_names=["layer.0"], - kv_cache_spec=FullAttentionSpec( - block_size=16, - num_kv_heads=runner.model_config.get_num_kv_heads( - runner.parallel_config), - head_size=runner.model_config.get_head_size(), - dtype=runner.kv_cache_dtype, - use_mla=False, - )) - ]) - runner.kv_cache_config = kv_cache_config - runner.input_batch = InputBatch( - max_num_reqs=runner.max_num_reqs, - max_model_len=runner.max_model_len, - max_num_batched_tokens=runner.max_num_tokens, - device=runner.device, - pin_memory=runner.pin_memory, - vocab_size=runner.model_config.get_vocab_size(), - kv_cache_config=kv_cache_config, - ) - runner.initialize_attn_backend(kv_cache_config) + kv_cache_spec = FullAttentionSpec(block_size=16, + num_kv_heads=1, + head_size=64, + dtype=torch.float16, + use_mla=False) + runner.attn_metadata_builder = runner.attn_backend.get_builder_cls()( + weakref.proxy(runner), kv_cache_spec, runner.input_batch.block_table) @pytest.fixture @@ -70,12 +48,10 @@ def model_runner(): swap_space=0, cache_dtype="auto", ) - parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, cache_config=cache_config, scheduler_config=scheduler_config, - parallel_config=parallel_config, ) device = "cuda" @@ -97,7 +73,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[[0]], + block_ids=[0], num_computed_tokens=0, lora_request=None, )) @@ -135,14 +111,13 @@ def _is_sampling_metadata_changed(model_runner, def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: req_index = model_runner.input_batch.req_id_to_index[req_id] - block_table = model_runner.input_batch.block_table[0] + block_table = model_runner.input_batch.block_table req_state = model_runner.requests[req_id] - if block_table.num_blocks_per_row[req_index] != len( - req_state.block_ids[0]): + if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids): return False num_blocks = block_table.num_blocks_per_row[req_index] return 
(block_table.block_table_np[req_index, :num_blocks] == - req_state.block_ids[0]).all() + req_state.block_ids).all() def test_update_states_new_request(model_runner): @@ -225,7 +200,7 @@ def test_update_states_request_resumed(model_runner): req_id=req_id, resumed_from_preemption=False, new_token_ids=[], - new_block_ids=[[]], + new_block_ids=[], num_computed_tokens=0, ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 0421a65a2c81..0fedb6fd5ed9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -288,7 +288,7 @@ def build_connector_meta( for new_req in scheduler_output.scheduled_new_reqs: if new_req.req_id in self._requests_need_load: meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids[0], + block_ids=new_req.block_ids, block_size=self._block_size, is_store=False) total_need_load += 1 @@ -299,7 +299,7 @@ def build_connector_meta( # the original prompt tokens. if not self._found_match_for_request(new_req): meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids[0], + block_ids=new_req.block_ids, block_size=self._block_size, is_store=True) @@ -319,7 +319,7 @@ def build_connector_meta( # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. - block_ids = cached_req.new_block_ids[0] + block_ids = cached_req.new_block_ids meta.add_request(token_ids=token_ids, block_ids=block_ids, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 7ce39110ac01..3abb185c5b8f 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -67,13 +67,13 @@ def __init__(self, runner, kv_cache_spec: AttentionSpec, max_model_len = self.runner.model_config.max_model_len assert max_model_len == 32768,\ "AITER MLA requires max_model_len=32768" - assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ + assert self.runner.block_size == 1, "AITER MLA" \ "only supports block size 1." def _get_paged_kv_tensors( self, block_table: torch.Tensor, seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]: - page_size = self.kv_cache_spec.block_size + page_size = self.runner.block_size block_table_bounds = (seq_lens + page_size - 1) // page_size mask = (torch.arange(block_table.size(1), diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index da18ece7555a..598fc871110e 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -32,16 +32,9 @@ def create_empty(cls) -> "KVCacheBlocks": """Creates a new KVCacheBlocks instance with no blocks.""" return cls([]) - def get_block_ids(self) -> list[list[int]]: - """ - Converts the KVCacheBlocks instance to block_ids. 
- - Returns: - list[list[int]]: A two-level list where - * the outer list corresponds to KV cache groups (only 1 group now) - * each inner list contains the block_ids of the blocks in that group - """ - return [[block.block_id for block in self.blocks]] + def get_block_ids(self) -> list[int]: + """Converts the KVCacheBlocks instance to a list of block IDs.""" + return [block.block_id for block in self.blocks] def get_unhashed_block_ids(self) -> list[int]: """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" @@ -307,9 +300,9 @@ def get_num_common_prefix_blocks( self, request: Request, num_running_requests: int, - ) -> list[int]: + ) -> int: """Calculate the number of common prefix blocks shared by all requests - in the RUNNING state for each kv cache group. + in the RUNNING state. The function determines this by selecting any request and iterating through its blocks. A block is considered a common prefix block if its @@ -339,14 +332,11 @@ def get_num_common_prefix_blocks( requests in the current step. Returns: - list[int]: The number of common prefix blocks for each kv cache - group. + int: The number of common prefix blocks. """ assert request.status == RequestStatus.RUNNING - return [ - self.single_type_manager.get_num_common_prefix_blocks( - request.request_id, num_running_requests) - ] + return self.single_type_manager.get_num_common_prefix_blocks( + request.request_id, num_running_requests) def free_block_hashes(self, request: Request) -> None: """Discard the block hashes for the request. @@ -364,8 +354,10 @@ def take_events(self) -> list[KVCacheEvent]: """ return self.block_pool.take_events() - def get_block_ids(self, request_id: str) -> list[list[int]]: + def get_block_ids(self, request_id: str) -> list[int]: """Get the block ids of a request.""" assert request_id in self.single_type_manager.req_to_blocks - return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id] - ).get_block_ids() + return [ + block.block_id + for block in self.single_type_manager.req_to_blocks[request_id] + ] diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 403b5401be75..27c515835087 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -577,12 +577,14 @@ def create_kv_cache_group_specs( """ kv_cache_groups = [] for layer_names_one_group in grouped_layer_names: - layer_specs = [ - kv_cache_spec[layer_name] for layer_name in layer_names_one_group - ] - merged_layer_spec = layer_specs[0].merge(layer_specs) + layer_spec = kv_cache_spec[layer_names_one_group[0]] + assert all( + kv_cache_spec[layer_name] == layer_spec + for layer_name in layer_names_one_group[1:]), ( + "All layers in the same KV cache group must share the same " + "KVCacheSpec.") kv_cache_groups.append( - KVCacheGroupSpec(layer_names_one_group, merged_layer_spec)) + KVCacheGroupSpec(layer_names_one_group, layer_spec)) return kv_cache_groups @@ -681,7 +683,6 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): head_size=spec.head_size, dtype=spec.dtype, use_mla=spec.use_mla, - sliding_window=spec.sliding_window, ) diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 257234430983..24032498e50b 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -26,7 +26,7 @@ class NewRequestData: mm_hashes: list[str] mm_positions: list[PlaceholderRange] sampling_params: SamplingParams - block_ids: list[list[int]] + block_ids: list[int] num_computed_tokens: int lora_request: Optional[LoRARequest] @@ 
-34,7 +34,7 @@ class NewRequestData: def from_request( cls, request: Request, - block_ids: list[list[int]], + block_ids: list[int], ) -> NewRequestData: return cls( req_id=request.request_id, @@ -85,7 +85,7 @@ class CachedRequestData: # request's block IDs instead of appending to the existing block IDs. resumed_from_preemption: bool new_token_ids: list[int] - new_block_ids: list[list[int]] + new_block_ids: list[int] num_computed_tokens: int @classmethod @@ -94,7 +94,7 @@ def from_request( request: Request, resumed_from_preemption: bool, new_token_ids: list[int], - new_block_ids: list[list[int]], + new_block_ids: list[int], ) -> CachedRequestData: return cls( req_id=request.request_id, @@ -131,9 +131,9 @@ class SchedulerOutput: # E.g., if a request has [0, 1], it could mean the vision encoder needs # to process that the request's 0-th and 1-th images in the current step. scheduled_encoder_inputs: dict[str, list[int]] - # Number of common prefix blocks for all requests in each KV cache group. + # Number of common prefix blocks for all requests. # This can be used for cascade attention. - num_common_prefix_blocks: list[int] + num_common_prefix_blocks: int # Request IDs that are finished in between the previous and the current # steps. This is used to notify the workers about the finished requests diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d8fd67e232cb..2152409019b9 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -173,7 +173,7 @@ def schedule(self) -> SchedulerOutput: # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, list[list[int]]] = {} + req_to_new_block_ids: dict[str, list[int]] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -484,8 +484,7 @@ def schedule(self) -> SchedulerOutput: # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. - num_common_prefix_blocks = [0] * len( - self.kv_cache_config.kv_cache_groups) + num_common_prefix_blocks = 0 if self.running: any_request = self.running[0] num_common_prefix_blocks = ( @@ -572,7 +571,7 @@ def _make_cached_request_data( request: Request, num_scheduled_tokens: int, num_scheduled_spec_tokens: int, - new_block_ids: list[list[int]], + new_block_ids: list[int], resumed_from_preemption: bool, ) -> CachedRequestData: # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -947,9 +946,7 @@ def _connector_finished( """ if self.connector is None: return False, None - assert len(self.kv_cache_config.kv_cache_groups - ) == 1, "KV connector only supports one KV cache group now" - block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] + block_ids = self.kv_cache_manager.get_block_ids(request.request_id) return self.connector.request_finished(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: @@ -966,10 +963,9 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: """ if request.request_id not in self.finished_recving_kv_req_ids: return False - assert len(self.kv_cache_config.kv_cache_groups - ) == 1, "KV connector only supports one KV cache group now" + # Now that the blocks are ready, actually cache them. 
- block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] + block_ids = self.kv_cache_manager.get_block_ids(request.request_id) num_computed_tokens = len(block_ids) * self.block_size if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 2747fc7fabd1..4fc0844cd1f4 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,11 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -import copy from dataclasses import dataclass -from typing import Optional import torch -from typing_extensions import Self from vllm.config import VllmConfig from vllm.logger import init_logger @@ -56,16 +53,6 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: """ raise NotImplementedError - @classmethod - def merge(cls, specs: list[Self]) -> Self: - """ - Merge a list of KVCacheSpec objects into a single KVCacheSpec object. - """ - assert all(spec.type_id == specs[0].type_id for spec in specs[1:]), ( - "All layers in the same KV cache group must share the same " - "type_id.") - return copy.deepcopy(specs[0]) - @dataclass class AttentionSpec(KVCacheSpec): @@ -84,16 +71,6 @@ def page_size_bytes(self) -> int: @dataclass class FullAttentionSpec(AttentionSpec): - sliding_window: Optional[int] = None - """ - When hybrid allocator is disabled and the model contains both full - attention layers and sliding window attention layers, sliding - window attention are regarded as full attention in KV cache manager - (blocks are allocated for all tokens), while computed as sliding window - attention in model runner. - In this case, we use FullAttentionSpec and record the sliding window size. - Default to None for not using sliding window attention. - """ @property def type_id(self) -> str: @@ -103,25 +80,6 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len return cdiv(max_model_len, self.block_size) * self.page_size_bytes - @classmethod - def merge(cls, specs: list[Self]) -> Self: - """ - Merge a list of FullAttentionSpec objects into a single - FullAttentionSpec object. 
- """ - merged_spec = super().merge(specs) - sliding_window = set(spec.sliding_window for spec in specs - if spec.sliding_window is not None) - if len(sliding_window) == 0: - merged_spec.sliding_window = None - elif len(sliding_window) == 1: - merged_spec.sliding_window = sliding_window.pop() - else: - raise ValueError( - "All sliding window layers in the same KV cache group " - "must have the same window size.") - return merged_spec - @dataclass class SlidingWindowSpec(AttentionSpec): diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 0c3341691509..581d3d9bd11b 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,8 +4,6 @@ import torch from vllm.logger import init_logger -from vllm.utils import cdiv -from vllm.v1.kv_cache_interface import KVCacheConfig logger = init_logger(__name__) @@ -98,48 +96,3 @@ def get_cpu_tensor(self) -> torch.Tensor: def get_numpy_array(self) -> np.ndarray: """Returns the numpy array of the block table.""" return self.block_table_np - - -class MultiGroupBlockTable: - """The BlockTables for each KV cache group.""" - - def __init__(self, max_num_reqs: int, max_model_len: int, - max_num_batched_tokens: int, pin_memory: bool, - device: torch.device, kv_cache_config: KVCacheConfig) -> None: - max_num_blocks_per_req = [ - cdiv(max_model_len, g.kv_cache_spec.block_size) - for g in kv_cache_config.kv_cache_groups - ] - self.block_tables = [ - BlockTable(max_num_reqs, max_num_blocks_per_req[i], - max_num_batched_tokens, pin_memory, device) - for i in range(len(kv_cache_config.kv_cache_groups)) - ] - - def append_row(self, block_ids: list[list[int]], row_idx: int) -> None: - for i, block_table in enumerate(self.block_tables): - block_table.append_row(block_ids[i], row_idx) - - def add_row(self, block_ids: list[list[int]], row_idx: int) -> None: - for i, block_table in enumerate(self.block_tables): - block_table.add_row(block_ids[i], row_idx) - - def move_row(self, src: int, tgt: int) -> None: - for block_table in self.block_tables: - block_table.move_row(src, tgt) - - def swap_row(self, src: int, tgt: int) -> None: - for block_table in self.block_tables: - block_table.swap_row(src, tgt) - - def commit(self, num_reqs: int) -> None: - for block_table in self.block_tables: - block_table.commit(num_reqs) - - def clear(self) -> None: - for block_table in self.block_tables: - block_table.clear() - - def __getitem__(self, idx: int) -> "BlockTable": - """Returns the BlockTable for the i-th KV cache group.""" - return self.block_tables[idx] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 570de9bddd29..871654fca366 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -11,11 +11,10 @@ from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values -from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import LogprobsTensors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import copy_slice -from vllm.v1.worker.block_table import MultiGroupBlockTable +from vllm.v1.worker.block_table import BlockTable _SAMPLING_EPS = 1e-5 @@ -30,7 +29,7 @@ class CachedRequestState: sampling_params: SamplingParams generator: Optional[torch.Generator] - block_ids: list[list[int]] + block_ids: list[int] num_computed_tokens: int output_token_ids: list[int] @@ -59,14 +58,15 @@ def __init__( self, max_num_reqs: int, 
max_model_len: int, + max_num_blocks_per_req: int, max_num_batched_tokens: int, device: torch.device, pin_memory: bool, vocab_size: int, - kv_cache_config: KVCacheConfig, ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len + self.max_num_blocks_per_req = max_num_blocks_per_req self.max_num_batched_tokens = max_num_batched_tokens self.device = device self.pin_memory = pin_memory @@ -99,13 +99,12 @@ def __init__( self.num_computed_tokens_cpu_tensor.numpy() # Block table. - self.block_table = MultiGroupBlockTable( + self.block_table = BlockTable( max_num_reqs=max_num_reqs, - max_model_len=max_model_len, + max_num_blocks_per_req=max_num_blocks_per_req, max_num_batched_tokens=max_num_batched_tokens, pin_memory=pin_memory, device=device, - kv_cache_config=kv_cache_config, ) # Sampling-related. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 201796c96ee5..e26f97d816ae 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -12,8 +12,6 @@ import torch.nn as nn from vllm.attention import AttentionType, get_attn_backend -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadataBuilder) from vllm.attention.layer import Attention from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import (CompilationLevel, VllmConfig, @@ -34,8 +32,8 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, cdiv, check_use_alibi, - is_pin_memory_available) + GiB_bytes, LayerBlockType, LazyLoader, cdiv, + check_use_alibi, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -53,7 +51,6 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.utils import is_spec_decode_supported from vllm.v1.utils import bind_kv_cache -from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin @@ -105,17 +102,59 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] + # NOTE(woosuk): sliding_window is None for models with interleaved + # attention. Use interleaved_sliding_window instead. + self.sliding_window = model_config.get_sliding_window() + self.interleaved_sliding_window = getattr( + model_config.hf_text_config, "interleaved_sliding_window", None) + self.window_size = (self.sliding_window + or self.interleaved_sliding_window) + self.is_multimodal_model = model_config.is_multimodal_model + self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len + self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs # Model-related. 
+ self.num_attn_layers = model_config.get_num_layers_by_block_type( + parallel_config, LayerBlockType.attention) self.num_query_heads = model_config.get_num_attention_heads( parallel_config) + self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() self.attention_chunk_size = model_config.attention_chunk_size + self.attn_backend = get_attn_backend( + self.head_size, + self.dtype, + self.kv_cache_dtype, + self.block_size, + self.model_config.is_attention_free, + use_mla=self.model_config.use_mla, + ) + if self.attn_backend is None: + error_msg = ( + f"Error with get_att_backend: {self.head_size=}, " + f"{self.dtype=}, {self.kv_cache_dtype=}, {self.block_size=}, " + f"{self.model_config.is_attention_free=}, " + f"{self.model_config.use_mla=}") + logger.error(error_msg) + raise NotImplementedError( + "Non-Attention backend is not supported by V1 GPUModelRunner.") + + if self.vllm_config.compilation_config.full_cuda_graph: + attn_backend_name = self.attn_backend.__name__ + flash_attn_version = get_flash_attn_version() + if attn_backend_name != "FlashAttentionBackend" or \ + flash_attn_version != 3: + raise ValueError( + f"full_cuda_graph is only supported with " + f"FA3. Current attention backend is {attn_backend_name}, " + f"FlashAttention version is {flash_attn_version}.") + self.cascade_attn_enabled = not self.model_config.disable_cascade_attn # Multi-modal data support @@ -137,10 +176,8 @@ def __init__( # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] - self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] - self.attn_backends: list[type[AttentionBackend]] = [] # self.kv_cache_config: KVCacheConfig - # self.input_batch: InputBatch # Persistent batch. + # self.attn_metadata_builder: type[AttentionMetadataBuilder] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} @@ -169,6 +206,16 @@ def __init__( # Request states. self.requests: dict[str, CachedRequestState] = {} + # Persistent batch. + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_blocks_per_req=self.max_num_blocks_per_req, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=model_config.get_vocab_size(), + ) self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE @@ -263,31 +310,6 @@ def __init__( pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() - def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: - """ - Update the order of requests in the batch based on the attention - backend's needs. For example, some attention backends (namely MLA) may - want to separate requests based on if the attention computation will be - compute-bound or memory-bound. - - Args: - scheduler_output: The scheduler output. - - Returns: - True if the batch was reordered, False otherwise. - """ - batch_reordered = self.attn_metadata_builders[0].reorder_batch( - self.input_batch, scheduler_output) - - # For models with multiple KV cache groups, the groups should agree on - # the same order of requests. We ensure this by only allowing the first - # group to reorder the batch and asserting that all other groups do not - # reorder the batch. 
- for i in range(1, len(self.kv_cache_config.kv_cache_groups)): - assert not self.attn_metadata_builders[i].reorder_batch( - self.input_batch, scheduler_output) - return batch_reordered - def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -424,8 +446,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - for i in range(len(self.kv_cache_config.kv_cache_groups)): - req_state.block_ids[i].extend(req_data.new_block_ids[i]) + req_state.block_ids.extend(req_data.new_block_ids) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones. @@ -483,7 +504,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - batch_reordered = self._may_reorder_batch(scheduler_output) + # Some attention backends (namely MLA) may want to separate requests + # based on if the attention computation will be compute-bound or + # memory-bound. This gives them a hook to do that. + batch_reordered = self.attn_metadata_builder.reorder_batch( + self.input_batch, scheduler_output) if batch_changed or batch_reordered: self.input_batch.refresh_sampling_metadata() @@ -551,29 +576,21 @@ def _prepare_inputs( torch.from_numpy(token_indices), out=self.input_ids_cpu[:total_num_scheduled_tokens]) - # Calculate the slot mapping for each KV cache group. - for kv_cache_group_id, kv_cache_group_spec in enumerate( - self.kv_cache_config.kv_cache_groups): - block_size = kv_cache_group_spec.kv_cache_spec.block_size - block_table: BlockTable = self.input_batch.block_table[ - kv_cache_group_id] - # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] - # where K is the max_num_blocks_per_req and the block size is 2. - # NOTE(woosuk): We can't simply use `token_indices // block_size` - # here because M (max_model_len) is not necessarily divisible by - # block_size. - block_table_indices = ( - req_indices * block_table.max_num_blocks_per_req + - positions_np // block_size) - block_table_cpu = block_table.get_cpu_tensor() - block_numbers = block_table_cpu.flatten( - )[block_table_indices].numpy() - block_offsets = positions_np % block_size - np.add( - block_numbers * block_size, - block_offsets, - out=block_table.slot_mapping_np[:total_num_scheduled_tokens]) + # Calculate the slot mapping. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` here + # because M (max_model_len) is not necessarily divisible by block_size. + block_table_indices = (req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size) + block_table_cpu = self.input_batch.block_table.get_cpu_tensor() + block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() + block_offsets = positions_np % self.block_size + np.add(block_numbers * self.block_size, + block_offsets, + out=self.input_batch.block_table. + slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. 
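The slot-mapping arithmetic above can be checked in isolation with toy values; the block-table contents, the block size of 2, and max_num_blocks_per_req of 4 below are made up for illustration and are not taken from the runner.

import numpy as np

block_size = 2
max_num_blocks_per_req = 4
# Two requests, each holding a padded row of physical block numbers.
block_table = np.array([[10, 11, 12, 13],
                        [20, 21, 22, 23]], dtype=np.int32)
# Request 0 schedules positions 0..2, request 1 schedules positions 0..1.
req_indices = np.array([0, 0, 0, 1, 1])
positions = np.array([0, 1, 2, 0, 1])

block_table_indices = (req_indices * max_num_blocks_per_req
                       + positions // block_size)
block_numbers = block_table.flatten()[block_table_indices]
slot_mapping = block_numbers * block_size + positions % block_size
# Request 0 hits blocks 10, 10, 11 -> slots 20, 21, 22;
# request 1 hits block 20 -> slots 40, 41.
assert slot_mapping.tolist() == [20, 21, 22, 40, 41]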
self.query_start_loc_np[0] = 0 @@ -615,6 +632,10 @@ def _prepare_inputs( attn_metadata: dict[str, FlashAttentionMetadata] = {} # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. + # NOTE(Chen): there is exactly one KV cache group that contains all + # attetnion layers in the model for now, so the current logic for + # getting attn_metadata is not related to kv_cache_group information. + # Will extend this part to support multiple KV cache groups later. for kv_cache_group_id, kv_cache_group_spec in enumerate( self.kv_cache_config.kv_cache_groups): @@ -623,19 +644,15 @@ def _prepare_inputs( if self.cascade_attn_enabled: common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, - scheduler_output. - num_common_prefix_blocks[kv_cache_group_id], - kv_cache_group_spec.kv_cache_spec, - self.attn_metadata_builders[kv_cache_group_id], + scheduler_output.num_common_prefix_blocks, ) - attn_metadata_i = ( - self.attn_metadata_builders[kv_cache_group_id].build( - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, - common_prefix_len=common_prefix_len, - common_attn_metadata=common_attn_metadata)) + attn_metadata_i = self.attn_metadata_builder.build( + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata) for layer_name in kv_cache_group_spec.layer_names: attn_metadata[layer_name] = attn_metadata_i @@ -673,8 +690,6 @@ def _compute_cascade_attn_prefix_len( self, num_scheduled_tokens: np.ndarray, num_common_prefix_blocks: int, - kv_cache_spec: KVCacheSpec, - attn_metadata_builder: AttentionMetadataBuilder, ) -> int: """Compute the length of the common prefix for cascade attention. @@ -693,7 +708,7 @@ def _compute_cascade_attn_prefix_len( Returns: int: Length of common prefix in tokens. """ - common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size + common_prefix_len = num_common_prefix_blocks * self.block_size if common_prefix_len == 0: # Common case. return 0 @@ -742,19 +757,15 @@ def _compute_cascade_attn_prefix_len( common_prefix_len, self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) # common_prefix_len should be a multiple of the block size. 
- common_prefix_len = (common_prefix_len // kv_cache_spec.block_size * - kv_cache_spec.block_size) - use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or - (isinstance(kv_cache_spec, FullAttentionSpec) - and kv_cache_spec.sliding_window is not None)) - assert isinstance(kv_cache_spec, AttentionSpec) - use_cascade = attn_metadata_builder.use_cascade_attention( + common_prefix_len = (common_prefix_len // self.block_size * + self.block_size) + use_cascade = self.attn_metadata_builder.use_cascade_attention( common_prefix_len=common_prefix_len, query_lens=num_scheduled_tokens, num_query_heads=self.num_query_heads, - num_kv_heads=kv_cache_spec.num_kv_heads, + num_kv_heads=self.num_kv_heads, use_alibi=self.use_alibi, - use_sliding_window=use_sliding_window, + use_sliding_window=self.window_size is not None, num_sms=self.num_sms, ) return common_prefix_len if use_cascade else 0 @@ -1640,7 +1651,7 @@ def _dummy_run( dtype=np.int32) if skip_attn: - attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None + attn_metadata = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] seq_lens = self.seq_lens[:num_reqs] @@ -1648,19 +1659,13 @@ def _dummy_run( common_attn_metadata = CommonAttentionMetadata( query_start_loc=query_start_loc, seq_lens=seq_lens) - attn_metadata = {} - for kv_cache_group_id, kv_cache_group_spec in enumerate( - self.kv_cache_config.kv_cache_groups): - attn_metadata_i = ( - self.attn_metadata_builders[kv_cache_group_id].build( - num_reqs=num_tokens, - num_actual_tokens=num_tokens, - max_query_len=num_tokens, - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - )) - for layer_name in kv_cache_group_spec.layer_names: - attn_metadata[layer_name] = attn_metadata_i + attn_metadata = self.attn_metadata_builder.build( + num_reqs=num_tokens, + num_actual_tokens=num_tokens, + max_query_len=num_tokens, + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): @@ -1890,56 +1895,6 @@ def capture_model(self) -> None: logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) - def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: - """ - Initialize the attention backends and attention metadata builders. 
- """ - assert len(self.attn_backends) == 0 and len( - self.attn_metadata_builders - ) == 0, "Attention backends are already initialized" - for i, kv_cache_group_spec in enumerate( - kv_cache_config.kv_cache_groups): - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - if not isinstance(kv_cache_spec, AttentionSpec): - raise NotImplementedError( - "Only AttentionSpec is supported for now.") - attn_backend_i = get_attn_backend( - kv_cache_spec.head_size, - self.dtype, - kv_cache_spec.dtype, - kv_cache_spec.block_size, - self.model_config.is_attention_free, - use_mla=kv_cache_spec.use_mla, - ) - if attn_backend_i is None: - error_msg = ( - f"Error with get_attn_backend: {kv_cache_spec.head_size=}, " - f"{self.dtype=}, {kv_cache_spec.dtype=}, " - f"{kv_cache_spec.block_size=}, " - f"{self.model_config.is_attention_free=}, " - f"{kv_cache_spec.use_mla=}") - logger.error(error_msg) - raise NotImplementedError( - "Non-Attention backend is not supported by V1 " - "GPUModelRunner.") - - if self.vllm_config.compilation_config.full_cuda_graph: - attn_backend_name = attn_backend_i.__name__ - flash_attn_version = get_flash_attn_version() - if attn_backend_name != "FlashAttentionBackend" or \ - flash_attn_version != 3: - raise ValueError( - f"full_cuda_graph is only supported with " - f"FA3. Current attention backend is " - f"{attn_backend_name}, FlashAttention version is " - f"{flash_attn_version}.") - - block_table_i = self.input_batch.block_table[i] - attn_metadata_builder_i = attn_backend_i.get_builder_cls()( - weakref.proxy(self), kv_cache_spec, block_table_i) - self.attn_backends.append(attn_backend_i) - self.attn_metadata_builders.append(attn_metadata_builder_i) - def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. @@ -1947,21 +1902,15 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ + if len(kv_cache_config.kv_cache_groups) > 1: + raise NotImplementedError( + "Hybrid models with more than one KV cache type are not " + "supported yet.") self.kv_cache_config = kv_cache_config - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.model_config.get_vocab_size(), - kv_cache_config=kv_cache_config, - ) - self.initialize_attn_backend(kv_cache_config) kv_caches: dict[str, torch.Tensor] = {} - for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups): + for kv_cache_group in kv_cache_config.kv_cache_groups: kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: tensor_config = kv_cache_config.tensors[layer_name] @@ -1976,7 +1925,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: # the min of all `num_blocks`. Verify it here. 
assert num_blocks >= kv_cache_config.num_blocks if isinstance(kv_cache_spec, AttentionSpec): - kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( + kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype @@ -1996,6 +1945,11 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: if has_kv_transfer_group(): get_kv_transfer_group().register_kv_caches(kv_caches) + self.attn_metadata_builder = self.attn_backend.get_builder_cls()( + weakref.proxy(self), + kv_cache_config.kv_cache_groups[0].kv_cache_spec, + self.input_batch.block_table) + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 2da99696445e..b4daf5a34678 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -171,10 +171,19 @@ def __init__( self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} - # self.input_batch: InputBatch # Persistent batch. # Request states. self.requests: dict[str, CachedRequestState] = {} + # Persistent batch. + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_blocks_per_req=self.max_num_blocks_per_req, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.vocab_size, + ) # Cached torch/numpy tensor # The pytorch tensor and numpy array share the same buffer. @@ -190,7 +199,7 @@ def __init__( self.block_table_cpu = torch.zeros( (self.max_num_reqs, self.max_num_blocks_per_req), - dtype=torch.int32, + dtype=self.input_batch.block_table.get_cpu_tensor().dtype, device="cpu") self.query_start_loc_cpu = torch.zeros(self.max_num_tokens + 1, @@ -515,12 +524,12 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. - block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() + block_table_cpu = self.input_batch.block_table.get_cpu_tensor() block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, - out=self.input_batch.block_table[0]. + out=self.input_batch.block_table. slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. @@ -545,15 +554,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): self.position_ids = self.positions_cpu[: padded_total_num_scheduled_tokens].to( self.device) - self.input_batch.block_table[0].slot_mapping_cpu[ + self.input_batch.block_table.slot_mapping_cpu[ total_num_scheduled_tokens:] = _PAD_SLOT_ID slot_mapping = ( - self.input_batch.block_table[0]. + self.input_batch.block_table. 
slot_mapping_cpu[:padded_total_num_scheduled_tokens].to( self.device)) block_tables = self.block_table_cpu[:self.max_num_reqs] block_tables[:num_reqs, :self.max_num_blocks_per_req] = ( - self.input_batch.block_table[0].get_cpu_tensor()[:num_reqs]) + self.input_batch.block_table.get_cpu_tensor()[:num_reqs]) block_tables = block_tables.to(self.device) query_start_loc = self.query_start_loc_cpu[:self.max_num_reqs + 1].to( self.device) @@ -1254,18 +1263,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: "Hybrid models with more than one KV cache type are not " "supported yet.") - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.model_config.get_vocab_size(), - kv_cache_config=kv_cache_config, - ) - assert self.block_table_cpu.dtype == self.input_batch.block_table[ - 0].get_cpu_tensor().dtype - kv_caches: dict[str, torch.Tensor] = {} for kv_cache_group in kv_cache_config.kv_cache_groups: From 94d8ec8d2bcb4ec55e33022b313c7e978edf05e1 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Thu, 22 May 2025 01:34:28 +0800 Subject: [PATCH 038/192] [FEAT][ROCm] Upgrade AITER MLA v1 backend (#18338) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: vllmellm Co-authored-by: Luka Govedič --- docker/Dockerfile.rocm_base | 2 +- .../attention/backends/mla/rocm_aiter_mla.py | 36 +++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 222b9c158e5e..45efcbde698b 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="5a77249" +ARG AITER_BRANCH="c1debd8" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 3abb185c5b8f..56ac834b4d7e 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -53,6 +53,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata): # The number of entries in the last page of each request in # the paged kv cache, shape: [batch_size] paged_kv_last_page_len: Optional[torch.Tensor] = None + # The query indptr, shape : [num_decode + 1] + qo_indptr: Optional[torch.Tensor] = None class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): @@ -75,27 +77,33 @@ def _get_paged_kv_tensors( seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]: page_size = self.runner.block_size block_table_bounds = (seq_lens + page_size - 1) // page_size + device = self.runner.device mask = (torch.arange(block_table.size(1), dtype=block_table.dtype, - device=block_table.device).unsqueeze(0) + device=device).unsqueeze(0) < block_table_bounds.unsqueeze(1)) paged_kv_indices = block_table[mask] paged_kv_indptr = torch.cat([ - torch.zeros(1, - dtype=block_table_bounds.dtype, - device=block_table_bounds.device), + torch.zeros(1, dtype=block_table_bounds.dtype, device=device), block_table_bounds.cumsum(dim=0, dtype=torch.int32) ]) paged_kv_last_page_len = seq_lens % page_size paged_kv_last_page_len = 
torch.where(paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len) + qo_indptr = torch.arange(0, + self._num_decodes + 1, + step=1, + dtype=torch.int32, + device=device) + return ( paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len, + qo_indptr, ) def _build_decode(self, block_table_tensor: torch.Tensor, @@ -105,6 +113,7 @@ def _build_decode(self, block_table_tensor: torch.Tensor, paged_kv_indices, paged_kv_indptr, paged_last_page_len, + qo_indptr, ) = self._get_paged_kv_tensors(block_table_tensor, seq_lens) attn_metadata = AiterMLADecodeMetadata( @@ -112,7 +121,8 @@ def _build_decode(self, block_table_tensor: torch.Tensor, seq_lens=seq_lens, paged_kv_indptr=paged_kv_indptr, paged_kv_indices=paged_kv_indices, - paged_kv_last_page_len=paged_last_page_len) + paged_kv_last_page_len=paged_last_page_len, + qo_indptr=qo_indptr) return attn_metadata @@ -137,7 +147,10 @@ def __init__( alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, **mla_args) - + assert (num_heads == 16 or num_heads == 128), ( + f"Aiter MLA only supports 16 or 128 number of heads.\n" + f"Provided {num_heads} number of heads.\n" + "Try adjusting tensor_parallel_size value.") unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap ] @@ -189,7 +202,18 @@ def _forward_decode( kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2) + if self.num_heads == 16: + # AITER MLA decode kernel only supports + # max_seqlen_q=1 when using 16 heads. + max_seqlen_qo = 1 + else: + # AITER MLA decode Kernel handles arbitrary + # max_seqlen_q values when using 128 heads. + assert attn_metadata.prefill is not None + max_seqlen_qo = attn_metadata.prefill.max_query_len + aiter_mla_decode_fwd(q, kv_buffer, o, self.scale, + attn_metadata.decode.qo_indptr, max_seqlen_qo, attn_metadata.decode.paged_kv_indptr, attn_metadata.decode.paged_kv_indices, attn_metadata.decode.paged_kv_last_page_len) From 1f079540db5f1080a2f61a730da50d3009934c5a Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Wed, 21 May 2025 22:41:23 +0200 Subject: [PATCH 039/192] [Bugfix] Consistent ascii handling in tool parsers (#17704) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sebastian Schönnenbeck --- .../tool_parsers/granite_20b_fc_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/granite_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/internlm2_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/jamba_tool_parser.py | 16 ++++++++++------ .../openai/tool_parsers/llama_tool_parser.py | 12 ++++++++---- .../openai/tool_parsers/phi4mini_tool_parser.py | 9 +++++---- .../openai/tool_parsers/pythonic_tool_parser.py | 9 ++++++--- 7 files changed, 53 insertions(+), 29 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index b93de6b41817..a589f814f88f 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -80,7 +80,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -166,7 +167,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = 
current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -218,7 +220,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -226,7 +229,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 6710e7938c43..b8bf142530ee 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -67,7 +67,8 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]), + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), ), ) for function_call in raw_function_calls ] @@ -151,7 +152,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) argument_diff = cur_args_json[sent:] @@ -197,7 +199,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -205,7 +208,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( prev_args_json, cur_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 5abd553d884d..3f2799f8010a 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -133,7 +133,8 @@ def extract_tool_calls_streaming( delta = None # first time to get parameters elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) arguments_delta = cur_arguments_json[:cur_arguments_json. 
index(delta_text) + @@ -148,8 +149,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta # both prev and cur parameters, send the increase parameters elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) argument_diff = extract_intermediate_diff( cur_args_json, prev_args_json) @@ -190,7 +193,8 @@ def extract_tool_calls( action_dict = json.loads(action) name, parameters = action_dict['name'], json.dumps( action_dict.get('parameters', action_dict.get('arguments', - {}))) + {})), + ensure_ascii=False) if not tools or name not in [t.function.name for t in tools]: ExtractedToolCallInformation(tools_called=False, diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 6cac6f8163bf..fbe2ecbb4701 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -96,8 +96,9 @@ def extract_tool_calls( function=FunctionCall( name=function_call["name"], # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"]))) - for function_call in raw_function_calls + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False), + )) for function_call in raw_function_calls ] content = model_output[:model_output. @@ -187,7 +188,7 @@ def extract_tool_calls_streaming( diff: Union[str, None] = current_tool_call.get("arguments") if diff: - diff = json.dumps(diff).replace( + diff = json.dumps(diff, ensure_ascii=False).replace( self.streamed_args_for_tool[self.current_tool_id], "") delta = DeltaMessage(tool_calls=[ @@ -248,7 +249,8 @@ def extract_tool_calls_streaming( "mid-arguments") delta = None elif cur_arguments and not prev_arguments: - cur_arguments_json = json.dumps(cur_arguments) + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) logger.debug("finding %s in %s", new_text, cur_arguments_json) @@ -267,8 +269,10 @@ def extract_tool_calls_streaming( self.current_tool_id] += arguments_delta elif cur_arguments and prev_arguments: - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) logger.debug("Searching for diff between \n%s\n%s", cur_args_json, prev_args_json) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 9307034f40d6..9338718908cd 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -88,7 +88,8 @@ def extract_tool_calls( # function call args are JSON but as a string arguments=json.dumps(raw_function_call["arguments"] \ if "arguments" in raw_function_call \ - else raw_function_call["parameters"]))) + else raw_function_call["parameters"], + ensure_ascii=False))) for raw_function_call in function_call_arr ] @@ -174,7 +175,8 @@ def extract_tool_calls_streaming( if self.current_tool_id >= 0: cur_arguments = current_tool_call.get("arguments") if cur_arguments: - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) sent = len( self.streamed_args_for_tool[self.current_tool_id]) 
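The effect of consistently passing ensure_ascii=False is that non-ASCII characters in tool-call arguments survive serialization instead of being \u-escaped; a quick check with made-up arguments:

import json

args = {"location": "Zürich", "unit": "°C"}
assert json.dumps(args) == '{"location": "Z\\u00fcrich", "unit": "\\u00b0C"}'
assert json.dumps(args, ensure_ascii=False) == '{"location": "Zürich", "unit": "°C"}'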
argument_diff = cur_args_json[sent:] @@ -226,7 +228,8 @@ def extract_tool_calls_streaming( if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - cur_args_json = json.dumps(cur_arguments) + cur_args_json = json.dumps(cur_arguments, + ensure_ascii=False) prev_arguments = self.prev_tool_call_arr[ self.current_tool_id].get("arguments") @@ -234,7 +237,8 @@ def extract_tool_calls_streaming( if is_complete[self.current_tool_id]: argument_diff = cur_args_json[sent:] elif prev_arguments: - prev_args_json = json.dumps(prev_arguments) + prev_args_json = json.dumps(prev_arguments, + ensure_ascii=False) if cur_args_json != prev_args_json: prefix = find_common_prefix( diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index abf70a5e85c4..e4ac2c47ba08 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -79,10 +79,11 @@ def extract_tool_calls( name=raw_function_call["name"], # function call args are JSON but as a string arguments=json.dumps( - raw_function_call["arguments"] if "arguments" in - raw_function_call else - raw_function_call["parameters"]))) - for raw_function_call in function_call_arr + raw_function_call["arguments"] + if "arguments" in raw_function_call else + raw_function_call["parameters"], + ensure_ascii=False), + )) for raw_function_call in function_call_arr ] # get any content before the tool call diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index bb91a35af3be..5f5ee43b0482 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -200,9 +200,12 @@ def _handle_single_tool(call: ast.Call) -> ToolCall: arguments = {} for keyword in call.keywords: arguments[keyword.arg] = _get_parameter_value(keyword.value) - return ToolCall(type="function", - function=FunctionCall(name=function_name, - arguments=json.dumps(arguments))) + return ToolCall( + type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments, + ensure_ascii=False)), + ) def _make_valid_python(text: str) -> Union[tuple[str, str], None]: From 20bd6f4d2ee10d6187d93e4bc6a64f78195243a2 Mon Sep 17 00:00:00 2001 From: Dhia Eddine Rhaiem <163106757+dhiaEddineRhaiem@users.noreply.github.com> Date: Thu, 22 May 2025 06:23:59 +0400 Subject: [PATCH 040/192] [FalconH1] Fix output dtype in RMSNorm fallback path for Falcon-H1 (e.g. 
0.5B) (#18500) Signed-off-by: dhia.rhaiem Co-authored-by: younesbelkada Co-authored-by: Ilyas Chahed Co-authored-by: Jingwei Zuo --- vllm/model_executor/layers/mamba/mamba_mixer2.py | 8 +++++--- vllm/model_executor/models/falcon_h1.py | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index d44d2c790198..f94ab75f9a4f 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -77,7 +77,7 @@ def forward_native( input_dtype = x.dtype x = x * nn.functional.silu(gate.to(torch.float32)) if not self.use_rms_norm: - return x + return x.to(input_dtype) if self.n_groups == 1: if self.tp_size > 1: @@ -117,9 +117,11 @@ def forward_cuda( x: torch.Tensor, gate: torch.Tensor, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: - + input_dtype = x.dtype if not self.use_rms_norm: - return x * nn.functional.silu(gate.to(torch.float32)) + # Keep gate in float32 for numerical stability during silu + return x * nn.functional.silu(gate.to( + torch.float32)).to(input_dtype) if self.tp_size > 1 or self.n_groups != 1: return self.forward_native(x, gate) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 591a75ffdb73..1c0e3911fcce 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -453,7 +453,6 @@ def forward( attn_metadata = get_forward_context().attn_metadata mamba2_metadata = prepare_mamba2_metadata( chunk_size=self.config.mamba_chunk_size, - input_ids=input_ids, attn_metadata=attn_metadata, ) if get_pp_group().is_first_rank: From 176d62e4ea6b71b4b8d8787ec544e7665fcce20b Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Thu, 22 May 2025 11:17:34 +0800 Subject: [PATCH 041/192] [MISC] update project urls in pyproject.toml (#18519) Signed-off-by: Andy Xie --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0b803a26b658..3011cffb8f1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,8 +35,8 @@ dynamic = [ "version", "dependencies", "optional-dependencies"] [project.urls] Homepage="https://github.com/vllm-project/vllm" -Documentation="https://vllm.readthedocs.io/en/latest/" -Slack="http://slack.vllm.ai/" +Documentation="https://docs.vllm.ai/en/latest/" +Slack="https://slack.vllm.ai/" [project.scripts] vllm = "vllm.entrypoints.cli.main:main" From 6e0fd34d3c46a65e0f0d14f472ec3e5da53b2411 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 21 May 2025 23:19:13 -0400 Subject: [PATCH 042/192] [CI] Fix race condition with StatelessProcessGroup.barrier (#18506) Signed-off-by: Russell Bryant --- tests/distributed/test_shm_broadcast.py | 10 +- .../device_communicators/shm_broadcast.py | 18 +- vllm/distributed/utils.py | 154 +++++++++++++++++- 3 files changed, 157 insertions(+), 25 deletions(-) diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index 711c2441f34b..f9eacc11d75f 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -9,7 +9,7 @@ from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.distributed.utils import StatelessProcessGroup -from vllm.utils import get_ip, get_open_port, update_environment_variables +from vllm.utils import get_open_port, update_environment_variables def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]: 
@@ -60,12 +60,12 @@ def worker_fn(): rank = dist.get_rank() if rank == 0: port = get_open_port() - ip = get_ip() + ip = '127.0.0.1' dist.broadcast_object_list([ip, port], src=0) else: recv = [None, None] dist.broadcast_object_list(recv, src=0) - ip, port = recv + ip, port = recv # type: ignore stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size()) @@ -107,10 +107,10 @@ def worker_fn(): if pg == dist.group.WORLD: dist.barrier() - print("torch distributed passed the test!") + print(f"torch distributed passed the test! Rank {rank}") else: pg.barrier() - print("StatelessProcessGroup passed the test!") + print(f"StatelessProcessGroup passed the test! Rank {rank}") def test_shm_broadcast(): diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index fa944407a703..40e57e6624d1 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import os import pickle -import sys import time from contextlib import contextmanager from dataclasses import dataclass, field @@ -19,7 +17,7 @@ from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore import vllm.envs as envs -from vllm.distributed.utils import StatelessProcessGroup +from vllm.distributed.utils import StatelessProcessGroup, sched_yield from vllm.logger import init_logger from vllm.utils import (get_ip, get_open_port, get_open_zmq_ipc_path, is_valid_ipv6_address) @@ -28,20 +26,6 @@ logger = init_logger(__name__) -# We prefer to use os.sched_yield as it results in tighter polling loops, -# measured to be around 3e-7 seconds. However on earlier versions of Python -# os.sched_yield() does not release the GIL, so we fall back to time.sleep(0) -USE_SCHED_YIELD = ((sys.version_info[:3] >= (3, 11, 1)) - or (sys.version_info[:2] == (3, 10) - and sys.version_info[2] >= 8)) - - -def sched_yield(): - if USE_SCHED_YIELD: - os.sched_yield() - else: - time.sleep(0) - class ShmRingBuffer: diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 6bb323d79d64..93a069d36c4b 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -6,9 +6,12 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. import dataclasses import datetime +import os import pickle import socket +import sys import time +import uuid from collections import deque from collections.abc import Sequence from typing import Any, Optional @@ -27,6 +30,20 @@ logger = init_logger(__name__) +# We prefer to use os.sched_yield as it results in tighter polling loops, +# measured to be around 3e-7 seconds. However on earlier versions of Python +# os.sched_yield() does not release the GIL, so we fall back to time.sleep(0) +USE_SCHED_YIELD = ((sys.version_info[:3] >= (3, 11, 1)) + or (sys.version_info[:2] == (3, 10) + and sys.version_info[2] >= 8)) + + +def sched_yield(): + if USE_SCHED_YIELD: + os.sched_yield() + else: + time.sleep(0) + def ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" @@ -212,10 +229,141 @@ def all_gather_obj(self, obj: Any) -> list[Any]: gathered_objs.append(recv_obj) return gathered_objs - def barrier(self): - """A barrier to synchronize all ranks.""" + def barrier(self, timeout: float = 30.0): + """A robust barrier to synchronize all ranks. + + + Uses a multi-phase approach to ensure all processes reach the barrier + before proceeding: + + 1. 
Each process signals it has reached the barrier + + 2. Each process signals that it has confirmed the arrival of all other + ranks. + + 3. Rank 0 waits for all other ranks to signal their departure to ensure + that all ranks have departed the barrier first. + + Args: + timeout: Maximum time in seconds to wait for each phase (in seconds) + + + Raises: + RuntimeError: If coordination fails or times out + """ + # Generate a barrier ID that is globally unique + try: + if self.rank == 0: + barrier_id = f"barrier_{uuid.uuid4()}" + self.broadcast_obj(barrier_id, src=0) + else: + barrier_id = self.broadcast_obj(None, src=0) + except Exception as e: + raise RuntimeError("Failed to broadcast barrier_id") from e + + # Phase 1: Signal arrival at barrier + # Wait for all processes to arrive + # We need all ranks to confirm the arrival of all other ranks. + # This is the key synchronization point. + arrival_key = f"arrival_{barrier_id}_{self.rank}" + try: + self.store.set(arrival_key, b"1") + except Exception as e: + raise RuntimeError("Failed to signal barrier arrival") from e + + start_time = time.time() + processes_arrived: set[int] = set() + + while len(processes_arrived) < self.world_size: + # Check for timeout + cur_time = time.time() + if cur_time - start_time > timeout: + raise RuntimeError("Barrier timed out after %f seconds", + timeout) + + # Check for each process + for i in range(self.world_size): + if i in processes_arrived: + continue + + key = f"arrival_{barrier_id}_{i}" + try: + # Try to get the key - if it exists, we'll get a value + # If it doesn't exist, it will throw an exception + self.store.get(key) + processes_arrived.add(i) + except KeyError: + # Key doesn't exist yet + pass + except Exception as check_e: + logger.debug("Error checking key existence: %s", check_e) + sched_yield() + + # Short sleep to avoid tight polling + if len(processes_arrived) < self.world_size: + sched_yield() + + # Phase 2: Signal departure from barrier + # We only care to block at this stage in rank 0, which runs the + # server side of the TCPStore. We want to make sure that all + # clients have departed the barrier before rank 0 in case the + # next thing after the barrier is a shutdown, including tearing + # down the TCPStore. Other ranks can exit the barrier immediately + # after signaling their departure. 
+ departure_key = f"departure_{barrier_id}_{self.rank}" + try: + self.store.set(departure_key, b"1") + except Exception as e: + raise RuntimeError("Failed to signal barrier departure") from e + + if self.rank != 0: + return + + # Make rank 0 wait for all processes to signal departure + start_time = time.time() + processes_departed: set[int] = set() + + while len(processes_departed) < self.world_size: + # Check for timeout + if time.time() - start_time > timeout: + raise RuntimeError("Barrier departure timed out after %f s", + timeout) + + # Check for each process + for i in range(self.world_size): + if i in processes_departed: + continue + + key = f"departure_{barrier_id}_{i}" + try: + # Try to get the key - if it exists, we'll get a value + # If it doesn't exist, it will throw an exception + self.store.get(key) + processes_departed.add(i) + except KeyError: + # Key doesn't exist yet + pass + except Exception as check_e: + logger.debug("Error checking key existence: %s", check_e) + sched_yield() + + # Short sleep to avoid tight polling + if len(processes_departed) < self.world_size: + sched_yield() + + # Clean up keys to avoid leaking memory in the store for i in range(self.world_size): - self.broadcast_obj(None, src=i) + try: + self.store.delete_key(f"arrival_{barrier_id}_{i}") + except Exception: + logger.debug("Error deleting key: %s", + f'arrival_{barrier_id}_{i}') + + try: + self.store.delete_key(f"departure_{barrier_id}_{i}") + except Exception: + logger.debug("Error deleting key: %s", + f'departure_{barrier_id}_{i}') @staticmethod def create( From acb54ca8e11a72592de963cf2f7477635349d526 Mon Sep 17 00:00:00 2001 From: Rabi Mishra Date: Thu, 22 May 2025 08:51:14 +0530 Subject: [PATCH 043/192] Intialize io_thread_pool attribute in the beginning. (#18331) Signed-off-by: rabi --- vllm/v1/executor/multiproc_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 74b226b45424..2061806e6b36 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -50,6 +50,7 @@ def _init_executor(self) -> None: self.is_failed = False self.shutdown_event = threading.Event() self.failure_callback: Optional[FailureCallback] = None + self.io_thread_pool: Optional[ThreadPoolExecutor] = None self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -107,7 +108,6 @@ def _init_executor(self) -> None: # For pipeline parallel, we use a thread pool for asynchronous # execute_model. 
- self.io_thread_pool: Optional[ThreadPoolExecutor] = None if self.max_concurrent_batches > 1: # Note: must use only 1 IO thread to keep dequeue sequence # from the response queue From d022115cc603ed5985996f85460c3be5c7ae749e Mon Sep 17 00:00:00 2001 From: youngrok cha Date: Thu, 22 May 2025 12:21:47 +0900 Subject: [PATCH 044/192] [Bugfix] Inconsistent token calculation compared to HF in llava family (#18479) Signed-off-by: jaycha --- vllm/model_executor/models/llava_next.py | 6 ++++-- vllm/model_executor/models/llava_onevision.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e731f1bfdb9a..581a32325d4c 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -135,11 +135,13 @@ def _get_num_unpadded_features( current_aspect_ratio = current_width / current_height if aspect_ratio > current_aspect_ratio: - new_height = (original_height * current_width) // original_width + new_height = int( + round(original_height * (current_width / original_width), 7)) padding = (current_height - new_height) // 2 current_height = current_height - (2 * padding) else: - new_width = (original_width * current_height) // original_height + new_width = int( + round(original_width * (current_height / original_height), 7)) padding = (current_width - new_width) // 2 current_width = current_width - (2 * padding) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 49f1ecb4be89..7ea759fd59b8 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -116,11 +116,13 @@ def _get_num_unpadded_features( current_aspect_ratio = current_width / current_height if aspect_ratio > current_aspect_ratio: - new_height = (original_height * current_width) // original_width + new_height = int( + round(original_height * (current_width / original_width), 7)) padding = (current_height - new_height) // 2 current_height = current_height - (2 * padding) else: - new_width = (original_width * current_height) // original_height + new_width = int( + round(original_width * (current_height / original_height), 7)) padding = (current_width - new_width) // 2 current_width = current_width - (2 * padding) From cf5984b2fe33e0ec56eca4c83c440d03943750e6 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 21 May 2025 20:25:25 -0700 Subject: [PATCH 045/192] [BugFix][DP] Send DP wave completion only from `dp_rank==0` (#18502) Signed-off-by: Nick Hill Co-authored-by: kourosh hakhamaneshi --- vllm/v1/engine/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2234b069621d..64e472457ee3 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -701,7 +701,7 @@ def _init_data_parallel(self, vllm_config: VllmConfig): for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size)) - self.local_dp_rank = local_dp_rank + self.dp_rank = dp_rank self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() self.current_wave = 0 @@ -774,7 +774,7 @@ def run_busy_loop(self): local_unfinished_reqs) if not self.engines_running: - if self.local_dp_rank == 0: + if self.dp_rank == 0: # Notify client that we are pausing the loop. 
logger.debug("Wave %d finished, pausing engine loop.", self.current_wave) From 51797775c3ffc1277be750eb046c8030f8eca280 Mon Sep 17 00:00:00 2001 From: Shane A Date: Wed, 21 May 2025 21:17:03 -0700 Subject: [PATCH 046/192] [Bugfix][Model] Make Olmo2Model weight loading return loaded weights (#18504) Signed-off-by: Shane A --- vllm/model_executor/models/olmo2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 0a1fb10c186e..33adacdae5f5 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -314,7 +314,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -325,6 +326,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() for name, loaded_weight in weights: if is_pp_missing_parameter(name, self): continue @@ -347,6 +349,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class Olmo2ForCausalLM(nn.Module, SupportsPP): From db5a29ba19fcd51a7517af8dc49824ee3db7b8d0 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 22 May 2025 12:48:53 +0800 Subject: [PATCH 047/192] [Bugfix] Fix LoRA test (#18518) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_functions.py | 2 +- tests/v1/sample/test_topk_topp_sampler.py | 136 ++++++++++++---------- 2 files changed, 73 insertions(+), 65 deletions(-) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 204624a0540a..7ae33a848a0a 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -69,7 +69,7 @@ def run_check(fn, args, expected: list): run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11]) run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11]) - # Remove all LoRAs + # Remove all LoRAs. run_check(llm.remove_lora, 13, [12, 10, 11]) run_check(llm.remove_lora, 12, [10, 11]) run_check(llm.remove_lora, 11, [10]) diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py index a8a713d446b7..220f05c7ff1c 100644 --- a/tests/v1/sample/test_topk_topp_sampler.py +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -16,31 +16,40 @@ FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available +@pytest.fixture(autouse=True) +def reset_default_device(): + """ + Explicitly set the default device, which can affect subsequent tests. + Adding this fixture helps avoid this problem. + """ + original_device = torch.get_default_device() + yield + torch.set_default_device(original_device) + + def test_topk_impl_equivalance(): - with torch.device(DEVICE): - generator = Generator(device=DEVICE).manual_seed(33) + torch.set_default_device(DEVICE) + generator = Generator(device=DEVICE).manual_seed(33) - logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) + logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) - # Random top-k values between 1 and 9. 
- k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator) + # Random top-k values between 1 and 9. + k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator) - # Set k=vocab_size for ~50% of requests in the batch (top-k disabled). - k.masked_fill_( - torch.randint(0, - 2, (BATCH_SIZE, ), - generator=generator, - dtype=bool), VOCAB_SIZE) + # Set k=vocab_size for ~50% of requests in the batch (top-k disabled). + k.masked_fill_( + torch.randint(0, 2, (BATCH_SIZE, ), generator=generator, dtype=bool), + VOCAB_SIZE) - # Top-k only implementation - result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None) + # Top-k only implementation + result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None) - # Top-p + top-k - no_op_top_p = torch.tensor([1.0]) - result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p) + # Top-p + top-k + no_op_top_p = torch.tensor([1.0]) + result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p) - assert torch.allclose(result1, result2) + assert torch.allclose(result1, result2) def test_flashinfer_sampler(): @@ -58,50 +67,49 @@ def test_flashinfer_sampler(): pytest.skip( "FlashInfer not installed or not available on this platform.") - with torch.device(DEVICE): - generator = Generator(device=DEVICE).manual_seed(42) - - # Generate random logits - logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) - - # Generate various top-k and top-p values - k_values = torch.randint(1, 1000, (BATCH_SIZE, ), generator=generator) - p_values = torch.rand( - (BATCH_SIZE, ), - generator=generator) * 0.5 + 0.5 # range in [0.5, 1.0] - - # Sometimes disable top-k (k=vocab_size) - k_values.masked_fill_( - torch.randint(0, - 2, (BATCH_SIZE, ), - generator=generator, - dtype=torch.bool), VOCAB_SIZE) - - # Sometimes disable top-p (p=1.0) - p_values.masked_fill_( - torch.randint(0, - 2, (BATCH_SIZE, ), - generator=generator, - dtype=torch.bool), 1.0) - - python_logits = apply_top_k_top_p( - logits=logits.clone(), - k=k_values, - p=p_values, - ) - python_probs = torch.softmax(python_logits, dim=-1) - - # FlashInfer only exposed renorm interfaces for probs so convert first - flashinfer_probs = torch.softmax(logits.clone(), dim=-1) - flashinfer_probs = top_k_renorm_probs( - probs=flashinfer_probs, - top_k=k_values, - ) - flashinfer_probs = top_p_renorm_probs( - probs=flashinfer_probs, - top_p=p_values, - ) - - # Compare the results - assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), \ - "FlashInfer and Python sampling implementations do not match!" 
+ torch.set_default_device(DEVICE) + generator = Generator(device=DEVICE).manual_seed(42) + + # Generate random logits + logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) + + # Generate various top-k and top-p values + k_values = torch.randint(1, 1000, (BATCH_SIZE, ), generator=generator) + p_values = torch.rand( + (BATCH_SIZE, ), generator=generator) * 0.5 + 0.5 # range in [0.5, 1.0] + + # Sometimes disable top-k (k=vocab_size) + k_values.masked_fill_( + torch.randint(0, + 2, (BATCH_SIZE, ), + generator=generator, + dtype=torch.bool), VOCAB_SIZE) + + # Sometimes disable top-p (p=1.0) + p_values.masked_fill_( + torch.randint(0, + 2, (BATCH_SIZE, ), + generator=generator, + dtype=torch.bool), 1.0) + + python_logits = apply_top_k_top_p( + logits=logits.clone(), + k=k_values, + p=p_values, + ) + python_probs = torch.softmax(python_logits, dim=-1) + + # FlashInfer only exposed renorm interfaces for probs so convert first + flashinfer_probs = torch.softmax(logits.clone(), dim=-1) + flashinfer_probs = top_k_renorm_probs( + probs=flashinfer_probs, + top_k=k_values, + ) + flashinfer_probs = top_p_renorm_probs( + probs=flashinfer_probs, + top_p=p_values, + ) + + # Compare the results + assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), \ + "FlashInfer and Python sampling implementations do not match!" From 23b67b37b246fb7cecb3815c6873fa3d18c3c0e7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 22 May 2025 15:11:46 +0800 Subject: [PATCH 048/192] [Doc] Fix invalid JSON in example args (#18527) Signed-off-by: DarkLight1337 --- docs/source/design/v1/torch_compile.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/source/design/v1/torch_compile.md b/docs/source/design/v1/torch_compile.md index 4d8ce0fd9227..64b6f0cc0a9b 100644 --- a/docs/source/design/v1/torch_compile.md +++ b/docs/source/design/v1/torch_compile.md @@ -99,7 +99,9 @@ This time, Inductor compilation is completely bypassed, and we will load from di The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: -`vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` +``` +vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}' +``` Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. @@ -134,12 +136,14 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: -`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` +``` +vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' +``` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. ### Full Cudagraph capture -It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. 
This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config "{'full_cuda_graph': True}"` +It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config '{"full_cuda_graph": true}'`. Currently only FlashAttention 3 is compatible, and only when cascade attention is disabled. From e2d7d312441a54228b8b77f9bbc2b7bd522062e7 Mon Sep 17 00:00:00 2001 From: Satyajith Chilappagari Date: Thu, 22 May 2025 02:17:34 -0700 Subject: [PATCH 049/192] [Neuron] Update Dockerfile.neuron to use latest neuron release (2.23) (#18512) Signed-off-by: Satyajith Chilappagari --- docker/Dockerfile.neuron | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron index 2b63fe301bac..259dc5a23f78 100644 --- a/docker/Dockerfile.neuron +++ b/docker/Dockerfile.neuron @@ -1,6 +1,6 @@ # default base image # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04" +ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04" FROM $BASE_IMAGE @@ -22,8 +22,7 @@ WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity -RUN python3 -m pip install sentencepiece transformers==4.48.0 -U -RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install pytest # uninstall transformers-neuronx package explicitly to avoid version conflict @@ -49,6 +48,8 @@ RUN python3 -m pip install -e tests/vllm_test_utils # FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps +RUN python3 -m pip install sentencepiece transformers==4.48.0 -U + # overwrite entrypoint to run bash script RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py From ebed81fbf5e4549b26b6baf74cea0fe4551dd915 Mon Sep 17 00:00:00 2001 From: aws-elaineyz Date: Thu, 22 May 2025 02:18:55 -0700 Subject: [PATCH 050/192] Update default neuron config for speculation (#18274) Signed-off-by: Elaine Zhao Co-authored-by: Shashwat Srijan Co-authored-by: Aakash Shetty --- vllm/model_executor/model_loader/neuronx_distributed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index b98cea7fe6e1..3a4d93c8c13f 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -502,7 +502,7 @@ def _get_default_neuron_config(model_config: ModelConfig, max_context_length=scheduler_config.max_model_len, seq_len=scheduler_config.max_model_len, enable_bucketing=True, - is_continuous_batching=(batch_size > 1), + is_continuous_batching=True, quantized=False, torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], padding_side="right", @@ -520,6 +520,7 
@@ def _get_default_speculation_config(model_config: ModelConfig, args.""" neuron_config = dict( tp_degree=parallel_config.tensor_parallel_size, + ctx_batch_size=1, batch_size=scheduler_config.max_num_seqs, max_context_length=scheduler_config.max_model_len, seq_len=scheduler_config.max_model_len, @@ -527,6 +528,7 @@ def _get_default_speculation_config(model_config: ModelConfig, trace_tokengen_model=False, enable_fused_speculation=True, enable_bucketing=True, + is_continuous_batching=True, quantized=False, torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], on_device_sampling_config=dict( From fa72f9a8126051abb9d00144f11aeb7615f36d21 Mon Sep 17 00:00:00 2001 From: aws-elaineyz Date: Thu, 22 May 2025 02:20:36 -0700 Subject: [PATCH 051/192] Order sequence ids + config update to support specifying custom quantization layers (#18279) Signed-off-by: Elaine Zhao Co-authored-by: Tailin Pan Co-authored-by: Rishabh Rajesh Co-authored-by: Yishan McNabb Co-authored-by: Patrick Lange Co-authored-by: Maxwell Goldberg Co-authored-by: Aakash Shetty --- tests/neuron/2_core/test_mistral.py | 40 ++++++++++++++--- .../model_loader/neuronx_distributed.py | 43 ++++++++++++++++--- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py index 8acd082f2ded..cc3b53a9d7c9 100644 --- a/tests/neuron/2_core/test_mistral.py +++ b/tests/neuron/2_core/test_mistral.py @@ -7,7 +7,7 @@ def test_mistral(): llm = LLM(model="mistralai/Mistral-7B-v0.1", tensor_parallel_size=2, max_num_seqs=4, - max_model_len=512, + max_model_len=128, use_v2_block_manager=True, override_neuron_config={ "sequence_parallel_enabled": False, @@ -15,16 +15,46 @@ def test_mistral(): }, device="neuron") + # Send more prompts than the compiled batch size (4) and request + # varying generation lengths to test accuracy related to Neuron + # specific sequence id sorting. prompts = [ "The president of the United States is", "The capital of France is", + "What is Annapurna labs?", + "I believe the meaning of life is", + "Tell me a story about a brave knight", + "Hello, my name is Llama", ] - outputs = llm.generate(prompts, SamplingParams(top_k=1)) + + sampling_params = [ + SamplingParams(top_k=1, max_tokens=10), + SamplingParams(top_k=1, max_tokens=20), + SamplingParams(top_k=1, max_tokens=30), + SamplingParams(top_k=1, max_tokens=40), + SamplingParams(top_k=1, max_tokens=50), + SamplingParams(top_k=1, max_tokens=60) + ] + + outputs = llm.generate(prompts, sampling_params) expected_outputs = [ - " the most powerful person in the world. He is the head of state " - "and head", - " a city of many faces. It is a city of history, culture, art" + " the most powerful person in the world. He is", + " a city of many faces. It is a city of history, culture, art, " + "fashion, and", + "\n\nAnnapurna Labs is a semiconductor company that was founded " + "in 2013 by Amazon. The company is", + " to be happy.\n\nI believe that happiness is a choice.\n\nI " + "believe that happiness is a state of mind.\n\nI believe that " + "happiness is a journey.\n\nI believe", + " who rescued a princess from a dragon.\n\nTell me a story about" + " a princess who rescued herself from a dragon.\n\nTell me a " + "story about a princess who rescued herself from a dragon and " + "then rescued a knight from", + " and I am a 10 year old male. I am a very friendly and " + "affectionate boy who loves to be around people. I am a very " + "active boy who loves to play and run around. 
I am a very smart " + "boy who loves to learn new things. I am a very loyal boy" ] for expected_output, output in zip(expected_outputs, outputs): diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index 3a4d93c8c13f..557feea46a90 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -87,16 +87,29 @@ def forward( input_block_ids: torch.Tensor, sampling_params: torch.Tensor, ) -> torch.Tensor: + # sort block ids sequentially for perf/neuron support reasons + sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) + input_ids = torch.index_select(input_ids, 0, sorted_indices) + positions = torch.index_select(positions, 0, sorted_indices) + sampling_params = torch.index_select(sampling_params, 0, + sorted_indices) + output = self.model(input_ids, attention_mask=None, position_ids=positions, - seq_ids=input_block_ids, + seq_ids=sorted_input_block_ids, sampling_params=sampling_params) # on-device sampling if self.config.neuron_config.on_device_sampling_config: - return output.hidden_states + output = output.hidden_states else: - return output.logits[:, -1, :] + output = output.logits[:, -1, :] + + restored_indices = torch.argsort(sorted_indices) + if input_block_ids.shape[0] != 1: + output = torch.index_select(output, 0, restored_indices) + + return output def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: @@ -340,14 +353,26 @@ def forward( input_block_ids: torch.Tensor, sampling_params: torch.Tensor, ) -> torch.Tensor: + # sort block ids sequentially for perf/neuron support reasons + sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) + input_ids = torch.index_select(input_ids, 0, sorted_indices) + positions = torch.index_select(positions, 0, sorted_indices) + sampling_params = torch.index_select(sampling_params, 0, + sorted_indices) + output = self.model(input_ids, attention_mask=None, position_ids=positions, - seq_ids=input_block_ids, + seq_ids=sorted_input_block_ids, sampling_params=sampling_params) + restored_indices = torch.argsort(sorted_indices) + # CTX encoding if (positions[:, 0]).sum().item() == 0: - return output.fused_outputs[0][:, 0:1] + output = output.fused_outputs[0][:, 0:1] + if input_block_ids.shape[0] != 1: + output = torch.index_select(output, 0, restored_indices) + return output # Fused Spec (Generation) accepted_tokens_with_padding = output.fused_outputs[0] @@ -362,6 +387,10 @@ def forward( -1) >= generated_token_counts accepted_tokens_with_padding[mask] = -1 + if input_block_ids.shape[0] != 1: + accepted_tokens_with_padding = torch.index_select( + accepted_tokens_with_padding, 0, restored_indices) + return accepted_tokens_with_padding def sample( @@ -416,6 +445,10 @@ def load_weights(self, model_name_or_path: str, draft_neuron_config.speculation_length = 0 draft_neuron_config.trace_tokengen_model = True draft_neuron_config.enable_fused_speculation = False + if getattr(config.neuron_config, "draft_model_modules_to_not_convert", + None): + draft_neuron_config.modules_to_not_convert = ( + draft_neuron_config.draft_model_modules_to_not_convert) if config.neuron_config.enable_eagle_speculation: draft_neuron_config.is_eagle_draft = True draft_neuron_config.sequence_parallel_enabled = False From f6037d1907851323735fd181a0b4d3e7e79229a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=87=83?= Date: Thu, 22 May 2025 20:22:53 +0800 Subject: [PATCH 
052/192] [Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18526) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 松灵 Co-authored-by: Cyrus Leung Co-authored-by: DarkLight1337 --- vllm/worker/model_runner.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 12025617e512..53e79adf9aae 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -729,7 +729,10 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( seq_group_metadata, range(positions[0], positions[0] + len(positions))) - if not mm_kwargs: + + # M-RoPE requires mrope_positions even for plain text; return early + # when mm_kwargs is empty only if inter_data.is_prompt is False. + if not mm_kwargs and not inter_data.is_prompt: return inter_data.multi_modal_kwargs = mm_kwargs @@ -741,12 +744,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, video_grid_thw = mm_kwargs.get("video_grid_thw", None) audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", None) - assert ( - image_grid_thw is not None or video_grid_thw is not None - or audio_feature_lengths is not None), ( - "mrope embedding type requires multi-modal input mapper " - "returns 'image_grid_thw' or 'video_grid_thw' or " - "'audio_feature_lengths'.") second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) From a35a494745727179bee77980e31709f101b0bf58 Mon Sep 17 00:00:00 2001 From: lkchen Date: Thu, 22 May 2025 05:24:43 -0700 Subject: [PATCH 053/192] [Bugfix] Add kwargs to RequestOutput __init__ to be forward compatible (#18513) Signed-off-by: Linkun --- .buildkite/test-pipeline.yaml | 1 + tests/test_outputs.py | 14 ++++++++++++++ vllm/outputs.py | 9 +++++++++ 3 files changed, 24 insertions(+) create mode 100644 tests/test_outputs.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 29796184106d..4c2da527de79 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -59,6 +59,7 @@ steps: - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py - pytest -v -s multimodal - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker diff --git a/tests/test_outputs.py b/tests/test_outputs.py new file mode 100644 index 000000000000..c41bd6723ba1 --- /dev/null +++ b/tests/test_outputs.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm.outputs import RequestOutput + + +def test_request_output_forward_compatible(): + output = RequestOutput(request_id="test_request_id", + prompt="test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[], + finished=False, + example_arg_added_in_new_version="some_value") + assert output is not None diff --git a/vllm/outputs.py b/vllm/outputs.py index 6cd60575b00d..05026b569691 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -9,12 +9,15 @@ import torch from typing_extensions import TypeVar, deprecated +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, 
RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) +logger = init_logger(__name__) + @dataclass class CompletionOutput: @@ -122,7 +125,13 @@ def __init__( *, multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None, kv_transfer_params: Optional[dict[str, Any]] = None, + # Forward compatibility, code that uses args added in new release can + # still run with older versions of vLLM without breaking. + **kwargs: Any, ) -> None: + if kwargs: + logger.warning_once("RequestOutput: Ignoring extra arguments: %s", + str(kwargs)) self.request_id = request_id self.prompt = prompt self.prompt_token_ids = prompt_token_ids From ca86a7cf6edbc5baa6a3974d92c2e6eda4fce695 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 22 May 2025 15:01:07 +0200 Subject: [PATCH 054/192] [CI/Build] Update bamba test model location (#18544) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/language/generation/test_hybrid.py | 2 +- tests/models/registry.py | 2 +- tests/v1/test_oracle.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 9b7a42acece5..604cb854b32f 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -31,7 +31,7 @@ # not compatible with pip-compile. "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", - "hmellor/bamba-tiny-random", + "hmellor/tiny-random-BambaForCausalLM", ] # Avoid OOM diff --git a/tests/models/registry.py b/tests/models/registry.py index a92dee3b642d..911a58e99d4c 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -124,7 +124,7 @@ def check_available_online( "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B", - extras={"tiny": "hmellor/bamba-tiny-random"}), # noqa: E501 + extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b", diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index c34c673e985e..1b77417a1bd3 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -12,7 +12,7 @@ "openai/whisper-large-v3", # transcription "facebook/bart-large-cnn", # encoder decoder "mistralai/Mamba-Codestral-7B-v0.1", # mamba - "hmellor/bamba-tiny-random", # hybrid + "hmellor/tiny-random-BambaForCausalLM", # hybrid "BAAI/bge-m3", # embedding ] From 71075029f214bd4db409ba553cf083a883fdd61f Mon Sep 17 00:00:00 2001 From: CYJiang <86391540+googs1025@users.noreply.github.com> Date: Thu, 22 May 2025 21:20:17 +0800 Subject: [PATCH 055/192] [Doc] Support --stream arg in openai_completion_client.py script (#18388) Signed-off-by: googs1025 --- ...enai_chat_completion_structured_outputs.py | 7 +++++-- ...etion_structured_outputs_structural_tag.py | 7 +++++-- .../openai_completion_client.py | 20 ++++++++++++++----- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 660369e55d40..722d747a69bf 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ 
b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -12,6 +12,9 @@ from openai import BadRequestError, OpenAI from pydantic import BaseModel +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + # Guided decoding by Choice (list of possible options) def guided_choice_completion(client: OpenAI, model: str): @@ -134,8 +137,8 @@ def extra_backend_options_completion(client: OpenAI, model: str): def main(): client: OpenAI = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", + base_url=openai_api_base, + api_key=openai_api_key, ) model = client.models.list().data[0].id diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py index 42aa12c451c0..08f939942508 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py @@ -7,11 +7,14 @@ # to enforce the format of a tool call response, but it could be used for # any structured output within a subset of the response. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + def main(): client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", + base_url=openai_api_base, + api_key=openai_api_key, ) messages = [{ diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 6ab7619bff19..77f721921da2 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import argparse + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. @@ -7,7 +9,15 @@ openai_api_base = "http://localhost:8000/v1" -def main(): +def parse_args(): + parser = argparse.ArgumentParser(description="Client for vLLM API server") + parser.add_argument("--stream", + action="store_true", + help="Enable streaming response") + return parser.parse_args() + + +def main(args): client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, @@ -18,18 +28,17 @@ def main(): model = models.data[0].id # Completion API - stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, n=2, - stream=stream, + stream=args.stream, logprobs=3) print("-" * 50) print("Completion results:") - if stream: + if args.stream: for c in completion: print(c) else: @@ -38,4 +47,5 @@ def main(): if __name__ == "__main__": - main() + args = parse_args() + main(args) From 4e04eceb58288310932d4abfbb417f1415d05caf Mon Sep 17 00:00:00 2001 From: Bowen Wang Date: Thu, 22 May 2025 06:48:56 -0700 Subject: [PATCH 056/192] [Bugfix] Use random hidden states in dummy sampler run (#18543) Signed-off-by: Bowen Wang --- vllm/v1/worker/gpu_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e26f97d816ae..759d69293a32 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1721,6 +1721,10 @@ def _dummy_sampler_run( self, hidden_states: torch.Tensor, ) -> torch.Tensor: + # The dummy hidden states may contain special values, + # like `inf` or `nan`. + # To avoid breaking the sampler, we use a random tensor here instead. 
+ hidden_states = torch.rand_like(hidden_states) logits = self.model.compute_logits(hidden_states, None) num_reqs = logits.size(0) From 3f505233fdd25e4026bbc1824eaa7a19aa782d80 Mon Sep 17 00:00:00 2001 From: Calvin Chen <45745657+calvin0327@users.noreply.github.com> Date: Thu, 22 May 2025 22:07:10 +0800 Subject: [PATCH 057/192] [Doc] Add stream flag for chat completion example (#18524) Signed-off-by: calvin chen <120380290@qq.com> --- .../openai_chat_completion_client.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index 74e0c045d621..bf99777d5697 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -3,6 +3,9 @@ NOTE: start a supported chat completion model server with `vllm serve`, e.g. vllm serve meta-llama/Llama-2-7b-chat-hf """ + +import argparse + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. @@ -24,7 +27,15 @@ }] -def main(): +def parse_args(): + parser = argparse.ArgumentParser(description="Client for vLLM API server") + parser.add_argument("--stream", + action="store_true", + help="Enable streaming response") + return parser.parse_args() + + +def main(args): client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, @@ -34,16 +45,23 @@ def main(): models = client.models.list() model = models.data[0].id + # Chat Completion API chat_completion = client.chat.completions.create( messages=messages, model=model, + stream=args.stream, ) print("-" * 50) print("Chat completion results:") - print(chat_completion) + if args.stream: + for c in chat_completion: + print(c) + else: + print(chat_completion) print("-" * 50) if __name__ == "__main__": - main() + args = parse_args() + main(args) From 93f71673ce1a6cd4ac6217c6ca8f7a74c920bcc0 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 22 May 2025 22:35:00 +0800 Subject: [PATCH 058/192] [BugFix][CPU] Fix x86 SHM distributed module initialization (#18536) Signed-off-by: jiang.li --- vllm/distributed/device_communicators/cpu_communicator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py index d4b34900b951..c04218cb9f39 100644 --- a/vllm/distributed/device_communicators/cpu_communicator.py +++ b/vllm/distributed/device_communicators/cpu_communicator.py @@ -22,8 +22,10 @@ def __init__(self, super().__init__(cpu_group, device, device_group, unique_name) self.dist_module = torch.distributed - if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) \ - and hasattr(torch.ops._C, "init_shm_manager"): + if (current_platform.get_cpu_architecture() + == CpuArchEnum.X86) and hasattr( + torch.ops._C, + "init_shm_manager") and unique_name.startswith("tp"): self.dist_module = _CPUSHMDistributed(self) def all_reduce(self, input_): @@ -96,6 +98,8 @@ class _CPUSHMDistributed: def __init__(self, communicator: CpuCommunicator): instance_identifier = os.environ["VLLM_DIST_IDENT"] + unique_name = communicator.unique_name + instance_identifier = f"{instance_identifier}-{unique_name}" self.communicator = communicator group_ranks = [str(rank) for rank in self.communicator.ranks] From cb506ecb5afa58670c6105b589696e6e176f60aa Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> 
Date: Thu, 22 May 2025 22:50:46 +0800 Subject: [PATCH 059/192] [Misc] improve Automatic Prefix Caching example (#18554) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- .../features/automatic_prefix_caching.md | 76 +------------- .../automatic_prefix_caching.py | 98 +++++++++++++++++++ 2 files changed, 99 insertions(+), 75 deletions(-) create mode 100644 examples/offline_inference/automatic_prefix_caching.py diff --git a/docs/source/features/automatic_prefix_caching.md b/docs/source/features/automatic_prefix_caching.md index 59016d7fcf6b..5c5b37c2a071 100644 --- a/docs/source/features/automatic_prefix_caching.md +++ b/docs/source/features/automatic_prefix_caching.md @@ -14,81 +14,7 @@ Technical details on how vLLM implements APC can be found [here](#design-automat Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: -```python -import time -from vllm import LLM, SamplingParams - - -# A prompt containing a large markdown table. The table is randomly generated by GPT-4. -LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ -| ID | Name | Age | Occupation | Country | Email | Phone Number | Address | -|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| -| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | -| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | -| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | -| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | -| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | -| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | -| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | -| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | -| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | -| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| -| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | -| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | -| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | -| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | -| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | -| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | -| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | -| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | -| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | -| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | -| 21 | Steve Green | 44 | Lawyer | UK | 
steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | -| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | -| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| -| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | -| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | -| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | -| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | -| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | -| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | -| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | -""" - - -def get_generation_time(llm, sampling_params, prompts): - # time the generation - start_time = time.time() - output = llm.generate(prompts, sampling_params=sampling_params) - end_time = time.time() - # print the output and generation time - print(f"Output: {output[0].outputs[0].text}") - print(f"Generation time: {end_time - start_time} seconds.") - - -# set enable_prefix_caching=True to enable APC -llm = LLM( - model='lmsys/longchat-13b-16k', - enable_prefix_caching=True -) - -sampling_params = SamplingParams(temperature=0, max_tokens=100) - -# Querying the age of John Doe -get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", -) - -# Querying the age of Zack Blue -# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. -get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", -) -``` + ## Example workloads diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/offline_inference/automatic_prefix_caching.py new file mode 100644 index 000000000000..6d05d0b99d80 --- /dev/null +++ b/examples/offline_inference/automatic_prefix_caching.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Demonstration script for Automatic Prefix Caching (APC) in vLLM. + +Automatic Prefix Caching (APC) allows the vLLM engine to reuse cached +KV (key-value) pairs from previous prompts if a new query shares the same +prefix. This reduces redundant computation and improves inference speed. + +To enable APC, set `enable_prefix_caching=True` when initializing the +vLLM engine. + +This script uses a long Markdown table as the shared prompt prefix and +compares the generation time for two queries that share the same prefix +but ask different questions. + +Run: +python examples/offline_inference/automatic_prefix_caching.py +""" +import time + +from vllm import LLM, SamplingParams + +# ruff: noqa: E501 +# A prompt containing a large markdown table. The table is randomly generated by GPT-4. +LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. 
Here is a table as follows.\n# Table\n" + """ +| ID | Name | Age | Occupation | Country | Email | Phone Number | Address | +|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| +| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | +| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | +| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | +| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | +| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | +| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | +| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | +| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | +| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | +| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| +| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | +| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | +| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | +| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | +| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | +| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | +| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | +| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | +| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | +| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | +| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | +| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | +| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| +| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | +| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | +| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | +| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | +| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | +| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | +| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | +""" + + +def get_generation_time(llm, sampling_params, prompts): + # time the generation + start_time = 
time.time() + output = llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + # print the output and generation time + print("-" * 30) + print(f"Output: {output[0].outputs[0].text}") + print(f"Generation time: {end_time - start_time} seconds.") + print("-" * 30) + + +def main(): + # set enable_prefix_caching=True to enable APC + llm = LLM(model='lmsys/longchat-13b-16k', enable_prefix_caching=True) + + sampling_params = SamplingParams(temperature=0, max_tokens=100) + + # Querying the age of John Doe + get_generation_time( + llm, + sampling_params, + LONG_PROMPT + + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", + ) + + # Querying the age of Zack Blue + # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. + get_generation_time( + llm, + sampling_params, + LONG_PROMPT + + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", + ) + + +if __name__ == "__main__": + main() From 54631f826233dbd1c046f9a70e98bc2e25edff1a Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 22 May 2025 17:00:13 +0100 Subject: [PATCH 060/192] [Misc] Call `ndarray.tobytes()` directly instead of `ndarray.data.tobytes()` (#18347) Signed-off-by: Lukas Geiger --- vllm/multimodal/hasher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 53e289370a9f..f6ab72f4e9b8 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -43,7 +43,7 @@ def serialize_item(cls, obj: object) -> bytes: "ndarray", { "dtype": obj.dtype.str, "shape": obj.shape, - "data": obj.data.tobytes(), + "data": obj.tobytes(), }) logger.warning( From 1f3a1200e425faaa93b333111ee1d2480cc5d397 Mon Sep 17 00:00:00 2001 From: David Xia Date: Thu, 22 May 2025 14:34:06 -0400 Subject: [PATCH 061/192] [Bugfix] make `test_openai_schema.py` pass (#18224) Signed-off-by: David Xia Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 2 +- .../entrypoints/openai/test_openai_schema.py | 57 ++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4c2da527de79..0e4a0e2a531b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -126,7 +126,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 5c585d54c429..cae2a3b59553 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 +from 
typing import Final + import pytest import schemathesis +from hypothesis import settings from schemathesis import GenerationConfig from ...utils import RemoteOpenAIServer @@ -9,6 +12,8 @@ MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct" MAXIMUM_IMAGES = 2 +DEFAULT_TIMEOUT_SECONDS: Final[int] = 10 +LONG_TIMEOUT_SECONDS: Final[int] = 60 @pytest.fixture(scope="module") @@ -42,8 +47,58 @@ def get_schema(server): schema = schemathesis.from_pytest_fixture("get_schema") +@schemathesis.hook +def before_generate_case(context: schemathesis.hooks.HookContext, strategy): + op = context.operation + assert op is not None + + def no_file_type(case: schemathesis.models.Case): + """ + This filter skips test cases for the `POST /tokenize` endpoint where the + HTTP request body uses `"type": "file"` in any message's content. + We expect these cases to fail because that type isn't implemented here + https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095 + + Example test cases that are skipped: + curl -X POST -H 'Content-Type: application/json' \ + -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ + http://localhost:8000/tokenize + + curl -X POST -H 'Content-Type: application/json' \ + -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ + http://localhost:8000/tokenize + """ # noqa: E501 + if (op.method.lower() == "post" and op.path == "/tokenize" + and hasattr(case, "body") and isinstance(case.body, dict) + and "messages" in case.body + and isinstance(case.body["messages"], list) + and len(case.body["messages"]) > 0): + for message in case.body["messages"]: + if not isinstance(message, dict): + continue + content = message.get("content", []) + if not isinstance(content, list) or len(content) == 0: + continue + if any(item.get("type") == "file" for item in content): + return False + return True + + return strategy.filter(no_file_type) + + @schema.parametrize() @schema.override(headers={"Content-Type": "application/json"}) +@settings(deadline=LONG_TIMEOUT_SECONDS * 1000) def test_openapi_stateless(case: schemathesis.Case): + key = ( + case.operation.method.upper(), + case.operation.path, + ) + timeout = { + # requires a longer timeout + ("POST", "/v1/chat/completions"): + LONG_TIMEOUT_SECONDS, + }.get(key, DEFAULT_TIMEOUT_SECONDS) + #No need to verify SSL certificate for localhost - case.call_and_validate(verify=False) + case.call_and_validate(verify=False, timeout=timeout) From 721fb9b1818ef23c15fd176c7ea49285de544021 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 23 May 2025 03:11:28 +0800 Subject: [PATCH 062/192] [Platform] Move platform check to right place (#18470) Signed-off-by: wangxiyuan --- vllm/config.py | 35 ++++++++++------------------------- vllm/platforms/cpu.py | 11 +++++++++++ vllm/platforms/hpu.py | 11 +++++++++++ vllm/platforms/neuron.py | 11 +++++++++++ vllm/platforms/tpu.py | 11 +++++++++++ vllm/platforms/xpu.py | 11 +++++++++++ vllm/utils.py | 6 ++++++ 7 files changed, 71 insertions(+), 25 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 3fa1db0e8390..1c916915a046 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -42,7 +42,10 @@ try_get_generation_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect -from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, +from vllm.utils import 
(DEFAULT_MAX_NUM_BATCHED_TOKENS, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, + LayerBlockType, cuda_device_count_stateless, get_cpu_memory, get_open_port, is_torch_equal_or_newer, random_uuid, resolve_obj_by_qualname) @@ -64,12 +67,6 @@ ConfigT = TypeVar("ConfigT", bound=ConfigType) -# This value is chosen to have a balance between ITL and TTFT. Note it is -# not optimized for throughput. -_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 -_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 -_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 - TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"] @@ -2074,28 +2071,28 @@ def __post_init__(self) -> None: # so we don't reject sequences on account of a short # max_num_batched_tokens. self.max_num_batched_tokens = max( - self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS) + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) else: self.max_num_batched_tokens = ( - _DEFAULT_MAX_NUM_BATCHED_TOKENS) + DEFAULT_MAX_NUM_BATCHED_TOKENS) else: # If max_model_len is too short, use - # _DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value + # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value # for higher throughput. self.max_num_batched_tokens = max( - self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS) + self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) if self.runner_type == "pooling": # Choose specific value for higher throughput self.max_num_batched_tokens = max( self.max_num_batched_tokens, - _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, + POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, ) if self.is_multimodal_model: # The value needs to be at least the number of multimodal tokens self.max_num_batched_tokens = max( self.max_num_batched_tokens, - _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, + MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, ) # When using default settings, @@ -4316,18 +4313,6 @@ def __post_init__(self): "full_cuda_graph is not supported with " "cascade attention. 
Disabling cascade attention.") self.model_config.disable_cascade_attn = True - - if self.model_config and self.model_config.use_mla and \ - not (current_platform.is_cuda() or current_platform.is_rocm()): - logger.info( - "MLA is enabled on a non-GPU platform; forcing chunked " - "prefill and prefix caching to be disabled.") - self.scheduler_config.enable_chunked_prefill = False - self.scheduler_config.chunked_prefill_enabled = False - self.scheduler_config.max_num_batched_tokens = max( - self.scheduler_config.max_model_len, - _DEFAULT_MAX_NUM_BATCHED_TOKENS) - if self.cache_config is not None: self.cache_config.enable_prefix_caching = False diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 2d48af397636..5c0c90972b58 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -9,6 +9,7 @@ import torch from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import CpuArchEnum, Platform, PlatformEnum, _Backend @@ -177,6 +178,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.") os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on CPU.") diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 456b054b2b43..6f7c5a6d3cae 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -7,6 +7,7 @@ from vllm import envs from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum, _Backend @@ -80,6 +81,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on HPU.") diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index e08337b8391d..9cd49fd34804 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -6,6 +6,7 @@ from vllm import envs from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum @@ -56,6 +57,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: vllm_config.cache_config.block_size = \ vllm_config.model_config.max_model_len # type: ignore + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be 
disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on Neuron.") diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 6c573c1b3635..0173b15697cf 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -9,6 +9,7 @@ from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger from vllm.sampling_params import SamplingParams, SamplingType +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import Platform, PlatformEnum, _Backend @@ -161,6 +162,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "Forcing --disable_chunked_mm_input.") scheduler_config.disable_chunked_mm_input = True + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on TPU.") diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 225e756cd7ce..785fb6ce1b79 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -5,6 +5,7 @@ import torch from vllm.logger import init_logger +from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -113,6 +114,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "ray" + if vllm_config.model_config and vllm_config.model_config.use_mla: + logger.info( + "MLA is enabled on a non-GPU platform; forcing chunked " + "prefill and prefix caching to be disabled.") + vllm_config.scheduler_config.enable_chunked_prefill = False + vllm_config.scheduler_config.chunked_prefill_enabled = False + vllm_config.scheduler_config.max_num_batched_tokens = max( + vllm_config.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod def is_pin_memory_available(cls): logger.warning("Pin memory is not supported on XPU.") diff --git a/vllm/utils.py b/vllm/utils.py index 0cd90c130d3e..bfc01972bbd2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -77,6 +77,12 @@ logger = init_logger(__name__) +# This value is chosen to have a balance between ITL and TTFT. Note it is +# not optimized for throughput. 
+DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 +POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 +MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 + # Exception strings for non-implemented encoder/decoder scenarios # Reminder: Please update docs/source/features/compatibility_matrix.md From f8d2cc5f553e74a6eb8b15cd29da16bddb019af1 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 23 May 2025 03:11:53 +0800 Subject: [PATCH 063/192] [Compile][Platform] Make PiecewiseBackend pluggable and extendable (#18076) Signed-off-by: Mengqing Cao Co-authored-by: youkaichao --- vllm/compilation/backends.py | 206 +------------------- vllm/compilation/base_piecewise_backend.py | 71 +++++++ vllm/compilation/cuda_piecewise_backend.py | 213 +++++++++++++++++++++ vllm/platforms/cuda.py | 4 + vllm/platforms/interface.py | 7 + vllm/platforms/rocm.py | 4 + 6 files changed, 305 insertions(+), 200 deletions(-) create mode 100644 vllm/compilation/base_piecewise_backend.py create mode 100644 vllm/compilation/cuda_piecewise_backend.py diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 0c1381a565c1..8114cddcd9fa 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -6,9 +6,7 @@ import pprint import time from collections.abc import Sequence -from contextlib import ExitStack from typing import Any, Callable, Optional -from unittest.mock import patch import torch import torch.fx as fx @@ -16,13 +14,13 @@ import vllm.envs as envs from vllm.config import CompilationConfig, VllmConfig from vllm.logger import init_logger -from vllm.utils import weak_ref_tensors +from vllm.platforms import current_platform +from vllm.utils import resolve_obj_by_qualname from .compiler_interface import (CompilerInterface, EagerAdaptor, InductorAdaptor, InductorStandaloneAdaptor) from .counter import compilation_counter from .inductor_pass import InductorPass -from .monitor import end_monitoring_torch_compile from .pass_manager import PostGradPassManager logger = init_logger(__name__) @@ -297,7 +295,9 @@ def call_module(self, target: torch.fx.node.Target, num_graphs=len(self.compile_submod_names), runtime_shape=None) - self.module.__dict__[target] = PiecewiseBackend( + piecewise_backend = resolve_obj_by_qualname( + current_platform.get_piecewise_backend_cls()) + self.module.__dict__[target] = piecewise_backend( submod, self.vllm_config, self.graph_pool, index, len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_general_shape, self.vllm_backend) @@ -341,7 +341,7 @@ def __init__( ): global global_graph_pool if global_graph_pool is None: - global_graph_pool = torch.cuda.graph_pool_handle() + global_graph_pool = current_platform.graph_pool_handle() # TODO: in the future, if we want to use multiple # streams, it might not be safe to share a global pool. 
@@ -558,197 +558,3 @@ def copy_and_call(*args): return self.split_gm(*list_args) return copy_and_call - - -@dataclasses.dataclass -class ConcreteSizeEntry: - runtime_shape: int - need_to_compile: bool # the size is in compile_sizes - use_cudagraph: bool # the size is in cudagraph_capture_sizes - - compiled: bool = False - runnable: Callable = None # type: ignore - num_finished_warmup: int = 0 - cudagraph: Optional[torch.cuda.CUDAGraph] = None - output: Optional[Any] = None - - # for cudagraph debugging, track the input addresses - # during capture, and check if they are the same during replay - input_addresses: Optional[list[int]] = None - - -class PiecewiseBackend: - - def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, - graph_pool: Any, piecewise_compile_index: int, - total_piecewise_compiles: int, sym_shape_indices: list[int], - compiled_graph_for_general_shape: Callable, - vllm_backend: VllmBackend): - """ - The backend for piecewise compilation. - It mainly handles the compilation and cudagraph capturing. - - We will compile `self.graph` once for the general shape, - and then compile for different shapes specified in - `compilation_config.compile_sizes`. - - Independently, we will capture cudagraph for different shapes. - - If a shape needs both compilation and cudagraph, we will - compile it first, and then capture cudagraph. - """ - self.graph = graph - self.vllm_config = vllm_config - self.compilation_config = vllm_config.compilation_config - self.graph_pool = graph_pool - self.piecewise_compile_index = piecewise_compile_index - self.total_piecewise_compiles = total_piecewise_compiles - self.vllm_backend = vllm_backend - - self.is_first_graph = piecewise_compile_index == 0 - self.is_last_graph = ( - piecewise_compile_index == total_piecewise_compiles - 1) - - self.compile_sizes: set[int] = set( - self.compilation_config.compile_sizes) - self.cudagraph_capture_sizes: set[int] = set( - self.compilation_config.cudagraph_capture_sizes - ) if self.compilation_config.use_cudagraph else set() - - self.first_run_finished = False - - self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa - - self.sym_shape_indices = sym_shape_indices - - self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" - - # the entries for different shapes that we need to either - # compile or capture cudagraph - self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} - - # to_be_compiled_sizes tracks the remaining sizes to compile, - # and updates during the compilation process, so we need to copy it - self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() - for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): - self.concrete_size_entries[shape] = ConcreteSizeEntry( - runtime_shape=shape, - need_to_compile=shape in self.compile_sizes, - use_cudagraph=shape in self.cudagraph_capture_sizes, - ) - - def check_for_ending_compilation(self): - if self.is_last_graph and not self.to_be_compiled_sizes: - # no specific sizes to compile - # save the hash of the inductor graph for the next run - self.vllm_backend.compiler_manager.save_to_file() - end_monitoring_torch_compile(self.vllm_config) - - def __call__(self, *args) -> Any: - if not self.first_run_finished: - self.first_run_finished = True - self.check_for_ending_compilation() - return self.compiled_graph_for_general_shape(*args) - - runtime_shape = args[self.sym_shape_indices[0]] - if runtime_shape not in self.concrete_size_entries: - # we don't need to do anything for this shape - return 
self.compiled_graph_for_general_shape(*args) - - entry = self.concrete_size_entries[runtime_shape] - - if entry.runnable is None: - entry.runnable = self.compiled_graph_for_general_shape - - if entry.need_to_compile and not entry.compiled: - entry.compiled = True - self.to_be_compiled_sizes.remove(runtime_shape) - # args are real arguments - entry.runnable = self.vllm_backend.compiler_manager.compile( - self.graph, - args, - self.compilation_config.inductor_compile_config, - self.compilation_config, - graph_index=self.piecewise_compile_index, - num_graphs=self.total_piecewise_compiles, - runtime_shape=runtime_shape) - - # finished compilations for all required shapes - if self.is_last_graph and not self.to_be_compiled_sizes: - self.check_for_ending_compilation() - - if not entry.use_cudagraph: - return entry.runnable(*args) - - if entry.cudagraph is None: - if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa - entry.num_finished_warmup += 1 - if self.is_first_graph: - logger.debug( - "Warming up %s/%s for shape %s", - entry.num_finished_warmup, - self.compilation_config.cudagraph_num_of_warmups, - runtime_shape) - return entry.runnable(*args) - - if self.is_first_graph: - # Since we capture cudagraph for many different shapes and - # capturing is fast, we don't need to log it for every shape. - # We only log it in the debug mode. - logger.debug("Capturing a cudagraph for shape %s", - runtime_shape) - - input_addresses = [ - x.data_ptr() for x in args if isinstance(x, torch.Tensor) - ] - entry.input_addresses = input_addresses - cudagraph = torch.cuda.CUDAGraph() - - with ExitStack() as stack: - if not self.is_first_graph: - # during every model forward, we will capture - # many pieces of cudagraphs (roughly one per layer). - # running gc again and again across layers will - # make the cudagraph capture very slow. - # therefore, we only run gc for the first graph, - # and disable gc for the rest of the graphs. - stack.enter_context(patch("gc.collect", lambda: None)) - stack.enter_context( - patch("torch.cuda.empty_cache", lambda: None)) - - # mind-exploding: carefully manage the reference and memory. - with torch.cuda.graph(cudagraph, pool=self.graph_pool): - # `output` is managed by pytorch's cudagraph pool - output = entry.runnable(*args) - if self.is_last_graph: - # by converting it to weak ref, - # the original `output` will immediately be released - # to save memory. It is only safe to do this for - # the last graph, because the output of the last graph - # will not be used by any other cuda graph. - output = weak_ref_tensors(output) - - # here we always use weak ref for the output - # to save memory - entry.output = weak_ref_tensors(output) - entry.cudagraph = cudagraph - - compilation_counter.num_cudagraph_caputured += 1 - - # important: we need to return the output, rather than - # the weak ref of the output, so that pytorch can correctly - # manage the memory during cuda graph capture - return output - - if self.is_debugging_mode: - # check if the input addresses are the same - new_input_addresses = [ - x.data_ptr() for x in args if isinstance(x, torch.Tensor) - ] - assert new_input_addresses == entry.input_addresses, ( - "Input addresses for cudagraphs are different during replay." 
- f" Expected {entry.input_addresses}, got {new_input_addresses}" - ) - - entry.cudagraph.replay() - return entry.output diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py new file mode 100644 index 000000000000..84d1e1f77739 --- /dev/null +++ b/vllm/compilation/base_piecewise_backend.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable, Protocol + +import torch.fx as fx + +from vllm.compilation.backends import VllmBackend +from vllm.config import VllmConfig + + +class AbstractPiecewiseBackend(Protocol): + """ + PiecewiseBackend interface that allows platforms to extend + piecewise static graph. + """ + + def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: list[int], + compiled_graph_for_general_shape: Callable, + vllm_backend: VllmBackend, **kwargs): + """ + Initializes the PiecewiseBackend class with compilation and + execution-related configurations. + + This class handles piecewise compilation, graph capturing, + and dispatching for specific input shapes. + + Args: + graph (fx.GraphModule): The graph represented in fx. + vllm_config (VllmConfig): Global configuration for vLLM. + graph_pool (Any): + Graph memory pool handle, e.g., + `torch.cuda.graph_pool_handle()`. + piecewise_compile_index (int): + Index of the current piecewise subgraph. + total_piecewise_compiles (int): + Total number of piecewise-compiled graphs. + sym_shape_indices (list[int]): + Indices of symbolic shape. + compiled_graph_for_general_shape (Callable): + Callable that executes the graph compiled for general shapes. + vllm_backend (VllmBackend): + Backend compiler that manages compilation and graph runtime + for vLLM. + + Keyword Args: + kwargs: Additional keyword arguments reserved for future + extensions or custom platforms. + """ + raise NotImplementedError + + def __call__(self, *args) -> Any: + """Executes the compiled graph for given input args. + + If this is the first invocation, executes the general compiled graph + and initiates the compilation process tracking. For subsequent calls, + dynamically dispatches execution to either a compiled graph or a static + graph based on the input shape. + + Args: + *args: Variable length input arguments to be passed into the + graph. The symbolic shape is expected to be in position + `sym_shape_indices[0]`. + + Returns: + Any: Output of the executed graph. This can be from the general + compiled graph, a specialized compiled version for the given shape, + or a replayed static graph. 
+ """ + raise NotImplementedError diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py new file mode 100644 index 000000000000..0ad480e28cd7 --- /dev/null +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -0,0 +1,213 @@ +# SPDX-License-Identifier: Apache-2.0 + +import dataclasses +from contextlib import ExitStack +from typing import Any, Callable, Optional +from unittest.mock import patch + +import torch +import torch.fx as fx + +import vllm.envs as envs +from vllm.compilation.backends import VllmBackend +from vllm.compilation.counter import compilation_counter +from vllm.compilation.monitor import end_monitoring_torch_compile +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.utils import weak_ref_tensors + +logger = init_logger(__name__) + + +@dataclasses.dataclass +class ConcreteSizeEntry: + runtime_shape: int + need_to_compile: bool # the size is in compile_sizes + use_cudagraph: bool # the size is in cudagraph_capture_sizes + + compiled: bool = False + runnable: Callable = None # type: ignore + num_finished_warmup: int = 0 + cudagraph: Optional[torch.cuda.CUDAGraph] = None + output: Optional[Any] = None + + # for cudagraph debugging, track the input addresses + # during capture, and check if they are the same during replay + input_addresses: Optional[list[int]] = None + + +class CUDAPiecewiseBackend: + + def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, + graph_pool: Any, piecewise_compile_index: int, + total_piecewise_compiles: int, sym_shape_indices: list[int], + compiled_graph_for_general_shape: Callable, + vllm_backend: VllmBackend): + """ + The backend for piecewise compilation. + It mainly handles the compilation and cudagraph capturing. + + We will compile `self.graph` once for the general shape, + and then compile for different shapes specified in + `compilation_config.compile_sizes`. + + Independently, we will capture cudagraph for different shapes. + + If a shape needs both compilation and cudagraph, we will + compile it first, and then capture cudagraph. 
+ """ + self.graph = graph + self.vllm_config = vllm_config + self.compilation_config = vllm_config.compilation_config + self.graph_pool = graph_pool + self.piecewise_compile_index = piecewise_compile_index + self.total_piecewise_compiles = total_piecewise_compiles + self.vllm_backend = vllm_backend + + self.is_first_graph = piecewise_compile_index == 0 + self.is_last_graph = ( + piecewise_compile_index == total_piecewise_compiles - 1) + + self.compile_sizes: set[int] = set( + self.compilation_config.compile_sizes) + self.cudagraph_capture_sizes: set[int] = set( + self.compilation_config.cudagraph_capture_sizes + ) if self.compilation_config.use_cudagraph else set() + + self.first_run_finished = False + + self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa + + self.sym_shape_indices = sym_shape_indices + + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + + # the entries for different shapes that we need to either + # compile or capture cudagraph + self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {} + + # to_be_compiled_sizes tracks the remaining sizes to compile, + # and updates during the compilation process, so we need to copy it + self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy() + for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): + self.concrete_size_entries[shape] = ConcreteSizeEntry( + runtime_shape=shape, + need_to_compile=shape in self.compile_sizes, + use_cudagraph=shape in self.cudagraph_capture_sizes, + ) + + def check_for_ending_compilation(self): + if self.is_last_graph and not self.to_be_compiled_sizes: + # no specific sizes to compile + # save the hash of the inductor graph for the next run + self.vllm_backend.compiler_manager.save_to_file() + end_monitoring_torch_compile(self.vllm_config) + + def __call__(self, *args) -> Any: + if not self.first_run_finished: + self.first_run_finished = True + self.check_for_ending_compilation() + return self.compiled_graph_for_general_shape(*args) + + runtime_shape = args[self.sym_shape_indices[0]] + if runtime_shape not in self.concrete_size_entries: + # we don't need to do anything for this shape + return self.compiled_graph_for_general_shape(*args) + + entry = self.concrete_size_entries[runtime_shape] + + if entry.runnable is None: + entry.runnable = self.compiled_graph_for_general_shape + + if entry.need_to_compile and not entry.compiled: + entry.compiled = True + self.to_be_compiled_sizes.remove(runtime_shape) + # args are real arguments + entry.runnable = self.vllm_backend.compiler_manager.compile( + self.graph, + args, + self.compilation_config.inductor_compile_config, + self.compilation_config, + graph_index=self.piecewise_compile_index, + num_graphs=self.total_piecewise_compiles, + runtime_shape=runtime_shape) + + # finished compilations for all required shapes + if self.is_last_graph and not self.to_be_compiled_sizes: + self.check_for_ending_compilation() + + if not entry.use_cudagraph: + return entry.runnable(*args) + + if entry.cudagraph is None: + if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups: # noqa + entry.num_finished_warmup += 1 + if self.is_first_graph: + logger.debug( + "Warming up %s/%s for shape %s", + entry.num_finished_warmup, + self.compilation_config.cudagraph_num_of_warmups, + runtime_shape) + return entry.runnable(*args) + + if self.is_first_graph: + # Since we capture cudagraph for many different shapes and + # capturing is fast, we don't need to log it for every shape. 
+ # We only log it in the debug mode. + logger.debug("Capturing a cudagraph for shape %s", + runtime_shape) + + input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + entry.input_addresses = input_addresses + cudagraph = torch.cuda.CUDAGraph() + + with ExitStack() as stack: + if not self.is_first_graph: + # during every model forward, we will capture + # many pieces of cudagraphs (roughly one per layer). + # running gc again and again across layers will + # make the cudagraph capture very slow. + # therefore, we only run gc for the first graph, + # and disable gc for the rest of the graphs. + stack.enter_context(patch("gc.collect", lambda: None)) + stack.enter_context( + patch("torch.cuda.empty_cache", lambda: None)) + + # mind-exploding: carefully manage the reference and memory. + with torch.cuda.graph(cudagraph, pool=self.graph_pool): + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. + output = weak_ref_tensors(output) + + # here we always use weak ref for the output + # to save memory + entry.output = weak_ref_tensors(output) + entry.cudagraph = cudagraph + + compilation_counter.num_cudagraph_caputured += 1 + + # important: we need to return the output, rather than + # the weak ref of the output, so that pytorch can correctly + # manage the memory during cuda graph capture + return output + + if self.is_debugging_mode: + # check if the input addresses are the same + new_input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + assert new_input_addresses == entry.input_addresses, ( + "Input addresses for cudagraphs are different during replay." + f" Expected {entry.input_addresses}, got {new_input_addresses}" + ) + + entry.cudagraph.replay() + return entry.output diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index bdee8b2f821d..0bdf15959302 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -311,6 +311,10 @@ def supports_v1(cls, model_config: "ModelConfig") -> bool: def use_custom_allreduce(cls) -> bool: return True + @classmethod + def get_piecewise_backend_cls(cls) -> str: + return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index b09e31e9ed46..20284b4e1801 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -478,6 +478,13 @@ def get_cu_count(cls, device_id: int = 0) -> int: """ raise NotImplementedError + @classmethod + def get_piecewise_backend_cls(cls) -> str: + """ + Get piecewise backend class for piecewise graph. 
+        """
+        return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 3c73843c3416..1685c65ad0b9 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -382,3 +382,7 @@ def get_cu_count(cls, device_id: int = 0) -> int:
     @classmethod
     def is_navi(cls) -> bool:
         return 'gfx1' in torch.cuda.get_device_properties(0).gcnArchName
+
+    @classmethod
+    def get_piecewise_backend_cls(cls) -> str:
+        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa

From 6e588da0f4b90e695a20779c3d5a079e56ad3a7b Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Thu, 22 May 2025 15:13:54 -0400
Subject: [PATCH 064/192] [Build/CI] Fix CUDA 11.8 build (#17679)

Signed-off-by: Tyler Michael Smith
Signed-off-by: Lucas Wilkinson
Signed-off-by: Tyler Michael Smith
Co-authored-by: Lucas Wilkinson
---
 CMakeLists.txt                                |  6 ++-
 csrc/moe/moe_ops.h                            |  4 +-
 csrc/moe/moe_permute_unpermute_op.cu          | 43 ++++++++++++++++++-
 .../moe_permute_unpermute_kernel.cu           | 10 +++--
 csrc/moe/torch_bindings.cpp                   |  4 +-
 .../cutlass_w8a8/scaled_mm_entry.cu           |  2 +-
 docker/Dockerfile                             | 16 ++++---
 .../kernels/moe/test_moe_permute_unpermute.py |  4 +-
 .../layers/fused_moe/moe_permute_unpermute.py |  4 ++
 9 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6c54be9530b..ffb801d62619 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,11 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+else()
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+endif()
 
 # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 0bae119a7c46..8fda434d452f 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -28,4 +28,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor num_tokens_post_pad, int64_t top_k, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit); -#endif \ No newline at end of file +#endif + +bool moe_permute_unpermute_supported(); \ No newline at end of file diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 76d5f0eab021..9a7465261abf 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -5,6 +5,9 @@ #include "permute_unpermute_kernels/dispatch.h" #include "core/registration.h" +// moe_permute kernels require at least CUDA 12.0 +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + void moe_permute( const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& topk_weights, //[n_token, topk] @@ -127,7 +130,45 @@ void moe_unpermute( }); } +#else + +void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, + torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indicies, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); +} + +void moe_unpermute(const torch::Tensor& input, + const torch::Tensor& topk_weights, torch::Tensor& topk_ids, + const torch::Tensor& token_expert_indicies, + const std::optional& expert_map, + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& permuted_input, + torch::Tensor& expert_first_token_offset, + torch::Tensor& src_row_id2dst_row_id_map, + torch::Tensor& m_indices) { + TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); +} + +#endif + +bool moe_permute_unpermute_supported() { +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + return true; +#else + return false; +#endif +} + TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} \ No newline at end of file +} diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu index aa353d0f0437..de2c153882d9 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -1,6 +1,9 @@ #include "moe_permute_unpermute_kernel.h" +// moe_permute kernels require at least CUDA 12.0 +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000) + // CubKeyValueSorter definition begin CubKeyValueSorter::CubKeyValueSorter() : num_experts_(0), num_bits_(sizeof(int) * 8) {} @@ -131,9 +134,6 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size, int num_experts) { auto tidx = threadIdx.x; auto bidx = blockIdx.x; - auto lidx = tidx & 31; - auto widx = tidx >> 5; - auto warp_count = (blockDim.x + 31) >> 5; auto offset = bidx * blockDim.x; auto bound = min(offset + blockDim.x, size); extern __shared__ int smem_expert_map[]; @@ -226,4 +226,6 @@ void 
getMIndices(int64_t* expert_first_token_offset, expert_first_token_offset, align_expert_first_token_offset, m_indices, num_local_expert, align_block_size); } -} \ No newline at end of file +} + +#endif diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 05f515e2e783..7d35ec79ead4 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -77,7 +77,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " "expert_first_token_offset, int n_expert, int n_local_expert,int " "topk, Tensor! hidden_states)->()"); - // conditionally compiled so impl registration is in source file + + m.def("moe_permute_unpermute_supported() -> bool"); + m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); #endif } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 3c258ddce61e..e9b408fbf2ee 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -123,7 +123,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { } bool cutlass_group_gemm_supported(int64_t cuda_device_capability) { - // CUTLASS groped FP8 kernels need at least CUDA 12.3 + // CUTLASS grouped FP8 kernels need at least CUDA 12.3 // and SM90 (Hopper) #if defined CUDA_VERSION diff --git a/docker/Dockerfile b/docker/Dockerfile index a35056f78587..cc3499d1f0a9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -263,8 +263,11 @@ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \ else \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \ - fi && \ - export FLASHINFER_ENABLE_AOT=1; \ + fi; \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + if [ "$CUDA_MAJOR" -lt 12 ]; then \ + export FLASHINFER_ENABLE_SM90=0; \ + fi; \ uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \ fi COPY examples examples @@ -275,7 +278,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ uv pip list -# Although we build Flashinfer with AOT mode, there's still +# Even when we build Flashinfer with AOT mode, there's still # some issues w.r.t. JIT compilation. Therefore we need to # install build dependencies for JIT compilation. 
# TODO: Remove this once FlashInfer AOT wheel is fixed @@ -303,8 +306,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" # install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/dev.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + if [ "$CUDA_MAJOR" -ge 12 ]; then \ + uv pip install --system -r requirements/dev.txt; \ + fi # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index dfcd61f77587..10e6ac64df87 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.layer import determine_expert_map from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( - moe_permute, moe_unpermute) + moe_permute, moe_permute_unpermute_supported, moe_unpermute) from vllm.platforms import current_platform NUM_EXPERTS = [16, 64] @@ -167,6 +167,8 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor, def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, n_expert: int, ep_size: int, dtype: torch.dtype, align_block_size: Optional[int]): + if not moe_permute_unpermute_supported(): + pytest.skip("moe_permute_unpermute is not supported on this platform.") fill_invalid_expert = 0 ep_rank = np.random.randint(0, ep_size) expert_map = None diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index 270e7cf1298a..cb396f26c96e 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -182,3 +182,7 @@ def moe_unpermute( expert_first_token_offset, n_expert, n_local_expert, topk, hidden_states) return hidden_states + + +def moe_permute_unpermute_supported(): + return torch.ops._moe_C.moe_permute_unpermute_supported() From 7b9d832c80042dc5fae8658ea754379414f19de6 Mon Sep 17 00:00:00 2001 From: lkchen Date: Thu, 22 May 2025 14:33:16 -0700 Subject: [PATCH 065/192] [Tool] Add NIXL installation script (#18172) Signed-off-by: Linkun --- tools/install_nixl.sh | 109 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 tools/install_nixl.sh diff --git a/tools/install_nixl.sh b/tools/install_nixl.sh new file mode 100644 index 000000000000..56717cfb77f7 --- /dev/null +++ b/tools/install_nixl.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# Usage: ./install_nixl.sh [--force] + +FORCE=false +if [ "$1" == "--force" ]; then + FORCE=true +fi + +SUDO=false +if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then + SUDO=true +fi + +ARCH=$(uname -m) + +ROOT_DIR="/usr/local" +mkdir -p "$ROOT_DIR" +GDR_HOME="$ROOT_DIR/gdrcopy" +UCX_HOME="$ROOT_DIR/ucx" +NIXL_HOME="$ROOT_DIR/nixl" +CUDA_HOME=/usr/local/cuda + +export PATH="$GDR_HOME/bin:$UCX_HOME/bin:$NIXL_HOME/bin:$PATH" +export LD_LIBRARY_PATH="$GDR_HOME/lib:$UCX_HOME/lib:$NIXL_HOME/lib/$ARCH-linux-gnu:$LD_LIBRARY_PATH" + +TEMP_DIR="nixl_installer" +mkdir -p "$TEMP_DIR" +cd "$TEMP_DIR" + +pip install meson ninja pybind11 + +if [ ! 
-e "/dev/gdrdrv" ] || [ "$FORCE" = true ]; then + echo "Installing gdrcopy\n" + wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz + tar xzf v2.5.tar.gz; rm v2.5.tar.gz + cd gdrcopy-2.5 + make prefix=$GDR_HOME CUDA=$CUDA_HOME all install + + if $SUDO; then + echo "Running insmod.sh with sudo" + sudo ./insmod.sh + else + echo "Skipping insmod.sh - sudo not available" + echo "Please run 'sudo ./gdrcopy-2.5/insmod.sh' manually if needed" + fi + + cd .. +else + echo "Found /dev/gdrdrv. Skipping gdrcopy installation" +fi + +if ! command -v ucx_info &> /dev/null || [ "$FORCE" = true ]; then + echo "Installing UCX" + wget https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz + tar xzf ucx-1.18.0.tar.gz; rm ucx-1.18.0.tar.gz + cd ucx-1.18.0 + + # Checking Mellanox NICs + MLX_OPTS="" + if lspci | grep -i mellanox > /dev/null || command -v ibstat > /dev/null; then + echo "Mellanox NIC detected, adding Mellanox-specific options" + MLX_OPTS="--with-rdmacm \ + --with-mlx5-dv \ + --with-ib-hw-tm" + fi + + ./configure --prefix=$UCX_HOME \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=$CUDA_HOME \ + --with-dm \ + --with-gdrcopy=$GDR_HOME \ + --with-verbs \ + --enable-mt \ + $MLX_OPTS + make -j + make -j install-strip + + if $SUDO; then + echo "Running ldconfig with sudo" + sudo ldconfig + else + echo "Skipping ldconfig - sudo not available" + echo "Please run 'sudo ldconfig' manually if needed" + fi + + cd .. +else + echo "Found existing UCX. Skipping UCX installation" +fi + +if ! command -v nixl_test &> /dev/null || [ "$FORCE" = true ]; then + echo "Installing NIXL" + wget https://github.com/ai-dynamo/nixl/archive/refs/tags/0.2.0.tar.gz + tar xzf 0.2.0.tar.gz; rm 0.2.0.tar.gz + cd nixl-0.2.0 + meson setup build --prefix=$NIXL_HOME -Ducx_path=$UCX_HOME + cd build + ninja + ninja install + + cd ../.. +else + echo "Found existing NIXL. Skipping NIXL installation" +fi From a04720bc36401d831cb048c3917b9e58173d9c1d Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Thu, 22 May 2025 18:17:33 -0400 Subject: [PATCH 066/192] [V1][Spec Decode][Bugfix] Load quantize weights for EAGLE (#18290) --- vllm/transformers_utils/configs/eagle.py | 6 ++++-- vllm/v1/spec_decode/eagle.py | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 586d5c7f5e54..377523efefc3 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -52,13 +52,15 @@ def __init__(self, assert self.model is not None, \ "model should not be None when method is eagle" kwargs["architectures"] = [ - f"Eagle{arch}" for arch in self.model.architectures + f"Eagle{arch}" if not arch.startswith("Eagle") \ + else arch for arch in self.model.architectures ] elif method == "eagle3": assert self.model is not None, \ "model should not be None when method is eagle3" kwargs["architectures"] = [ - f"Eagle3{arch}" for arch in self.model.architectures + f"Eagle3{arch}" if not arch.startswith("Eagle3") \ + else arch for arch in self.model.architectures ] else: raise ValueError(f"Invalid method {method}. 
\ diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5b84bc1f5ec3..19fb2a2af7dd 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -9,7 +9,8 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model_loader -from vllm.model_executor.model_loader.utils import set_default_torch_dtype +from vllm.model_executor.model_loader.utils import ( + process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.triton_utils import tl, triton @@ -308,6 +309,9 @@ def load_model(self, target_model: nn.Module) -> None: loaded_weights = self.model.load_weights( loader.get_all_weights(draft_model_config, self.model)) + process_weights_after_loading(self.model, draft_model_config, + target_device) + # share embed_tokens with the target model if needed if get_pp_group().world_size == 1: assert "model.embed_tokens.weight" not in loaded_weights, \ From c91fe7b1b9c4398c6d4c980fc480ada0da8a0b23 Mon Sep 17 00:00:00 2001 From: Kai Wu Date: Thu, 22 May 2025 16:44:08 -0700 Subject: [PATCH 067/192] [Frontend][Bug Fix] Update llama4 pythonic jinja template and llama4_pythonic parser (#17917) Signed-off-by: Kai Wu --- docs/source/features/tool_calling.md | 11 +- .../tool_chat_template_llama4_pythonic.jinja | 100 +++--- .../test_llama4_pythonic_tool_parser.py | 193 +++++++++++ tests/tool_use/utils.py | 2 +- .../openai/tool_parsers/__init__.py | 4 +- .../llama4_pythonic_tool_parser.py | 303 ++++++++++++++++++ 6 files changed, 541 insertions(+), 72 deletions(-) create mode 100644 tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 2795b769345e..f76128406bfd 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -158,13 +158,13 @@ All Llama 3.1, 3.2 and 4 models should be supported. * `meta-llama/Llama-3.2-*` * `meta-llama/Llama-4-*` -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for llama 4 models, it is recommended to use the `llama4_pythonic` tool parser. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: -1. Parallel tool calls are not supported. +1. Parallel tool calls are not supported for llama 3, but it is supported in llama 4 models. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -177,11 +177,10 @@ images. 
Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}` -VLLM also provides a JSON based chat template for Llama 4: -* - this is based on the "official" chat template for the Llama 4 -models, but tweaked so that it works better with vLLM. +VLLM also provides a pythonic and JSON based chat template for Llama 4, but pythonic tool calling is recommended: +* - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models. -For Llama 4 use `--tool-call-parser llama4_json examples/tool_chat_template_llama4_json.jinja`. +For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`. #### IBM Granite diff --git a/examples/tool_chat_template_llama4_pythonic.jinja b/examples/tool_chat_template_llama4_pythonic.jinja index bd18a35bdda9..bbed3d8205e0 100644 --- a/examples/tool_chat_template_llama4_pythonic.jinja +++ b/examples/tool_chat_template_llama4_pythonic.jinja @@ -1,16 +1,17 @@ {{- bos_token }} -{%- if custom_tools is defined %} +{%- if custom_tools is defined and custom_tools%} {%- set tools = custom_tools %} {%- endif %} -{%- if not tools_in_user_message is defined %} - {%- set tools_in_user_message = false %} -{%- endif %} -{%- if not tools is defined %} +{%- if tools is defined and tools %} + {%- set tool_definition = tool_definition ~ (tools | tojson(indent=4)) %} +{%- else %} {%- set tools = none %} {%- endif %} + {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} + {%- set user_provided_system_message = true %} {%- if messages[0]['content'] is string %} {%- set system_message = messages[0]['content']|trim %} {%- else %} @@ -18,68 +19,33 @@ {%- endif %} {%- set messages = messages[1:] %} {%- else %} - {%- if tools is not none %} - {#- Add default tool system message when tools are provided #} - {%- set system_message = "You are a helpful assistant with tool calling " - "capabilities. Only reply with a tool call if the function exists in the " - "library provided by the user. If it doesn't exist, just reply directly in " - "natural language. When you receive a tool call response, use the output to " - "format an answer to the original user question." %} + {%- if tools is not none %} + {#- Since not system_message was provided by user, if tool is provided, system_message is now default tool system message #} + {#- This system message is from llama website:https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/ #} + {%- set system_message = "You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:\n\n1. 
FUNCTION CALLS:\n- ONLY use functions that are EXPLICITLY listed in the function list below\n- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If a function is not in the list, respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)\n- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\nExamples:\nCORRECT: [get_weather(location=\"Vancouver\"), calculate_route(start=\"Boston\", end=\"New York\")] <- Only if get_weather and calculate_route are in function list\nINCORRECT: get_weather(location=\"New York\")\nINCORRECT: Let me check the weather: [get_weather(location=\"New York\")]\nINCORRECT: [get_events(location=\"Singapore\")] <- If function not in list\n\n2. RESPONSE RULES:\n- For pure function requests matching a listed function: ONLY output the function call(s)\n- For knowledge questions: ONLY output text\n- For missing parameters: ONLY request the specific missing parameters\n- For unavailable services (not in function list): output ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\". Do NOT execute a function call.\n- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations\n- NEVER combine text and function calls in the same response\n- NEVER suggest alternative functions when the requested service is unavailable\n- NEVER create or invent new functions not listed below\n\n3. STRICT BOUNDARIES:\n- ONLY use functions from the list below - no exceptions\n- NEVER use a function as an alternative to unavailable information\n- NEVER call functions not present in the function list\n- NEVER add explanatory text to function calls\n- NEVER respond with empty brackets\n- Use proper Python/JSON syntax for function calls\n- Check the function list carefully before responding\n\n4. TOOL RESPONSE HANDLING:\n- When receiving tool responses: provide concise, natural language responses\n- Don't repeat tool response verbatim\n- Don't add supplementary information\n\nHere is a list of functions in JSON format that you can invoke:\n" %} {%- else %} {%- set system_message = "" %} {%- endif %} {%- endif %} - -{#- System message if the user supplied one, or if tools are used (default tool system message) #} +{#- Now writing the system message: use the user provided system message if user_provided_system_message, else default tool system message if tools presented #} {%- if system_message %} {#- always use user provided system message to override default tool system message #} {{- "<|header_start|>system<|header_end|>\n\n" }} {{- system_message }} - {%- if tools is not none and not tools_in_user_message %} - {{- "Tools: You have access to the following tools. You might need to use one " - "or more function/tool calls to fulfill the task. 
\n" - "If none are needed, then proceed to the response.\n\n" - "Tool Call Syntax: You can call tools using the following syntax:\n" - "[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n" - "Do not include anything else when calling the tools with the syntax above.\n\n" - "Here is a list of functions in JSON format that you can invoke.\n " }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} + {%- if user_provided_system_message and tools %} + {{- "\nHere is a list of functions in JSON format that you can invoke. Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\n" }} + {{- tool_definition -}} + {%- elif tool_definition %} + {{- tool_definition -}} {%- endif %} {{- "<|eot|>" }} {%- endif %} -{#- Custom tools are passed in a user message with some extra guidance #} -{%- if tools_in_user_message and tools is not none %} - {#- Extract the first user message so we can plug it in here #} - {%- if messages | length != 0 %} - {%- if messages[0]['content'] is string %} - {%- set first_user_message = messages[0]['content']|trim %} - {%- else %} - {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} - {%- endif %} - {%- set messages = messages[1:] %} - {%- else %} - {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} - {%- endif %} - {{- '<|header_start|>user<|header_end|>\n\n' -}} - {{- first_user_message}} - {{- "\nHere is a list of functions in JSON format that you can invoke:"}} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} - {{- "Should you decide to return the function call(s), put them in the format " - "of [func_name1(params_name1=params_value1, params_name2=params_value2, " - "...), ...]\nDo not include anything else when calling the tools with the " - "syntax above." 
}} -{%- endif %} - +{#- Now deal with all other messages #} {%- for message in messages %} - {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} + {#- Base case: messages that are not from tool role and has empty tool_call list #} + {%- if not (message.role == 'ipython' or message.role == 'tool' or ('tool_calls' in message and message.tool_calls|length != 0 )) %} + {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} {%- if message['content'] is string %} {{- message['content'] }} {%- else %} @@ -91,10 +57,12 @@ {%- endif %} {%- endfor %} {%- endif %} - {{- "<|eot|>" }} - {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} - {%- set tool_call = message.tool_calls[0].function %} - {{- '<|header_start|>assistant<|header_end|>\n\n' -}} + {{- "<|eot|>" }} + {#- Tool case: messages has non-empty tool_call list, must from assistant #} + {%- elif 'tool_calls' in message %} + {#- assume tool_calls are always coming from assistant #} + {%- if message.role == 'assistant' %} + {{- '<|header_start|>assistant<|header_end|>\n\n' -}} {%- if message['content'] is string %} {{- message['content'] }} {%- else %} @@ -106,32 +74,36 @@ {%- endif %} {%- endfor %} {%- endif %} + {{- "[" }} {%- for tool_call in message.tool_calls %} {%- if tool_call.function is defined %} {%- set tool_call = tool_call.function %} {%- endif %} - {{- tool_call.name + '(' -}} + {{- tool_call.name + '(' -}} {%- for param in tool_call.arguments %} - {{- param + '=' -}} + {{- param + '="' -}} {{- "%s" | format(tool_call.arguments[param]) -}} + {{- '"' -}} {% if not loop.last %}, {% endif %} {%- endfor %} {{- ')' -}} {% if not loop.last %}, {% endif %} {%- endfor %} - {{- "<|eom|>" }} + {{- "]<|eot|>" }} +{%- endif %} +{#- Tool_response case: messages are from tool_response #} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|header_start|>ipython<|header_end|>\n\n" }} {%- if message.content is string %} - {{- message.content | tojson }} + {{- message.content | tojson }} {%- else %} {%- for content in message['content'] %} {%- if content['type'] == 'text' %} - {{- content['text'] | tojson }} + {{- content['text'] | tojson }} {%- endif %} {%- endfor %} {%- endif %} - {{- "<|eom|>" }} + {{- "<|eot|>" }} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py new file mode 100644 index 000000000000..92ba1376e200 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -0,0 +1,193 @@ +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock + +import pytest + +from tests.entrypoints.openai.tool_parsers.utils import ( + run_tool_extraction, run_tool_extraction_streaming) +from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager + +# Test cases similar to pythonic parser but with Llama4 specific format +SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]" +SIMPLE_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "LA", "metric": "C"}', +) +MORE_TYPES_FUNCTION_OUTPUT = ("[register_user(name='Doe', " + "age=9, " + "address={'city': 'LA', 'state': 'CA'}, " + "role=None, " + "passed_test=True, " + "aliases=['John', 'Johnny'])]") 
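Editor's note: the fixture just above and the expected `FunctionCall` that follows exercise the core mapping this patch introduces: a pythonic call string becomes a function name plus JSON-serialized keyword arguments. Below is a minimal, standalone sketch of that mapping using only the standard library; it mirrors the ast-based approach of the parser added later in this patch, but it is not the parser itself (which additionally handles streaming and the `<|python_start|>`/`<|python_end|>` markers).

```
# Hedged sketch: map a Llama-4 "pythonic" tool-call string to
# (function name, JSON arguments), as the fixtures above and below expect.
import ast
import json


def parse_pythonic_calls(text: str) -> list[tuple[str, str]]:
    module = ast.parse(text)
    call_list = module.body[0].value  # the surrounding [...] literal
    assert isinstance(call_list, ast.List)
    results = []
    for call in call_list.elts:
        assert isinstance(call, ast.Call) and isinstance(call.func, ast.Name)
        # literal_eval handles strings, numbers, None/True/False, dicts, lists
        args = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
        results.append((call.func.id, json.dumps(args)))
    return results


print(parse_pythonic_calls("[get_weather(city='LA', metric='C')]"))
# [('get_weather', '{"city": "LA", "metric": "C"}')]
```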
+MORE_TYPES_FUNCTION_CALL = FunctionCall( + name="register_user", + arguments='{"name": "Doe", ' + '"age": 9, ' + '"address": {"city": "LA", "state": "CA"}, ' + '"role": null, ' + '"passed_test": true, ' + '"aliases": ["John", "Johnny"]}', +) +PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]" +PARAMETERLESS_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{}', +) +EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]" +EMPTY_DICT_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"additional_data": {}}', +) +EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]" +EMPTY_LIST_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"steps": []}', +) +ESCAPED_STRING_FUNCTION_OUTPUT = ( + r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]") +ESCAPED_STRING_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', +) +PYTHON_TAG_FUNCTION_OUTPUT = ( + "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>") + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tool_call(streaming: bool): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + model_output = "How can I help you today?" + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content == model_output + assert len(tool_calls) == 0 + + +test_str = "<|python_start|>" +test_str += "[get_weather(city='LA', metric='C')," +test_str += "register_user(name='Doe', age=9)]" +TEST_CASES = [ + pytest.param(True, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="simple_streaming"), + pytest.param(False, + SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], + id="simple_nonstreaming"), + pytest.param(True, + MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming"), + pytest.param(False, + MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming"), + pytest.param(True, + PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming"), + pytest.param(False, + PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming"), + pytest.param(True, + EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming"), + pytest.param(False, + EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming"), + pytest.param(True, + EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming"), + pytest.param(False, + EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming"), + pytest.param(True, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming"), + pytest.param(False, + ESCAPED_STRING_FUNCTION_OUTPUT, + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_nonstreaming"), + pytest.param( + True, + "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]", + [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_streaming"), + pytest.param( + False, + "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]", + [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_nonstreaming"), + 
pytest.param(True, + PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], + id="python_tag_streaming"), + pytest.param(False, + PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL], + id="python_tag_nonstreaming"), + pytest.param(True, + test_str, [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_streaming"), + pytest.param(False, + "<|python_start|>[get_weather(city='LA', metric='C'), " + + "register_user(name='Doe', age=9)]", [ + SIMPLE_FUNCTION_CALL, + FunctionCall(name="register_user", + arguments='{"name": "Doe", "age": 9}') + ], + id="parallel_calls_nonstreaming"), +] + + +@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", + TEST_CASES) +def test_tool_call(streaming: bool, model_output: str, + expected_tool_calls: list[FunctionCall]): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert len(tool_calls) == len(expected_tool_calls) + for actual, expected in zip(tool_calls, expected_tool_calls): + assert actual.type == "function" + assert actual.function == expected + + +def test_streaming_tool_call_with_large_steps(): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + model_output_deltas = [ + "<|python_start|>[get_weather(city='LA', metric='C'), " + "get_weather(), " + "do_something_cool(steps=[])]<|python_end|>", + ] + + reconstructor = run_tool_extraction_streaming( + tool_parser, model_output_deltas, assert_one_tool_per_delta=False) + + assert reconstructor.other_content == "" + assert len(reconstructor.tool_calls) == 3 + assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL + assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL + assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index c14eaf71e978..efa6455c41df 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -88,7 +88,7 @@ def ensure_system_prompt(messages: list[dict[str, Any]], "meta-llama/Llama-4-Scout-17B-16E-Instruct", "arguments": [ "--enforce-eager", "--no-enable-prefix-caching", - "--tool-call-parser", "pythonic", "--chat-template", + "--tool-call-parser", "llama4_pythonic", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_llama4_pythonic.jinja"), "-tp", "4" diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index f7c7112b124f..054c0b006b2f 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -7,6 +7,7 @@ from .hermes_tool_parser import Hermes2ProToolParser from .internlm2_tool_parser import Internlm2ToolParser from .jamba_tool_parser import JambaToolParser +from .llama4_pythonic_tool_parser import Llama4PythonicToolParser from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser @@ -16,5 +17,6 @@ "ToolParser", "ToolParserManager", "Granite20bFCToolParser", "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", - "PythonicToolParser", "Phi4MiniJsonToolParser", "DeepSeekV3ToolParser" + 
"Llama4PythonicToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser", + "DeepSeekV3ToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py new file mode 100644 index 000000000000..f483ac4eeee6 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -0,0 +1,303 @@ +# SPDX-License-Identifier: Apache-2.0 + +import ast +import json +import re +from collections.abc import Sequence +from typing import Any, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class _UnexpectedAstError(Exception): + pass + + +@ToolParserManager.register_module("llama4_pythonic") +class Llama4PythonicToolParser(ToolParser): + """ + Toolcall parser for Llama4 that produce tool calls in a pythonic style + Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic + """ + # TODO(mdepinet): Possible future improvements: + # 1. Support text + tools separated by either <|python_tag|> or \n\n + # 2. Support tools outside of a list (or separated by a semicolon). + # This depends on item 1 for consistent streaming. + # Neither of these are necessary for e.g. ToolACE, but both would help make + # Llama3.2 models more reliable. + + TOOL_CALL_REGEX = re.compile( + r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]", + re.DOTALL) + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + + # Rename for readability. This is NOT a tool id. + @property + def current_tool_index(self) -> int: + return self.current_tool_id + + @current_tool_index.setter + def current_tool_index(self, value: int) -> None: + self.current_tool_id = value + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. 
+ """ + + # remove <|python_start|> and <|python_end|> + # as Llama 4 model sometime will output those tokens + if model_output.startswith("<|python_start|>"): + model_output = model_output[len("<|python_start|>"):] + model_output = model_output.replace("<|python_end|>", "") + if not (self.TOOL_CALL_REGEX.match(model_output)): + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + try: + module = ast.parse(model_output) + parsed = getattr(module.body[0], "value", None) + if isinstance(parsed, ast.List) and all( + isinstance(e, ast.Call) for e in parsed.elts): + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=[ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ], + content=None) + else: + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + except Exception: + logger.exception("Error in extracting tool call from response.") + # Treat as regular text + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + if not current_text.startswith("[") and not current_text.startswith( + "<|python_start|>"): + return DeltaMessage(content=delta_text) + + try: + # remove <|python_start|> and <|python_end|> + if current_text.startswith("<|python_start|>"): + current_text = current_text[len("<|python_start|>"):] + if current_text.endswith("<|python_end|>"): + current_text = current_text[:current_text. + rfind("<|python_end|>")] + valid_and_added_text = _make_valid_python(current_text) + if valid_and_added_text is None: + return None + valid_text, added_text = valid_and_added_text + + module = ast.parse(valid_text) + parsed = getattr(module.body[0], "value", None) + if not isinstance(parsed, ast.List) or not all( + isinstance(e, ast.Call) for e in parsed.elts): + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + tool_calls = [ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ] + + tool_deltas = [] + for index, new_call in enumerate(tool_calls): + if index < self.current_tool_index: + continue + + self.current_tool_index = index + if len(self.streamed_args_for_tool) == index: + self.streamed_args_for_tool.append("") + + new_call_complete = index < len( + tool_calls) - 1 or ")]" not in added_text + if new_call_complete: + self.current_tool_index += 1 + + withheld_suffix = (added_text[:-2] + if not new_call_complete else "") + if not new_call_complete and added_text[-2] == ")": + # Function call is incomplete. Withhold the closing bracket. + withheld_suffix = withheld_suffix + "}" + # Strings get single quotes in the model-produced string. + # JSON requires double quotes. + withheld_suffix = withheld_suffix.replace("'", '"') + delta = _compute_tool_delta(self.streamed_args_for_tool[index], + new_call, index, withheld_suffix) + + if delta is not None: + tool_deltas.append(delta) + if (delta.function is not None + and delta.function.arguments is not None): + self.streamed_args_for_tool[ + index] += delta.function.arguments + + # HACK: serving_chat.py inspects the internal state of tool parsers + # when determining it's final streaming delta, automatically + # adding autocompleted JSON. 
+ # These two lines avoid that nonsense while ensuring finish_reason + # is set to tool_calls when at least one tool is called. + if tool_deltas and not self.prev_tool_call_arr: + self.prev_tool_call_arr = [{"arguments": {}}] + + if tool_deltas: + return DeltaMessage(tool_calls=tool_deltas) + elif not added_text and self.current_tool_id > 0: + # Return an empty DeltaMessage once the tool calls are all done + # so that finish_reason gets set. + return DeltaMessage(content='') + else: + return None + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None + + +def _get_parameter_value(val: ast.expr) -> Any: + if isinstance(val, ast.Constant): + return val.value + elif isinstance(val, ast.Dict): + if not all(isinstance(k, ast.Constant) for k in val.keys): + raise _UnexpectedAstError( + "Dict tool call arguments must have literal keys") + return { + k.value: _get_parameter_value(v) # type: ignore + for k, v in zip(val.keys, val.values) + } + elif isinstance(val, ast.List): + return [_get_parameter_value(v) for v in val.elts] + else: + raise _UnexpectedAstError("Tool call arguments must be literals") + + +def _handle_single_tool(call: ast.Call) -> ToolCall: + if not isinstance(call.func, ast.Name): + raise _UnexpectedAstError("Invalid tool call name") + function_name = call.func.id + arguments = {} + for keyword in call.keywords: + arguments[keyword.arg] = _get_parameter_value(keyword.value) + return ToolCall(type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments))) + + +def _make_valid_python(text: str) -> Union[tuple[str, str], None]: + bracket_stack = [] + for index, char in enumerate(text): + if char in {"[", "(", "{"}: + bracket_stack.append(char) + elif char == "]": + if not bracket_stack or bracket_stack.pop() != "[": + raise _UnexpectedAstError("Mismatched square brackets") + elif char == ")": + if not bracket_stack or bracket_stack.pop() != "(": + raise _UnexpectedAstError("Mismatched parentheses") + elif char == "}": + if not bracket_stack or bracket_stack.pop() != "{": + raise _UnexpectedAstError("Mismatched curly braces") + elif char in {"'", '"'}: + if bracket_stack and bracket_stack[-1] == char: + if index > 0 and text[index - 1] == "\\": + # Treat an escaped quote as a regular character + pass + else: + bracket_stack.pop() + elif bracket_stack and bracket_stack[-1] in {"'", '"'}: + # Double quote within a single quote string or vice versa. + pass + else: + bracket_stack.append(char) + + text = text.rstrip() + if text.endswith("=") or text.endswith(":"): + # Since we have no type information for this property/parameter value, + # we can't fill in a valid value. 
+ return None + if bracket_stack and bracket_stack[-1] == "{": + trailing_dict_text = text[:text.rfind("{")] + num_keys = trailing_dict_text.count(":") + num_values = trailing_dict_text.count(",") + if num_keys <= num_values: + return None # Incomplete property name within parameter value + if bracket_stack and bracket_stack[-1] == "(": + trailing_params_text = text[:text.rfind("(")] + num_full_param_names = trailing_params_text.count("=") + num_full_param_values = trailing_params_text.count(",") + if num_full_param_names <= num_full_param_values: + return None # Incomplete parameter name + if text.endswith(","): + text = text[:-1] + if bracket_stack and bracket_stack[-1] == "[" and not text.endswith( + "[") and not text.endswith(")"): + return None # Incomplete function name + + added_text = "" + for char in reversed(bracket_stack): + if char == "[": + added_text += "]" + elif char == "(": + added_text += ")" + elif char == "{": + added_text += "}" + elif char == "'": + added_text += "'" + elif char == '"': + added_text += '"' + + return text + added_text, added_text + + +def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall, + index: int, + withheld_suffix: str) -> Union[DeltaToolCall, None]: + new_call_args = new_call.function.arguments + if withheld_suffix: + assert new_call_args.endswith(withheld_suffix) + new_call_args = new_call_args[:-len(withheld_suffix)] + if not previously_sent_args: + return DeltaToolCall(id=new_call.id, + type="function", + index=index, + function=DeltaFunctionCall( + name=new_call.function.name, + arguments=new_call_args, + )) + + arg_diff = new_call_args[len(previously_sent_args):] + return DeltaToolCall( + id=None, index=index, function=DeltaFunctionCall( + arguments=arg_diff)) if arg_diff else None From c32e249a23169353ccc02d7c6099a8c90ca4bbf6 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Thu, 22 May 2025 21:44:18 -0400 Subject: [PATCH 068/192] [Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926) Signed-off-by: Sanger Steel --- .buildkite/test-pipeline.yaml | 1 + examples/other/tensorize_vllm_model.py | 96 +++++++-- .../openai/test_tensorizer_entrypoint.py | 97 ++++++++++ tests/lora/test_llama_tp.py | 120 ++++++++++-- tests/tensorizer_loader/conftest.py | 8 - tests/tensorizer_loader/test_tensorizer.py | 162 +++++----------- vllm/engine/arg_utils.py | 3 +- vllm/lora/models.py | 73 ++++--- vllm/lora/peft_helper.py | 28 ++- vllm/lora/request.py | 1 + vllm/lora/worker_manager.py | 4 +- .../model_executor/model_loader/tensorizer.py | 182 ++++++++++++++++-- .../model_loader/tensorizer_loader.py | 5 +- vllm/v1/engine/core.py | 7 + vllm/v1/worker/gpu_model_runner.py | 12 +- vllm/v1/worker/gpu_worker.py | 8 + 16 files changed, 608 insertions(+), 199 deletions(-) create mode 100644 tests/entrypoints/openai/test_tensorizer_entrypoint.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0e4a0e2a531b..017dba3d2d55 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -128,6 +128,7 @@ steps: - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs 
to avoid interference with other tests - label: Distributed Tests (4 GPUs) # 10min diff --git a/examples/other/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py index 7d11ba51a094..b1f2ce871bb4 100644 --- a/examples/other/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -6,11 +6,12 @@ import os import uuid -from vllm import LLM +from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs, - TensorizerConfig, - tensorize_vllm_model) +from vllm.lora.request import LoRARequest +from vllm.model_executor.model_loader.tensorizer import ( + TensorizerArgs, TensorizerConfig, tensorize_lora_adapter, + tensorize_vllm_model) from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring @@ -27,7 +28,7 @@ To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python -m examples.other.tensorize_vllm_model \ +python examples/other/tensorize_vllm_model.py \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -47,7 +48,7 @@ To deserialize a model, you can run something like this from the root level of this repository: -python -m examples.other.tensorize_vllm_model \ +python examples/other/tensorize_vllm_model.py \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -69,7 +70,7 @@ Or for deserializing: -`python -m examples.other.tensorize_vllm_model deserialize --help`. +`python examples/other/tensorize_vllm_model.py deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -90,11 +91,27 @@ In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python -m examples.other.tensorize_vllm_model deserialize --help` +`python examples/other/tensorize_vllm_model.py deserialize --help` under the `tensorizer options` section. These can also be used for deserialization in this example script, although `--tensorizer-uri` and `--path-to-tensors` are functionally the same in this case. + +Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter +can be serialized directly with the path to the LoRA adapter on HF Hub and +a TensorizerConfig object. In this script, passing a HF id to a LoRA adapter +will serialize the LoRA adapter artifacts to `--serialized-directory`. + +You can then use the LoRA adapter with `vllm serve`, for instance, by ensuring +the LoRA artifacts are in your model artifacts directory and specifying +`--enable-lora`. For instance: + +``` +vllm serve \ + --load-format tensorizer \ + --model-loader-extra-config '{"tensorizer_uri": ".tensors"}' \ + --enable-lora +``` """ @@ -107,6 +124,19 @@ def parse_args(): "also supported, although libsodium must be installed to " "use it.") parser = EngineArgs.add_cli_args(parser) + + parser.add_argument( + "--lora-path", + type=str, + required=False, + help="Path to a LoRA adapter to " + "serialize along with model tensors. This can then be deserialized " + "along with the model by passing a tensorizer_config kwarg to " + "LoRARequest with type TensorizerConfig. See the docstring for this " + "for a usage example." 
+ + ) + subparsers = parser.add_subparsers(dest='command') serialize_parser = subparsers.add_parser( @@ -169,11 +199,42 @@ def parse_args(): def deserialize(): - llm = LLM(model=args.model, - load_format="tensorizer", - tensor_parallel_size=args.tensor_parallel_size, - model_loader_extra_config=tensorizer_config - ) + if args.lora_path: + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + llm = LLM(model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config, + enable_lora=True, + ) + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] + ) + + # Truncating this as the extra text isn't necessary + prompts = [ + "[user] Write a SQL query to answer the question based on ..." + ] + + # Test LoRA load + print( + llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql-lora", + 1, + args.lora_path, + tensorizer_config = tensorizer_config) + ) + ) + else: + llm = LLM(model=args.model, + load_format="tensorizer", + tensor_parallel_size=args.tensor_parallel_size, + model_loader_extra_config=tensorizer_config + ) return llm @@ -197,7 +258,10 @@ def deserialize(): model_name = model_ref.split("/")[1] - keyfile = args.keyfile if args.keyfile else None + if args.command == "serialize" or args.command == "deserialize": + keyfile = args.keyfile + else: + keyfile = None if args.model_loader_extra_config: config = json.loads(args.model_loader_extra_config) @@ -228,6 +292,10 @@ def deserialize(): encryption_keyfile=keyfile, **credentials) + if args.lora_path: + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + tensorize_lora_adapter(args.lora_path, tensorizer_config) + tensorize_vllm_model(engine_args, tensorizer_config) elif args.command == "deserialize": diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py new file mode 100644 index 000000000000..f1ab7223048d --- /dev/null +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: Apache-2.0 +import gc +import json +import tempfile + +import openai +import pytest +import pytest_asyncio +import torch.cuda + +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.model_loader.tensorizer import ( + TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model) + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "unsloth/llama-3.2-1b-Instruct" +LORA_PATH = "davzoku/finqa_adapter_1b" + + +def _cleanup(): + gc.collect() + torch.cuda.empty_cache() + + +@pytest.fixture(autouse=True) +def cleanup(): + _cleanup() + + +@pytest.fixture(scope='module') +def tmp_dir(): + with tempfile.TemporaryDirectory() as path: + yield path + + +@pytest.fixture(scope='module') +def model_uri(tmp_dir): + yield f"{tmp_dir}/model.tensors" + + +@pytest.fixture(scope="module") +def tensorize_model_and_lora(tmp_dir, model_uri): + tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri, + lora_dir=tmp_dir) + args = EngineArgs(model=MODEL_NAME, device="cuda") + + tensorize_lora_adapter(LORA_PATH, tensorizer_config) + tensorize_vllm_model(args, tensorizer_config) + + # Manually invoke a _cleanup() here, as the cleanup() + # fixture won't be guaranteed to be called after this + # when this fixture is used for a test + _cleanup() + yield + + +@pytest.fixture(scope="module") +def server(model_uri, tensorize_model_and_lora): + model_loader_extra_config = { + "tensorizer_uri": 
model_uri, + } + + ## Start OpenAI API server + args = [ + "--load-format", "tensorizer", "--device", "cuda", + "--model-loader-extra-config", + json.dumps(model_loader_extra_config), "--enable-lora" + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): + _cleanup() + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + assert completion.model == MODEL_NAME + assert len(completion.choices) == 1 + assert len(completion.choices[0].text) >= 5 + assert completion.choices[0].finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index e3a054bd6206..37bbc3cfa7d0 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,12 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 +import subprocess +import sys +from typing import Union import pytest import ray import vllm +from vllm import LLM from vllm.lora.request import LoRARequest +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from ..utils import create_new_process_for_each_test, multi_gpu_test +from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora): pass -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: +def do_sample(llm: vllm.LLM, + lora_path: str, + lora_id: int, + tensorizer_config_dict: Union[dict, None] = None) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 @@ -45,15 +53,28 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) + + if tensorizer_config_dict is not None: + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest( + str(lora_id), + lora_id, + lora_path, + tensorizer_config_dict=tensorizer_config_dict) + if lora_id else None) + else: + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) # Print the outputs. generated_texts: list[str] = [] for output in outputs: @@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -def generate_and_test(llm, sql_lora_files): +def generate_and_test(llm, + sql_lora_files, + tensorizer_config_dict: Union[dict, None] = None): print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1) == EXPECTED_LORA_OUTPUT print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT + assert do_sample(llm, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=2) == EXPECTED_LORA_OUTPUT print("removing lora") @@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): enable_chunked_prefill=True, ) generate_and_test(llm, sql_lora_files) + + +@multi_gpu_test(num_gpus=2) +@create_new_process_for_each_test() +def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, + sql_lora_huggingface_id): + + # Run the tensorizing of the LoRA adapter and the model in a subprocess + # to guarantee cleanup + + tp_size = 2 + model_name = "model-rank-%03d.tensors" + + model_ref = MODEL_PATH + lora_path = sql_lora_huggingface_id + suffix = "test" + try: + result = subprocess.run([ + sys.executable, + f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model", + MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size", + str(tp_size), "serialize", "--serialized-directory", + str(tmp_path), "--suffix", suffix + ], + check=True, + capture_output=True, + text=True) + except subprocess.CalledProcessError as e: + print("Tensorizing failed.") + print("STDOUT:\n", e.stdout) + print("STDERR:\n", e.stderr) + raise + + print("STDOUT:\n", result.stdout) + + model_uri = tmp_path / "vllm" / model_ref / suffix / model_name + tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) + tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir + + loaded_vllm_model = LLM(model=model_ref, + 
load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2) + + tensorizer_config_dict = tensorizer_config.to_dict() + + print("lora adapter created") + assert do_sample(loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 1") + assert do_sample(loaded_vllm_model, + sql_lora_files, + tensorizer_config_dict=tensorizer_config_dict, + lora_id=1) == EXPECTED_LORA_OUTPUT diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 7efef163d2b9..ce8689f5b89c 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -5,14 +5,6 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Tensorizer only tested on V0 so far. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - @pytest.fixture(autouse=True) def cleanup(): cleanup_dist_env_and_memory(shutdown_ray=True) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 7136dd44de03..b6286e148397 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import gc -import json import os import pathlib import subprocess -from functools import partial from unittest.mock import MagicMock, patch -import openai import pytest import torch -from huggingface_hub import snapshot_download from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -22,12 +18,11 @@ is_vllm_tensorized, load_with_tensorizer, open_stream, - serialize_vllm_model, tensorize_vllm_model) # yapf: enable -from vllm.utils import PlaceholderModule, import_from_path +from vllm.utils import PlaceholderModule -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import VLLM_PATH try: from tensorizer import EncryptionParams @@ -103,6 +98,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + args = EngineArgs(model=model_ref) with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -110,15 +106,13 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, - encryption_keyfile=key_path) + config_for_serializing = TensorizerConfig(tensorizer_uri=str(model_path), + encryption_keyfile=str(key_path)) - vllm_model.apply_model( - partial(serialize_vllm_model, - tensorizer_config=config_for_serializing)) + tensorize_vllm_model(args, config_for_serializing) - config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, - encryption_keyfile=key_path) + config_for_deserializing = TensorizerConfig( + tensorizer_uri=str(model_path), encryption_keyfile=str(key_path)) with vllm_runner(model_ref, load_format="tensorizer", @@ -154,113 +148,46 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, assert outputs == deserialized_outputs -def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): - multilora_inference = import_from_path( - 
"examples.offline_inference.multilora_inference", - EXAMPLES_PATH / "offline_inference/multilora_inference.py", - ) - - model_ref = "meta-llama/Llama-2-7b-hf" - lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - test_prompts = multilora_inference.create_test_prompts(lora_path) - - # Serialize model before deserializing and binding LoRA adapters - with vllm_runner(model_ref) as vllm_model: - model_path = tmp_path / (model_ref + ".tensors") - - vllm_model.apply_model( - partial( - serialize_vllm_model, - tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) - - with vllm_runner( - model_ref, - load_format="tensorizer", - model_loader_extra_config=TensorizerConfig( - tensorizer_uri=model_path, - num_readers=1, - ), - enable_lora=True, - max_loras=1, - max_lora_rank=8, - max_cpu_loras=2, - max_num_seqs=50, - max_model_len=1000, - ) as loaded_vllm_model: - multilora_inference.process_requests( - loaded_vllm_model.model.llm_engine, test_prompts) - - assert loaded_vllm_model - - -def test_load_without_tensorizer_load_format(vllm_runner): +def test_load_without_tensorizer_load_format(vllm_runner, capfd): model = None - with pytest.raises(ValueError): + try: model = vllm_runner( model_ref, model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) - del model - gc.collect() - torch.cuda.empty_cache() - - -@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") -def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): - ## Serialize model - with vllm_runner(model_ref) as vllm_model: - model_path = tmp_path / (model_ref + ".tensors") - - vllm_model.apply_model( - partial( - serialize_vllm_model, - tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) - - model_loader_extra_config = { - "tensorizer_uri": str(model_path), - } - - ## Start OpenAI API server - openai_args = [ - "--dtype", - "float16", - "--load-format", - "tensorizer", - "--model-loader-extra-config", - json.dumps(model_loader_extra_config), - ] - - with RemoteOpenAIServer(model_ref, openai_args) as server: - print("Server ready.") - - client = server.get_client() - completion = client.completions.create(model=model_ref, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert len(completion.choices) == 1 - assert len(completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - -def test_raise_value_error_on_invalid_load_format(vllm_runner): + except RuntimeError: + out, err = capfd.readouterr() + combined_output = out + err + assert ("ValueError: Model loader extra config " + "is not supported for load " + "format LoadFormat.AUTO") in combined_output + finally: + del model + gc.collect() + torch.cuda.empty_cache() + + +def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd): model = None - with pytest.raises(ValueError): + try: model = vllm_runner( model_ref, load_format="safetensors", model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) - del model - gc.collect() - torch.cuda.empty_cache() + except RuntimeError: + out, err = capfd.readouterr() + + combined_output = out + err + assert ("ValueError: Model loader extra config is not supported " + "for load format LoadFormat.SAFETENSORS") in combined_output + finally: + del model + gc.collect() + torch.cuda.empty_cache() @pytest.mark.skipif(torch.cuda.device_count() < 2, 
reason="Requires 2 GPUs") -def test_tensorizer_with_tp_path_without_template(vllm_runner): - with pytest.raises(ValueError): +def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd): + try: model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" @@ -275,6 +202,13 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner): tensor_parallel_size=2, disable_custom_all_reduce=True, ) + except RuntimeError: + out, err = capfd.readouterr() + combined_output = out + err + assert ("ValueError: For a sharded model, tensorizer_uri " + "should include a string format template like '%04d' " + "to be formatted with the rank " + "of the shard") in combined_output @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") @@ -288,7 +222,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( enforce_eager=True, ) as base_model: outputs = base_model.generate(prompts, sampling_params) - base_model.model.llm_engine.model_executor.shutdown() # load model with two shards and serialize with encryption model_path = str(tmp_path / (model_ref + "-%02d.tensors")) @@ -296,7 +229,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( tensorizer_config = TensorizerConfig( tensorizer_uri=model_path, - encryption_keyfile=key_path, + encryption_keyfile=str(key_path), ) tensorize_vllm_model( @@ -331,14 +264,13 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) + args = EngineArgs(model=model_ref, device="cuda") with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - vllm_model.apply_model( - partial(serialize_vllm_model, tensorizer_config=config)) - - assert is_vllm_tensorized(config) + tensorize_vllm_model(args, config) + assert is_vllm_tensorized(config) with vllm_runner(model_ref, load_format="tensorizer", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5650742ff972..12c306e98048 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1195,8 +1195,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: ############################################################# # Unsupported Feature Flags on V1. 
- if (self.load_format == LoadFormat.TENSORIZER.value - or self.load_format == LoadFormat.SHARDED_STATE.value): + if self.load_format == LoadFormat.SHARDED_STATE.value: _raise_or_fallback( feature_name=f"--load_format {self.load_format}", recommend_to_remove=False) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 959fe4a672a6..83aef62451a1 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -29,6 +29,7 @@ get_supported_lora_modules, is_regex_target_modules, parse_fine_tuned_lora_name, replace_submodule) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -185,19 +186,19 @@ def from_lora_tensors( @classmethod def from_local_checkpoint( - cls, - lora_dir: str, - expected_lora_modules: list[str], - peft_helper: PEFTHelper, - *, - lora_model_id: Optional[int] = None, - device: str = "cuda", - dtype: Optional[torch.dtype] = None, - target_embedding_padding: Optional[int] = None, - embedding_modules: Optional[dict[str, str]] = None, - embedding_padding_modules: Optional[list[str]] = None, - weights_mapper: Optional[WeightsMapper] = None, - ) -> "LoRAModel": + cls, + lora_dir: str, + expected_lora_modules: list[str], + peft_helper: PEFTHelper, + *, + lora_model_id: Optional[int] = None, + device: str = "cuda", + dtype: Optional[torch.dtype] = None, + target_embedding_padding: Optional[int] = None, + embedding_modules: Optional[dict[str, str]] = None, + embedding_padding_modules: Optional[list[str]] = None, + weights_mapper: Optional[WeightsMapper] = None, + tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. Args: @@ -219,10 +220,36 @@ def from_local_checkpoint( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + tensors: dict[str, torch.Tensor] = {} + unexpected_modules: list[Union[list[str], str]] = [] + + def check_unexpected_modules(modules: dict): + for lora_module in modules.keys(): # noqa + module_name, _, _ = parse_fine_tuned_lora_name( + lora_module, weights_mapper) + part_name = module_name.split(".")[-1] + if part_name not in expected_lora_modules: + unexpected_modules.append(module_name) + if unexpected_modules: + raise ValueError( + f"While loading {lora_dir}, expected" + f" target modules in {expected_lora_modules}" + f" but received {unexpected_modules}." + f" Please verify that the loaded LoRA module is correct") - unexpected_modules: list[Union[list[str], str]] - if os.path.isfile(lora_tensor_path): - tensors: dict[str, torch.Tensor] = {} + if tensorizer_config_dict: + from tensorizer import TensorDeserializer + + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + lora_tensor_path = os.path.join(tensorizer_config.tensorizer_dir, + "adapter_model.tensors") + tensorizer_args = tensorizer_config._construct_tensorizer_args() + tensors = TensorDeserializer(lora_tensor_path, + dtype=tensorizer_config.dtype, + **tensorizer_args.deserializer_params) + check_unexpected_modules(tensors) + + elif os.path.isfile(lora_tensor_path): # Find unexpected modules. # Use safetensor key as a source of truth to find expected modules. 
# in peft if you have target_modules A, B, C and C does not exist @@ -232,20 +259,8 @@ def from_local_checkpoint( unexpected_modules = [] with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore - for lora_module in f.keys(): # noqa - module_name, _, _ = parse_fine_tuned_lora_name( - lora_module, weights_mapper) - part_name = module_name.split(".")[-1] - if part_name not in expected_lora_modules: - unexpected_modules.append(module_name) - if unexpected_modules: - raise ValueError( - f"While loading {lora_dir}, expected" - f" target modules in {expected_lora_modules}" - f" but received {unexpected_modules}." - f" Please verify that the loaded LoRA module is correct" - ) # Load tensors if there are only expected modules. + check_unexpected_modules(f) for module in f.keys(): # noqa tensors[module] = f.get_tensor(module) elif os.path.isfile(lora_bin_file_path): diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index d5de63f5baad..7d335e5f7fab 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -10,6 +10,7 @@ from vllm.config import LoRAConfig from vllm.logger import init_logger +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig logger = init_logger(__name__) @@ -89,12 +90,31 @@ def from_dict(cls, config_dict: dict) -> "PEFTHelper": return cls(**filtered_dict) @classmethod - def from_local_dir(cls, lora_path: str, - max_position_embeddings: Optional[int]) -> "PEFTHelper": + def from_local_dir( + cls, + lora_path: str, + max_position_embeddings: Optional[int], + tensorizer_config_dict: Optional[dict] = None) -> "PEFTHelper": lora_config_path = os.path.join(lora_path, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) + if tensorizer_config_dict: + tensorizer_config = TensorizerConfig(**tensorizer_config_dict) + tensorizer_args = tensorizer_config._construct_tensorizer_args() + from tensorizer.stream_io import open_stream + lora_config_path = os.path.join(tensorizer_config.lora_dir, + "adapter_config.json") + with open_stream(lora_config_path, + mode="rb", + **tensorizer_args.stream_params) as f: + config = json.load(f) + + logger.info("Successfully deserialized LoRA config from %s", + tensorizer_config.lora_dir) + + else: + with open(lora_config_path) as f: + config = json.load(f) + config["vllm_max_position_embeddings"] = max_position_embeddings return cls.from_dict(config) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index badfaa419377..616e94f8d678 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -31,6 +31,7 @@ class LoRARequest( lora_local_path: Optional[str] = msgspec.field(default=None) long_lora_max_len: Optional[int] = None base_model_name: Optional[str] = msgspec.field(default=None) + tensorizer_config_dict: Optional[dict] = None def __post_init__(self): if self.lora_local_path: diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 8e5bc6106659..afc8a8dc3b26 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -100,7 +100,8 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: lora_path = get_adapter_absolute_path(lora_request.lora_path) peft_helper = PEFTHelper.from_local_dir( - lora_path, self.max_position_embeddings) + lora_path, self.max_position_embeddings, + lora_request.tensorizer_config_dict) # Validates the LoRA configuration against requirements before # loading weights, throwing an exception if validation fails. 
@@ -125,6 +126,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: self.lora_config.lora_extra_vocab_size, embedding_modules=self.embedding_modules, embedding_padding_modules=self.embedding_padding_modules, + tensorizer_config_dict=lora_request.tensorizer_config_dict, weights_mapper=hf_to_vllm_mapper) except FileNotFoundError as e: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 459c4b4392e3..900f12ebe6ca 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -1,24 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import contextlib +import contextvars import dataclasses import io +import json import os import re +import threading import time from collections.abc import Generator from dataclasses import dataclass from functools import partial -from typing import BinaryIO, Optional, Union +from typing import Any, BinaryIO, Optional, Union import torch from torch import nn +from torch.utils._python_dispatch import TorchDispatchMode from transformers import PretrainedConfig import vllm.envs as envs from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -58,9 +62,79 @@ logger = init_logger(__name__) +class MetaTensorMode(TorchDispatchMode): + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + if func._schema.name == "aten::empty" and "device" not in kwargs: + kwargs["device"] = "meta" + + return func(*args, **kwargs) + + +def meta_tensor_mode(loading_code=None, ): + + if loading_code is None: + return _NoInitOrTensorImpl.context_manager() + elif callable(loading_code): + with _NoInitOrTensorImpl.context_manager(): + return loading_code() + else: + raise TypeError( + "expected a callable to evaluate," + " or None if being used as a context manager;" + f' got an object of type "{type(loading_code).__name__}" instead.') + + +class _NoInitOrTensorImpl: + _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm) + _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES) + + is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active", + default=False) + _count_active: int = 0 + _count_active_lock = threading.Lock() + + @classmethod + @contextlib.contextmanager + def context_manager(cls): + if cls.is_active.get(): + yield + return + + with cls._count_active_lock: + cls._count_active += 1 + if cls._count_active == 1: + for mod in cls._MODULES: + mod.reset_parameters = cls._disable(mod.reset_parameters) + + reset_token = cls.is_active.set(True) + + try: + with MetaTensorMode(): + yield + finally: + cls.is_active.reset(reset_token) + with cls._count_active_lock: + cls._count_active -= 1 + if cls._count_active == 0: + for mod, original in cls._MODULE_ORIGINALS: + mod.reset_parameters = original + + @staticmethod + def _disable(func): + + def wrapper(*args, **kwargs): + if not _NoInitOrTensorImpl.is_active.get(): + return func(*args, **kwargs) + + return wrapper + + @dataclass class TensorizerConfig: - tensorizer_uri: str + tensorizer_uri: Union[str, None] = None vllm_tensorized: Optional[bool] = False verify_hash: Optional[bool] = False num_readers: Optional[int] = None @@ -71,12 +145,29 @@ class TensorizerConfig: model_class: 
Optional[type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None dtype: Optional[Union[str, torch.dtype]] = None + lora_dir: Optional[str] = None _is_sharded: bool = False def __post_init__(self): # check if the configuration is for a sharded vLLM model self._is_sharded = isinstance(self.tensorizer_uri, str) \ and re.search(r'%0\dd', self.tensorizer_uri) is not None + if not self.tensorizer_uri and not self.lora_dir: + raise ValueError("tensorizer_uri must be provided.") + if not self.tensorizer_uri and self.lora_dir: + self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors" + assert self.tensorizer_uri is not None, ("tensorizer_uri must be " + "provided.") + self.tensorizer_dir = os.path.dirname(self.tensorizer_uri) + self.lora_dir = self.tensorizer_dir + + @classmethod + def as_dict(cls, *args, **kwargs) -> dict[str, Any]: + cfg = TensorizerConfig(*args, **kwargs) + return dataclasses.asdict(cfg) + + def to_dict(self) -> dict[str, Any]: + return dataclasses.asdict(self) def _construct_tensorizer_args(self) -> "TensorizerArgs": tensorizer_args = { @@ -140,7 +231,9 @@ class TensorizerArgs: Args: tensorizer_uri: Path to serialized model tensors. Can be a local file - path or a S3 URI. + path or a S3 URI. This is a required field unless lora_dir is + provided and the config is meant to be used for the + `tensorize_lora_adapter` function. vllm_tensorized: If True, indicates that the serialized model is a vLLM model. This is used to determine the behavior of the TensorDeserializer when loading tensors from a serialized model. @@ -296,10 +389,10 @@ def _init_model(self): model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? - with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, - check_compile=True): + with meta_tensor_mode(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( - vllm_config=self.vllm_config, ) + vllm_config=self.vllm_config) def _resize_lora_embeddings(self): """Modify LoRA embedding layers to use bigger tensors @@ -467,8 +560,73 @@ def tensorize_vllm_model(engine_args: EngineArgs, ) as stream: stream.write(encryption_params.key) - engine = LLMEngine.from_engine_args(engine_args) - engine.model_executor.collective_rpc( - "save_tensorized_model", - kwargs=dict(tensorizer_config=tensorizer_config), - ) + from vllm import LLMEngine + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + + if not envs.VLLM_USE_V1: + engine = LLMEngine.from_engine_args(engine_args) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) + else: + engine = V1LLMEngine.from_vllm_config(engine_config) + engine.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) + + +def tensorize_lora_adapter(lora_path: str, + tensorizer_config: TensorizerConfig): + """ + Uses tensorizer to serialize a LoRA adapter. Assumes that the files + needed to load a LoRA adapter are a safetensors-format file called + adapter_model.safetensors and a json config file called adapter_config.json. 
+ + Serializes the files in the tensorizer_config.lora_dir + """ + import safetensors + + from vllm.lora.utils import get_adapter_absolute_path + + lora_dir = get_adapter_absolute_path(lora_path) + + tensor_path = config_path = "" + + for file in os.listdir(lora_dir): + if file.startswith("adapter_model"): + tensor_path = lora_dir + "/" + file + if file.startswith("adapter_config"): + config_path = lora_dir + "/" + file + if tensor_path and config_path: + break + + if tensor_path.endswith(".safetensors"): + tensors = safetensors.torch.load_file(tensor_path) + elif tensor_path.endswith(".bin"): + tensors = torch.load(tensor_path) + else: + raise ValueError("Unsupported file: %s", tensor_path) + + with open(config_path) as f: + config = json.load(f) + + tensorizer_args = tensorizer_config._construct_tensorizer_args() + + with open_stream(f"{tensorizer_config.lora_dir}/adapter_config.json", + mode="wb+", + **tensorizer_args.stream_params) as f: + + f.write(json.dumps(config).encode("utf-8")) + + lora_uri = (f"{tensorizer_config.lora_dir}" + f"/adapter_model.tensors") + with open_stream(lora_uri, mode="wb+", + **tensorizer_args.stream_params) as f: + serializer = TensorSerializer(f) + serializer.write_state_dict(tensors) + serializer.close() + + logger.info("Successfully serialized LoRA files to %s", + str(tensorizer_config.lora_dir)) diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 4107e741fd8f..ac9ef6164388 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -2,6 +2,7 @@ # ruff: noqa: SIM117 import copy from collections.abc import Generator +from typing import Union import torch from torch import nn @@ -111,8 +112,10 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: @staticmethod def save_model( model: torch.nn.Module, - tensorizer_config: TensorizerConfig, + tensorizer_config: Union[TensorizerConfig, dict], ) -> None: + if isinstance(tensorizer_config, dict): + tensorizer_config = TensorizerConfig(**tensorizer_config) serialize_vllm_model( model=model, tensorizer_config=tensorizer_config, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 64e472457ee3..740ba60fe231 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -340,6 +340,13 @@ def collective_rpc(self, return self.model_executor.collective_rpc(method, timeout, args, kwargs) + def save_tensorized_model( + self, + tensorizer_config, + ) -> None: + self.model_executor.save_tensorized_model( + tensorizer_config=tensorizer_config, ) + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 759d69293a32..6d4888363d50 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -25,7 +25,7 @@ from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader import TensorizerLoader, get_model from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -60,6 +60,7 @@ if TYPE_CHECKING: import xgrammar as xgr + from 
vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") @@ -1534,6 +1535,15 @@ def load_model(self) -> None: time_after_load - time_before_load) prepare_communication_buffer_for_model(self.model) + def save_tensorized_model( + self, + tensorizer_config: "TensorizerConfig", + ) -> None: + TensorizerLoader.save_model( + self.model, + tensorizer_config=tensorizer_config, + ) + def _get_prompt_logprobs_dict( self, hidden_states: torch.Tensor, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 93129d987940..2b945cc4111a 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -31,6 +31,7 @@ logger = init_logger(__name__) if TYPE_CHECKING: + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput @@ -326,6 +327,13 @@ def save_sharded_state( max_size=max_size, ) + def save_tensorized_model( + self, + tensorizer_config: "TensorizerConfig", + ) -> None: + self.model_runner.save_tensorized_model( + tensorizer_config=tensorizer_config, ) + def init_worker_distributed_environment( vllm_config: VllmConfig, From 46791e1b4bad8698043eb632bb856306921f2334 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 22 May 2025 20:45:35 -0500 Subject: [PATCH 069/192] [AMD] [P/D] Compute num gpus for ROCm correctly in run_accuracy_test.sh (#18568) Signed-off-by: Randall Smith --- .../nixl_integration/run_accuracy_test.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index e90b72a7cf24..c17784e0a263 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -13,6 +13,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-2} # Default to 2 # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) +SMI_BIN=$(which nvidia-smi || which rocm-smi) + # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -44,6 +46,13 @@ get_model_args() { echo "$extra_args" } +get_num_gpus() { + if [[ "$SMI_BIN" == *"nvidia"* ]]; then + echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + else + echo "$($SMI_BIN -l | grep GPU | wc -l)" + fi +} # Function to run tests for a specific model run_tests_for_model() { @@ -64,7 +73,7 @@ run_tests_for_model() { # Start prefill instances for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs - GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l))) + GPU_ID=$((i % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8100 + i)) # Calculate side channel port @@ -96,7 +105,7 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l))) + GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8200 + i)) # Calculate side channel port From 04eb88dc803b4b39edd91b1e86a8e3cad5c0c0a1 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Thu, 22 May 2025 18:59:18 -0700 
Subject: [PATCH 070/192] Re-submit: Fix: Proper RGBA -> RGB conversion for PIL images. (#18569) Signed-off-by: Chenheli Hua --- benchmarks/benchmark_dataset.py | 3 +- .../qwen2_5_omni/only_thinker.py | 4 +- examples/offline_inference/vision_language.py | 5 ++- .../multimodal/generation/test_interleaved.py | 6 ++- .../multimodal/generation/test_phi4mm.py | 4 +- tests/models/test_oot_registration.py | 3 +- tests/multimodal/assets/rgba.png | Bin 0 -> 224566 bytes tests/multimodal/test_image.py | 36 ++++++++++++++++++ tests/multimodal/test_utils.py | 3 +- vllm/benchmarks/datasets.py | 4 +- vllm/model_executor/models/internvl.py | 3 +- vllm/model_executor/models/skyworkr1v.py | 3 +- vllm/multimodal/hasher.py | 4 +- vllm/multimodal/image.py | 25 ++++++++++-- vllm/transformers_utils/processors/ovis.py | 6 ++- 15 files changed, 89 insertions(+), 20 deletions(-) create mode 100644 tests/multimodal/assets/rgba.png create mode 100644 tests/multimodal/test_image.py diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index d8f48644cc00..5513a5f78f1c 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -35,6 +35,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer logger = logging.getLogger(__name__) @@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]: if isinstance(image, dict) and "bytes" in image: image = Image.open(BytesIO(image["bytes"])) if isinstance(image, Image.Image): - image = image.convert("RGB") + image = convert_image_mode(image, "RGB") with io.BytesIO() as image_data: image.save(image_data, format="JPEG") image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index 52b6e977eaa2..deb6f580a447 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -11,6 +11,7 @@ from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode from vllm.utils import FlexibleArgumentParser @@ -45,7 +46,8 @@ def get_mixed_modalities_query() -> QueryResult: "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, "image": - ImageAsset("cherry_blossom").pil_image.convert("RGB"), + convert_image_mode( + ImageAsset("cherry_blossom").pil_image, "RGB"), "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, }, diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index c54f328c7a38..941fcd381dea 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -19,6 +19,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest +from vllm.multimodal.image import convert_image_mode from vllm.utils import FlexibleArgumentParser @@ -1096,8 +1097,8 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom") \ - .pil_image.convert("RGB") + image = convert_image_mode( + ImageAsset("cherry_blossom").pil_image, "RGB") img_questions = [ 
"What is the content of this image?", "Describe the content of this image in detail.", diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py index eec84751e450..972db40e8bd6 100644 --- a/tests/models/multimodal/generation/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -4,6 +4,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"] @@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: give the same result. """ - image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB") - image_stop = ImageAsset("stop_sign").pil_image.convert("RGB") + image_cherry = convert_image_mode( + ImageAsset("cherry_blossom").pil_image, "RGB") + image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB") images = [image_cherry, image_stop] video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index 11460a1a8d2b..5a12b5910949 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -12,7 +12,7 @@ from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest -from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.image import convert_image_mode, rescale_image_size from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs @@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str, # use the example speech question so that the model outputs are reasonable audio = librosa.load(speech_question, sr=None) - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") inputs_vision_speech = [ ( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index b45a87d94b86..b62720caa9cb 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -4,6 +4,7 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset +from vllm.multimodal.image import convert_image_mode from ..utils import create_new_process_for_each_test @@ -58,7 +59,7 @@ def test_oot_registration_embedding( assert all(v == 0 for v in output.outputs.embedding) -image = ImageAsset("cherry_blossom").pil_image.convert("RGB") +image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") @create_new_process_for_each_test() diff --git a/tests/multimodal/assets/rgba.png b/tests/multimodal/assets/rgba.png new file mode 100644 index 0000000000000000000000000000000000000000..11eb81857a65ba8ddd80c4e07c5a0bbe2ddcf401 GIT binary patch literal 224566 zcmbrlWm6o{_B}kf4+Ftv7&N#;aQ6TS1b26L51PRTcbA0V?k>UIEw}}T;F3T0-d{am z;OQ6LRb5@x`hDWdy0Kx7> zUP@fe3+`kX2`-Es1o*tBFl$LmFZM$c!;kRy>IB!@lmEYtKtX_1r?eEFWzr6jlEQRF zgMsbW+r9Zt2Wys)FBV#DMP1JqBrHzhnpF^YR`V`C2>wi4KeXGaR&J zb^R|&u$Cw5(V3FE$Hl_YPX^xNeq(u2g01Z|!r#p;tmwx^Fg-uCd5%*Fb(Wf2JRJ*e zv^d1`Ed4!jzboIWv-}JQ`Gc!Y?D8#5T)L#BzD4G<3Q|z7yS;xu6wbvRsij?N?VR$4 zgoh?53BZ*WURlI;cb8^s>6t6~f_KdJuQoOJW!d@R-ZY8gk#fn?pO6zUND;66WA?~R zfPCRRWqUGI`A4En%H77GVChHtFcSE0S;VfaHF$`gC4$8sAL;jKNKP@|?|L;-x@4G! 
z+3a?*`TEyw2jFwe13ByeIR78{8~6>JDwJ!$P)mn>8xdnd8Un(GAQUPYnF|*=+7V=h zkWiq(cGyy(mYKus8IGF$*kX#K<4iAHM$Q=>0_Jpt*MAf{e0b&~e~?O5iG8r#%c}(f zYhUPe#Hb_`0;=dEeDp8KJ@#4-lAsbt9(y;X-r~2HCR@%~k|ink#qWFF#GBxwWjElj z^;m3ks`E;6k8SFBDgbrxi6#X3#uKB`gUBhQT*qraa25KrcCXI#yza%Xrwqp+LpX(xew4}cB}Jcl->1iLI{@@#Qg^?KJPHKr3XVGJ zSB=pDb75coee~k5bmPa)h##C!Rtkiu-rZSt5XqUZ(*Zu2 zp{UDV*HQN;|39Tl6-qwsMNWU*r`C1Re)jA70bnACF`8aV97^V+f=9E%habZV;#rbm==Bsr5or7TLW~quH4yo*eE9{`c7>dj&%|(*YxIP46J$8mLj-LIf>lC9*?qTp^c#qtMz;ch9n;YAd zHrt;AK+gIP2LKr^kK$p(D1InLsgc5%k;jEg?M={*HA%KCkr4{DBaf1p97g1W{9z_A zBwKJ)pYd_1U)VNrgPH-+0EN&C{@xZ{ILi!yu+@2ht0Y^l z(Bf_b6TQW7E_UFsdata?pCgx}WT`+@Z5f=Ai7vjme63H^?iq*D7I(LaZ9;n(>t|Qi z@WF`h*oP+mzzj(9>Fg4owiEW)fBzkJ(R+}g=~6DI3VP+Lf}p7b#wlzV2&q}6 zY+@`nllHi{PYTtgx9J_iz(p62VaNl0zp;&HMeLf_+l-ujNZ|nohY<-HvPcP32v4#g zW+)LwQq2BdD}>l#hQc-Wvj=7tqC{);POMKi)a($AB0(S!wnhdRb#w^v0)CG*E2W|+ zYZ;dj?TA|v$bh2E3hI~NT)vqe@Nj3eL-9o8Hs^e>Y#SzS5pc|!=R-bh$IEdG-~m5> zI_nG|XXPA^q%A;7fVh`dk5Ts|vKc7?h5#Y^*>_?XdNDB&W3@>W2}?a-L}^fo`*wH5N$EdK5kVB`n{s zW0ZU<`_Z1?gQy1o^P1`kLB&`jRfG_+Vx>r6jE0XtHmt|7AiehE2}a2F-VuVd42akh zfb`tBwj*i6lh(b}{m=8$I`cyn;_Yf3K)lercvxFDYs)AhNRcX}Js-QB$mONTeqX^u zb~`QW*wzjpZq;E>^v+XK)L=(qBvmOQxz|$YvLF=G;p6TuBL3fOJ>&Uf-5XQX+i zidb)ptEbrr_xe6fNNI>^WKY$B5TW#aLtrJvMAS%Q&H@tXbxkA{U|5fOA+E4OeX1EU zDY91gfmWpAytIC*?HEEn^~k+b`_Z*(PU4(dF=PpG55%bND-Ck|2S$|u_mLXqB4== z=dCDX<^~0rkeO-Vd@3i_$b};EQA?$Hm#%P`FA=%@agmn#MR0LP)~cmc`Jz)fD!N~N zTi!$i-m3sMmgwAV>6Lz(?#uu^$EJMw7!4*l}BKbHGAGTI%QISL>DC zFZUi7lj9#|(^cx!>bHV`rDV zY>_7P$?ryV8;C|%8!kD(&GpbnzNy0q9OrZ_aDXIwL?l_>pb|4@kJ-Q%y6{?;a@o6# zp+eowC`hN1sji}81_MPAgjAhFOeA(zP<%j$yU%X!E>~eOqq)I$xn#sky;M<|RLx+R zIB^4zbit^}38)SjG#X~jnruRUBWU~%ng*Cv^Dpz4ryJb_fSla@s`|T9%4N2ED!Clo ze69d;@%Jo$&&SUI@@&rl@?o0Hv;tZst14MS2Dmi%h}+O11ZF0NwbUALn#98aXzxhJ zF)Sr`e<#@``ntkTCq>@AUtD;ZXWB@exw%mecxX_wPh}Zi49I!$f)PQnn95!i_YZ?a zlme}*JRax%UKMn(Jtv^|ki5!Nii=DFV1T0+;P2b{N_uMa4G?-u|Pz`+U}o4>rAtnMqIL=8k5)0y8@97{6~2lry72+I<$zfvD- zs*jaZq;h@yvCUt7RgRLp6y_;9X=3{UqqEjpp`lceNXK;aJG=9!Jvi85yB`3#d&(u3F^57f)nhE{qSJEPz#|4H zhL7k+KuFz(kCwsp8S1=}<*U2T%4gRQzPkyCL3{@u#!y8Jz#19@6Gy7gQ?)aN1dZ_@*K2q$^EhdUgY^iPM`>D%2Y=qng%&^au8SsI+r^`naWb+`<>)O48NPrHiQq>$0 zhgov~A~LeqiSRE_?JNK4R@I!YQkTi#)eL4zqPr`lL`fKPQy)V}N29s0D4sDX89A7Y zq;ret6)!G`E*FhRDFysRR4U*jd@HEO6n_7`o?K2rC@8AyQqU)K-dy|jU9mEkQm(S) z<2tR!NW(hebhs%um$mQPi~qpof8YYZ3D5Ry&j9ja*|PXCXPrnbK7bM5g4pzV>`Py1%PF!wk5( zLe2e007!~`07*-0F={g)XHx*k<)zZ`sB(7~AYxvM?>cs26sM0U&SM(`3T@mryabLK z7}8KpO|lmGNDip_CNmT>hCh+6(+RT#j;>gvmP4Y(b52t-0OpQdiZiz36Yo)->c4x? zV0JuV5&R{*vsW*sSpxS#B_J3RYF-;tu3XBRpPq(h;{%F<2lVN#ok@!4oa$W z=InIG01}g|w7L#+QqySl+=N7`&+||fO;Q5HQb6~f^Z)}1b?`uoFv?P5m8`+PZ2*!I z4=L#|O*ok00#NmwC95+~)&qcBJk6f%ClFo(aOeeISY3SSFHNSja<$ByQhPNdhadU{ zs}&SD-4eHjjFl@gv$7TCEFG$ENm5Q-deOK)Q<$>Htwh=d-&IOpuk zin|xOJLLOegMAWu(^xRV06O7x7);gk#&6a)%bWGm+U$#&bLkz6K(t6*skWxoQc8s& zix-q!bdG>5$56Rj?*c@;`Vy+`h1c3vczwOTKGn`gH?{Hb36D*0<;m_%`yzBh2Puo4 zahEzZh9x$ck$69BJU#|NN#1b*;bRq?=avkV(7rA{5{`PR+d2BW8&(M~zNxSvOx0Yi zsK1oL@Len_aGj$d=xvb4{J&A3qGa2zP0e+cx-M{W;b=>cYIB0SqT=ReiOK8U2x?n= zh=M3Res8Tqh>mOV;!#C2+dOR$f@BJVShy9Ib4d%AvTXH1*|VvahVEgoM{MHQMuh1+ zBL*d9&sxhG`ztN_G%Men{B$Z-w(02gwu_UUF0taU*fQ5zs*4IKSI+?QY|jAl;aLDc zNy$#7_{p6MfV14OhL^CqBa!HX+Mp_$joBemN~URYn#2^Pf<`! 
zi_T85m&Evdv6&Mv^@xe8v4IQq^ytSQC-o{KAFbn+*_f0c@bORsVpZ++(=BR6RPPtC zu_<7q!UqR@)IFx@9Q}A_1VmClWhez)SJ^z9>SNW%w|I>BK3f1tK<(Wfdb(lj+D3Yh z%2<{6r!l%aQxd}@j5-Yi&e4x5?FI+3$GE8dU@{4Oh=4Ew+i{Z4Q4qyx5d~1bEcSrV&b?vnu%Sjt}obF1UWm4*3#q#F#Cf6A#>({5RPro?*;^t4wpMC>^?|=^C z-8McJ1TkZ_@CIWy2m-_Gtzmy6$anp2c*nGMb=X7c$kB%=dR@w~JbSdg6@+02ZXKXf zvV?ckf5_zwU%UVr?B`iEx{NVWK};f1|QcY z2r*qo&^ZbsAF&m&r_sGL3NpSQiKlMuxcs9EvOkDavQo<7#M2>_bjX$ubRKF7-&7-Y zhtHPJ0zh8l*`Dn~1CVEXwU%;s%GpmqK1tlbG7|97h8L|(t)sz* z;o<@`6B~et(?q3q?O52eT}emmN1a0{SjNTo$mx%W$K|D_fRKxe*rR&&IS?QTx>ErK z%YukB1f2LZDNmC#C1XIyvJ{ubAVgu9gg%`*o{RCg7Mf?+G?@*6XcwsPb=@GxPbYW< z3NIx&AOcf(Raq}%6$y0p=iO>TW_K|qs=d>1wWxYZ@eX}TNT*X~pZ!r~J-T8ea?)pi zh@;-|Z(8bAhsLQ1qvMq|S*kuY4@=2L%|!@@0|G#FI0UdZ9S&BNLiKp!h<@z#*d(E~ zhK~EFDr%2>gO`ZF4SM2>=6{9AZp=u1q9ML{lN~&W2zh>6kM9+P2VU;Bbma|eRZ8#{9 zF7b4GwiQZq@=x=ptSvz8mA&Et2j;p2am7lRP)Z@eWUwU4AS9k@_MUh75zLUM%(j4K zzg&KK`|sp`=hr*%VAqpP5Ed*Y{75^C=ijxK<+~?Ky&FD;ACaa#cx1J;_VW5gOS$&z z6Lxlmt0@KrCZm)_<*~=HxjashHPelnWZXNG9;vckGWub z7C0Rx7q4|PQW0)+b1EuIdK3i_A&IdY<&lCAes6sMp)#{WSO=(0nwEvjQfs1K)FFht zDpz*&qX&c>%hBp_6v_9;>_E(<9{sTXIM%zah0@~7T>aIj`t;-beE;TJ*LNm6tYtkc zX*r~vE|2MWS0QAsQMJS22Zu`lM?BlJeNX`MJdAMpNB$9=0G-@*s0Bj)AJQj(W z)Z~sLA@#JADiRQSS0hZ5stjg=WIe3~BGungRvi0zK)KK#!}R(!)|_4Kf^>t9o^B#* zh3Ekvh;*8B7o^@LiiZQ#-W`^w6yCC#@K9^wzN;4X|g5f1|zp}$F~S0oV8k2Yw~Z1~=Ow*v?^Z-$O&LYt>zoaYeLimsdHIk5^U z8Fm*|2zwuZNMK5e`>?7}176LXUsK?Mm5*B;tt$G9lk)#H&puDOx2P}D-MCG@j+&H2GVGr+kkFkOUn+ejj zUoSdJlP0R@;KfIs$Fv-lsO=cTQ#e{uWo zt^eNZh#&wUeh=%>2)TUQJ6v9>ynU|YA~}rB9z)2mQHZA=-t<*szW{Z>Qn#p2RDB6CTc$- z0utxVY*x`+e-4wK>|(uJuRc5gdA4T&c{aIE3z;_nan~`tgiY=ZL`qQ}Zz#2y6-eQm z(2J_cQrdt$Hj~-I8@>R>Wbp8*9X|}`TUY4%nk%55ss(EjTcpA{&k?DJRIE7Xy{5uA zO;8n6VpaH&>SPWGcX!y@|31`oKxV)^Ya8y?QlX;OL{aD-k8jgttfqajdqiK3N0ez2 zQxeQnqVA+O!|^`d+VNm~=(Hu2LVWyj1CJ=Gj?PZ~d&+7yz+$uub|+LV`~bEt{Y^@} z8qtpq88+JOcBQ5k4KbrVn$tovpuvH$s`}Jr9X`6=Uw}ToX#nUz7r&cNDJdndGK3nG z^1N?Rt_?U!zpG2irI2QB%Q8CT*nlK90Q7163;=aRh4t40B~blFi5?qw>LJ|^7#+1F z(5i*6a&#j{giHOt>~$fA>eXvUR0`-$HtYv?vJ4V0*bFg5Gh11_K|~NK=BU^>bkgUr_wZ^5s45ft>TpQnvQGzT^!)(irSl z7YUBy_g08rmWYzHZdc1~4hZ>X`R4S?<1g?3bp6xUzw}?;0U&-)yV1=OU9|!ZN-63y zN0fbAXHlGBwaa z&Jg093?}wht4!$8kbNqffDWj?#bT2E=ga5!AI~4pFUkwM%2&M0cYcvCEFxuxa`4$^ zCbe295Qv|NAQTm@W#u}rw&cZ5<&<8QmxsldtB>lmr;q*PSG8l@ZMHnu+h$8Pr<^XX z({-Xa|M}_9r~d)|2hUyop6wYxK0GV;fO3y`+)KD8t$II7o4Zr&Ov&=)Wr_mrNW(%rPib8?Q}1eY09g99HgZ-e1sULog5Ymp>Qx62 zq&{fas_#DwS}68o&ITTtS%@H;A9{d?)Zyc|7oZOiY0zry)*s*aa9^qv2ya_y#kX8Z zA?IRaMu@~P3?dzomXShc)?P=Xd;9+&xkI~Qgt14D!i~pcDFtK_Y3>enc=ePdckVh! 
z8>Iw^>>o$7)_yuA^9CVMDL{aH-Q#qE{`G(TdU9^i5>D9Fl?iw=j;&{L=2J#~emktPC30K$Xk zh80unN65tCBEnP%ihPKEbpJRtN$g$RNZL9Z^#x#D zJZq}~$fpJ$>rF&HBJJw_&+?zWshyeb@%>oTHY`!hzUK91pIM^1SAA}@WT~#aVx;{% zo&^#b;1nO*AIB@$Z2Kz3am+5cD|fjLemwrde{qeSB_`Bih|{hB{TwlL@OciS8B`vAG1`hwf`TzI)yX@JX z0p!E87q|1Ak7mA>!lT51kYk6Dlf+{w;WhQ#b*w9tl2FB}RMlrNs9#`VgN@Pd2ryMy zN`!lWNTk?Er683;P=&Oci@|DzX5ImWA`P)0&)qcc%YHwv3Xi<5 z4MN)I4OY)qrx0-C77)^5gmL>weG2fY@@b;C^d7N2w|cFHWx!Rv12$Uqf^6w)hXdk% zKF_Jlb38|zGXY?#K58divV){zX6)%ZCW)n}jCo*x(*_QPwMYjCd|3F-4eYAN>eIy^ zn>Gyy38l0j9>%u-atDlv{EF1Y|RnOQzCL^sp^0X z2jnTC+E>Jr?b-hM>wiA~7wo@Kl9E7_#FS;qM%y3+K%O*}RskgRmbK?s8XS-bMFo@3 z$}CDuGMXr;D5C&0Z?zgp$>5+FF>1^cED_xGCMQI}-CD^{e~8XZm|S`qAM>7q8Y=^UHa=$V|Nkgb*MF^=Qj~GhzZ>Qi0{b?ithX_*A)>81@sI>`d7?TnQT&GVl{0S;t==nkxN#+ zZ&Thh=9c)WY%2HV-s3r9cFC{uukP^3Uh&l-8qIbXv&RDnM-52!JfaGHoKU}A&Rw*I zr8Hml7(mYQzQAoIlA}O95md-vNY>`h6fZG3-QgfXu3A?($hu-M%+~ZiFz2T@PUBTy7z4!%eu7licJsQd5Hdp@KuVdj=cz(Uy0`nozpQ`x0{=mr00__aVc0W(JlmIF!hiCUbTTWe z8BYL?B<}z_N`3@T#rvFj0SGwUOD9OTCp^I{7YL92Sxq;k$%;k7UpEmVK z1Og7WdXtn4fXkhlrcLZ$cNJNMJ_8-0o12sY_(Oqn7?Dx{mOQ1@A*6>mAd{Ht@hrJW zlY~N^KW`^xIiuvv9FI{79q+I|QuL!oyJ)O7A|GR@U+2%W;hj&<{eDC!I*^2{d*bP4 z0yv@$A@=7Pb5*)M9tI=3EYb$0=cpa&Sl0%>?i$a@ z2Rx3Ql^*rz@PRCuyIYYGfHQ!wMGQ5H+gVi_+;XWZW+)OoC6smba(^%XH~tMLo^H?f z_o39Gd8;NZlcqIW&LA?$6Rk?j)vrlub@+&j2?oNe43G=OirgFp)mIhH5n~XY-UFIO zkfMjP@nY!BW>Hsak&w0k$xSF;U}nf}pk-wh0m}X3yb2&R#((Sz(8qM~2R~kkkg{_5 zOrPDa%nTGL_<@s9rsdod*@3kWGHk_%h9lAq#YJ%3%kzKe9L6HcY5cWU^4 zP4zkMzdnAw{P60Bhe!?Ub4NVd3^H*uH+betG{h|JfTnvf3H#5b0=R-bk2st4}*e<{3cHdjt@IZ9w79QKTL|KyDua z$zG33JNEXfc^nBz>1lpe+0fWI`+Qk$Y12Y zxcxiw-*IudZY`Wy!NRLbKt^xL+lLs@la20M4Gy;{_Gz1%?mcP}v(yilc6n8!ATMuq zdr?X`$TTHSvuB@AcAD3+PB>x0FJJ%)p6$c1X8?J&|Iz;l<)8c~r!5MS5*L6+cic<3 z19<0dcao~-i=0tSAd$=ve!xfdUcD2eC%P*jgMC76kAx(Q4udI~s{WAQI|;Lo?gKjrlaWbqJ9=JHW6@Dh0Z~N9h-i<6`k#^jl>; zB4Vk}1azm&=sX+B!#mklrBN)d+glYN4frUn&9C|}M=LFQLOu+sqe-3so6KD9unr*Z zPT$W@EQS6zDf%&nNQQM74Zm-(Od!0h*F_MIOw#aO?~Z7EN27%PvxlVf>nXIgY<7nO z;5z_KIcovbZ}a{$MvsVr?lO|uj5E2xc#jNvkKh>g_YE)YVT+U?Gjo8YzW@thn{QpY zSf(kNyP*`L8JIJ@nw3JPgbWM_(OUgl3Rg1~0Qq>N0K<2%XZueBzxsE7wPfXG=Dbdv zvYW5TQZ8Or7_y|*%$DlQc!zfs-ohb=W5Og)uDSYx$rF-KoJ#e@ks-RtNP!08Jjuht zS}+lP8o8CFZ{dD>1<%^0%m&>Icj$6Hf zCrgE-jaeI$P3+$4*N zy_zSRY_vBXaRW&q^)xHS^NrLuisNHP1mtb0AeXlRWcZM~kK0CC)@b#;AFDflKWf56 zcgIwBy~i^P%PNQve59gTvIZXF9xmAZlVIfMr=Q;joJRCxUgtKIp3`WR!gD1{!+R|*6O9hB4scTj@46H=jMAkS%Jm?qo*_Gj}W>XOtAx3``fgeZ~jy@=UK z7Tb*8J6;j+DkcCTN4r)?8b#6%MaeS(03q7|#7+Bf>8f0{SShzWLg}zQ@iwfXtWa za+)}S#?Ze0V*oP=~$ztZA5LJ3L`aL>R6*?a|H*RId5PVV8Ln$sgJ9HxjT|2 z_V1HoL7>$&wPmTU>M!c|`Q?RQTvY&>v-0Hb2hV<3(;6l2sTQgqYOCkNb2qGK3rl#m z55``UD6(A$*_Rq21-wzVc!xXS>Zu__yCf7@^>B z?m$Sg@G1qHuiC6lUl+}q$PsDTWc%@GQMed-!PzP(hW)b9C=!=GH%d zlhhw~^Q?yE0doJmq6CCSq{R#v1^88*=fr6uAGhca?TWDD<9iB|oo8*0+UEvQa1#9E zrd|nQt(YG%^?aJ9Z4OAV=6McC>_;lr`=_UGpjdqpx_~Su4ER zkl|s_=4h!Kz%iBPQ?7gO=bYTUQnEOw0lFiluoM&tD_}PyuO;5N^EzjHeSeQP-+qhN z-+Z&3yZLVK7(Uy7;0GVss&)RDA73V=3wOVibm{V|(6)yk+&%n-(Tv?dKx0WRN_{rA zJH2G)T&iFyG$7Lk{^%+n{+Cl`?l#W0YP27s_iYiVs>*L78ejSd~n zZ_u&a`0eR7e13C$lk4Ebm|L=Zv3zm+&#eE>hp_;}9_UZw^3l3EAaOJ>(0#rS zF!btUo`~O-yW_2HZ$pJ;g<%QFw9vS`POq1(xc(!o036Tu3?R?;!C1Mx%oRNNDrcV? 
zbofQi;nhI|a~uOO33$81&ly0t1CTC^bHv$1iM#Y0s=sTkoKg2Y*M69d)F?MnRy}tB zLMbFjd-%X3;2}Eb=pi*rAjOoNX6Pwbp3I#6ISeT64jS=v-+djz-Ucw00(8J}pj9Y$ z2c@EJ9GmrvT%<%+8M^^{SRrzj|8Y~Vgl>J9V+W2MfHe5P)Thd)0ZjX;^y7G% z8i4eO)tH1HzBeqe@ANfP@Tl-n0zSOLhi-3IfAeO|-+Z&IUwpA1zxt}2a;ERUr|&kw z9o{2+(*Tb?46yDkjh|I`U=t+lY+utdWTh*>qog!dpQ=x)a&mylPzODWh$1q!R2IA%a6B-lg*r(NA*Yu3~B#Pm>OP;A|q@j zKKynS!?BaFaW~>nLvr~55M|F>5%!{`yc0n12M3VH6Y?-%BT~WY6Jl{Wa; zit^{wz@vJmsnt9rCXRuGg4hATKui!!IKxLODptJM>T3W9LVNhwLq}GQ{XQBpJge-S zngyYtbnxxbwJ?FAMq1SHdk8^u9DcukBam!SvAQm~MTZDrQ204nYkWSD26xI2jQ}ce z#J<(zl>0a?x8*i|O1HPTy%TkU;pFsFva}Q{i%qt^u{R5d<@tBovn4#+2W3C~>BLgX z$IF7tQgjFaSyvwba^bEGLP7v{=`lh#kqS4u##cYUz=_Qip8GCWZ`Wx+rI@!=8P)xO z8_{gYdn5t|c=bIr&@~=k0;~4$aZdLsMQSjj@z|+>h_m{AgauMabl<}i_7K8dgk`gS zyulP{p3_`(GP$9dd)TZk!$qh{+n-f0^afY zyf1%+tO@`Lz_Jb?&F=;;(u`r_rv?e3T@m^jK)gpj!uRy6CZKsTIWW09Rpq6y^low0 zt26*<(GLmNE!x2wGrKt)?4}glt}Ac9^B?PH)9F9(XQ|FPxVZGoLsB}(brA~q5b9n! z1cX!_1kJnI5E3*5hr6r6N9?yBVH-PItpDPf(uA2a0zS5!c#Gqyu7p(geTT^gfEb7~ z%!i$47&_!I$Y|t_t0ht0`bDe{09pDZjDkNhfDm{WeDtj{^Y-~hN2)+Hm!pfTuj;c& zHf`{++HJL^>}@(ja`@5>leo_?qY7qXa3m@9vMOWIQ{rC1`yV+O(`ND z0U#TESpQ{b?LYrH`XL`*LxYd-h6EvfXM!MU!cB8kd25)k8BMv z8&Qz)rPv6~4jU1&T~q{*jqg}{JL*{69Z`y&&>)PRhmd3x!N%ba`4 zRfmtBf;G8o&Y6dYlq&1BAOkw~d$B_a2SiluQ;s?iKpjHd-A10i=ggj~FZ2R&s~U9$ z)&YmZKS~SY9t2|VKIVoH#VK#|8c^*FHz{ePU-i$m!HBen9FTk~+OpqkXgB9XDH+Hn zJ724C6y#UMesqqag$AkY3YYQl*zg_&`D3MC?JWSr=6S#ecToW(GZ;C<_dQ+*5Mr9b zhb*K%!CZCEr__ENkBG>JBJC=4bCdnsZ`b_w*Gu{3FW1wb|M_~iuJX|V>j~FStY!gJ z)6}9M4Y<+(k+*(dGhV3ACOjgaM8I0#T6ZBNSQn31pXnVYeka`pi0d%|G27p9O&ibu?dvlNb z8yFTd+v=^-oR*Z*sVaQ__4Ml#4#>}GSkD0RY#)&QZ~ix*=(Eq3%hQQ3-6Qhhhny9@ zMMSD3ZRe_|WO6@8D0;Q=Ls+2i`Jgk1?EZ`m06`y22sQAaSH<3Y!8iwmkg<0FVv~36 zYfFal3?UNqcBBQ!8F1Gw!s50|!5YtFRl78=N2y-Tv}^f`At`{001Ugx$4+1vpb_b? zP>&q+`@Gdc)~i&Cd|8mC?*~5g^pAXJ1Y}&4j|u4Ar~ME@GZ4Ug=YXUX(1Gw(P@i0` zCc-SmdE&j+;N$*2pkobH5NWj^HT9|j$h!XG7pJ=~zx3liiWU9|-y6HQu=#LEQ;&R1 zO%LfT_8x!q35r3EvCbmt79+5p23q+X1 zF#x30<#J_jtB>MyhwG_0LGU-7;FF<)?|3jif}Gv$y$JsJZ!iD$H$UQ!uDm!Sev>_8 z&1=AipK_0YR0VWI+Evti1bkHRSbw$r>i*})pI`rZ`Sa^v=~wr7k9}gFT>N1A!6DMG zBKon_d{~QmOlgWeV!xa9foeuF^|>8JZ~%$9JXKjAM98fs!aIO?ehV%jY!cN0;ce}0 z3>h4CCIdhQ5P=A@Du#_L8_=%vKoX);)sA0FE~f1%Be*>N^5@{ z`{+PcSGmsDxhBP}wF$uW%JrV)!o+}(Ed{GJ$e98_R+pb{FK0a%e>(~?AWGYI*Gaa62g9p+ zhA+iLZelY7$+!L}G=rIqVSWQxz+LpCckO~Ni`MG(hZ{I3zVBmMZJN-F&Krb;M)bo> ztQLFz)#ujcNnx5GGinrWH}KfMeoWH@;*sUtS}O-|4EU&}j3uHUweUQmANeo;a#?=< z^YwOFv~RzwC*)v(WCf5kUtQTe +jG5o$Bgd@6j{<(FNHmS;c2e>Zt9{ni0G%QbP za?OlVau&N5#)KgeOf;q*1u*~?UXYbi8i4fF zs}cF=#pFBK=tC))nymW}l6q8(eIE;9y)eJQ2sh8SQm_PeNO}l=;6eM}THgsmDBbIR zi-b(rrN?=LsuA^QvsW5~#J)y8%Gal_*UwI$-M=ocmup<#fjH8*7;Uj+Tg}I0(^3KO z6aZ3zaQVW%IDKGK(Vy)ZK%VU%{0D!MfAI^CLi87<_?&Y~!*cg39i$CJ=Bi6|7iLH4O$!mg9mfQ~p;DI*eM9jYLphXFK+>tC0n^cFbbX#ips zKzI=lQWb^vRTe$~B$?6bo()3wkB?#MQ~MoA8L$Fu5RyqNDcR_J13K)!!^mCz{JxzgY#p%vp?~N{*T383;AeaBU;9^h^3z3Do|rjZ;Nn1Mi-LFw zZ4u$>=UmDK2uN!1fl_@amm)8#)|`=ZK`sl{HL4Zd)WfTW5$TvXZDfyan|sHpf_*GZ>+3f6avg0O2CpRnEy9s^$93m?No z;SP5>h9Ji9z&gWYsT0{xQPc6&^40O@_dma>#pU;rW)=Fg_0PVI0eU|#KmRt;xbR!- zQ~UJtrM;Z4tVTcVFrdTgV(hD-iXiAZ7oHnQtCp&tdJ=8y6M)bLAGu^!k5bCGNFQHn zoAUxgp97=we!sX$1VwW5aBM!-C;+b-FoJ5$4;*RE@W6aV(R9vI;CGIONDV@Y3Mj(i z=v~%SR02RY=%`-3>s6@{kaAtu7x|0(8rfQ0?l2q!KDNk5vQu~_o?>6M_~dJyoPPT9 z@B4DavpoaIvwcveAN}ZJT33JN?g1ZjDG%*<-EojKS6}x&JxUMS)qXHM!tbHup-$W& zol}%bOSua0u#}J_&F5|@3@9-rw4cp_&|8AYyVq-hZHP;p?zb-5hWN(08JVN^Qd0lhBI#3 zijn^i(T^us?}mlcAgaOKL(~c#=9)(2?^NXVl-Wo~mBqfcsIakiA&>@{prTU_X zeB9lYQS(u11VrC{8%5_!{_3k`t*M2_8vVHGkNMP`^)qytaXK7q4qaS?rfL?ls?<}^ z20)In*40LYj1EwJ*kCA2Y3)&vkdxDheu%X9evc1_h&0KnU#so|Kq3k<&lZu8+m|ox 
zwjSSu&-4-cO`t8|}B07&@g(X?_c4tK#t7$(f;c?t>19)QGP+!!9#gQE{~xS{6Hc!6pv z$nKB}?YJtAyr<>J`|5KUyceK*Ru>jb89HL`f=YiO zgmXzDINF{fAdJXJlVD>Xi|8OC?4gi00w?dW4&b`=+Z@nQ4MjZ#%ankzj`_KGm ze!Ql%d{l~h?Z<$RO$T?(BMLHt;-orUm>%YV*oX{l0yfVoRl(g<3O(nffR9klrcgl0 z(CK9Ez5{_CDBgp}<9D;)zKtWz%d3zxl*N_RUTeK0pRXUts|OLy@!^?QKi$#^-yrWcKB#5cLP2`fAJSfEj(Xt zE3lM3D8Bn|7vFJUNQu+sB`+>7(_x-dz{nIn;G?HuG1P2;IMh77LvZO6`-^0;U*p`{ z9kER0RK^SA5qw~tHK!y?Y1=Qo<|7pFad&x%0FaxjE8ecFz5a*);lnMS?RPc(egE;C zj^%hE*L3gqHu*GB+-g9)coeD6#miI``{deI$;#eSoK)n#YAr)2EK&5G1&6AUR4+wO z)zfG_sm9E(?QA<@V|svmJMGFx4T6N~=_<-a)VVv>V32Bf3qZU9$VsQ)sSPiyy&Ei^ z1|E;$!{EvQF{JO{$_pc zuTOWn%l$D#GNK+wOV+&EGNtLb6{1h+WV0VH`%<7@h_1iB!gKMdZGI(T4_`1{+<&pcu@;G+)2k=tpIX-@dCFWe{2;(c)cw62EgwDAkX&W z|Ms8h#%~Vhc5(Eh<&u*X+oB#l0y0&| z-gP7Ysu=c|F}Pd9IP`rq`h7ZpyyVOImA#rH;xXH7o3CqNU?>dTii$?s)way$@HECg z^@4PRkX+95FS^nI(Cem}wEZLRKhPq<8%v^WUJXVh6@uy?>i|ms*_M88s8Cb~mpzDt zZGK<=)(<6Yo@mySg)L2*lJzzqGv_pb#Au_hBh;4m6YMKklpDWUqe%Ttd9!}Des))* z9mjf{E1;CurNKu*adabjGP7eSSvsZEWM-#%o{zmd*70Zj+3^`bo&n_9K0x~q{s#}p z*@Z?I%_|;1^$uH2r&U-7h-A6@C~8L&x+i7!_<6P4hei#^Cb?Uv z1IYPn6o5T=bmL~b)V19nlzS`>a1%V59ok*0P)kQ)MHin*rFa0{pj#fyw-d~x2vicIGERfkD7LM z_suuH^a$Gf0p#5PH_$K&&<|BL3NrPH@2ScxAf$U%zi+evclFnT)Pd0@ZgNLlA}n!NlPz&)PrceelI z|NMWU|HRk-iKzFOjW(V9WXF6oI%m1&h=6nmNp*i&y%iL%doS8nS$)lG>Cq2AZ4(dz zNL-R*^enZlx*%anMhgfrg4SkiTlX~5&K->$>MtmVK~Djy zz5&RpHB|Z_yWefn2f?RUgNjG%DbK+!{Ph9I1ujxey_zrWayoFU`)I}IvDhOYW;7T? z$vZDFanyWFMvHyO!opKbl1wgm?(k8HJiHh8I1#IZLoeoWbznHRQgeR@U1>WQ4b5Vo z-8Yr4uK{Q2zmU9(=toMMPN8X62_+KSQLTNI@FF(2drD3#)u;j4D_45J;bX%8W9PjWU5b@&LCoFnRysscW2s}0#b zZWsrI>?c@1Ds1mH6V{(EdCt~rOsF+O4LrE{bro`^1ofgk?bEvmm}tcK(gt(@I|yk8 z_W@WO>15tSjYv)yn;Y;lw%CJ5)vo<7LK=~fmV!m^5rgvGq95NYeDn`$a)+CpheRSw z&DySe_dR@^+XQtFKb=(W?*lpzQICLxkQ1Km_p%rNfxm=S zT{wJNw4_|}w0QFn0zy*Nv}&E!vL3SP5w?FzQQ4vkiMeYV#kQ)wHCf*21|kVcAg0X&8-#2yf<6h|#gdCkX;a(# z-aX_xuQ-=CT+N=>)t9`++M;KssobM|Z1U^*GPi z%+bs=-T__~mUcm|s_?Ls#41nep-z0%h4fBXy%3!)X~2ij%{}1396*+ssvs3L50%o| zqLzZ{K{HYs^(Itqbs(b^FLLR{E7;C)&Ao;M6bSEKkjPQcDXa2cKv+76h@{K?;X<<+ z&SpmRLjAoFph!4iG*fXGNjbnm8c6L5xq*jHS>B7j!BXJgbK$w9r__c#KnCD&2TmvD zQNTTFKH8pBN*kOdGoGVM07vwwU#tg+YSYb)a5zx&Ts$VDzj%>YVIWP@L$UX=rO$uv{giOOss+x`-$U#lwE~MrmB_C!O-OUoRPJ=}D_HF$_gT zNy!e=p-h_7inUC%vLcx=6>}RSfx?y1$%6n5Bt(!^P9tbmbsjo1e(OjW^C2F0pbGEq z`c9CbzeeZ?BmQ7^#j9!JWC@93273%D1KMJ99^Pkld3L&d6mIv3RkEUzXry}7*Gum} z*5RYUM)jUcj=gudXwfqtp$%72SclCj2*Vz#fzI_k=&SQB;1~lb)$V~&dxAoUZGm28 zL{h>Q{o!G&@t8ER$`TR?LX->A20e6lc>&-PCKi^Z16mJ(qnX|Vg2WzrVnHTzImf`v z@RIhmDyX1nAUK|HEUA4Or_NW&;mfhmm>fE;6qQL69MXbACZud)>XMfBgfa>Ygd#K zJf#6p91$j{T2Ec`QoOXzXsC+l2XfA3O$n?JVu!=_b=&APwXRT`1EL%%1p>hD;qph} z8hBW$qM31*TgV0%09n+&UhiOAs>D(nlve8?)*R4?m9$XoH;r%Yh=TN#wlS%{L(m2v zYG4)~;j7>&sgSl-ooexY#jwBZRi@RRB#uJ#KNyJcj|(0>^~z?&C1E_qA6LhezJoU>QoA{*Y~8%tTXv&D1Iyy|OL zx-sUw4Dfi=g$f;h1&nfbp3|$wu*`xYr#IFTV6p)pJfy>Xp^J3zt^Zfr`d;{`e|gBF zb-ItpN@gwqB4T2TmZhMS{<-;xFo75`^W6l^@o*TQZh=tD4QB{(SNOeM>slY_BYeR2 zY|jAlY%gA1yt%(W{a~7GIUM|$bDOr@Ktxv!MC@KFuK?KNRIF|tL|95;^ zRGFDl_%?MMXUK83&ETvR>%5$}WuP$Tje>c!{n(@3&_k7+`O4oL z(hY9b3DYs4#gNQkB_U;F9}MISgN+FDxgfyV@gHH{j?Llp(fhk6#q^2U*js91!xZNe6Cn@|r^QwyDpLxJn_JGzSr8Kv zB$YnxzktvVG7$Sw>>m3)lvEpNO}XT9l)50Gc=e74b9~75Y|jAlY(M|`Qv9kEyGlv-?!0oB?JxYsc0@#^aREkF-$Nh3uEA9E(8QEJqjYh^f2`t0?eJBSAuAV9Y842dxtBpO4B4O zMI2MhTk&}Vt*Sn4J;Zm-pU__g5_yY)pqPQ~sy<9Kdld$hxuv!NRYTvpGXV;_6uJR%{&x~lG3{ip?!a&D0^vxh2Bl4u^A zh3|tMdzUS3`|8(uMgWHn2l?eCW1myv3gCD;`SI>*EW1F;AE+H3(UE?P>|@03yjD z7G%2IMm>ki{d`d;H9_2tM@`v?h5%(ew$&BSS5V3yq_%_jqnN`rKqDdXfP;mMHVUm_ z*!Jqcrao9NpaDjcxuw)*k4_XPg2}vkGN~wa_%nC!0Rtmk%A?DMmy*i{AHG(*Bl?QXw7T&CqMNf?xL(#yCX=fHku72Ok451(QIPdgKf!#jMWRB 
z*)zuZYEkbpZ}6c@i|9v0;trUn8HY4sVnRw)s5U9AM?f5&ii*eWCL8lkAd1o^6XgJO zPH7e%7r&!FCiiJsE?4pd2)(|kXsv6)ghM$vK6rb!X8?J&{OP9`Z*o3p zepeOgSjSRS<)>2Y=&oshG1&LRRK++iA=Ay^+HOI5kda~^$FM?2Ng&Ry&+Vl5@njA0nJ_xT@NElbWJ6aadh zdiAK=k|gtoc_VbvVxLp8sT6@E#*n`@Wt^50Q-_t`yYAy@saIp~{VtnVhmQ^@?M%Q% zb)GY)oYh;_!cuZ0?ff%-nDXvk3PU}DME{QErL6Y&bM41N?}nJ+NAY=wj|pIsFojBy zbx_--n6(-Ycjt(}mC|4(KsRTxbwt~ufL;nh`?S3ASWceLjnpe^KjSleSc`tZL(??U zuCxL?0k)8~{NfkW>A6Ma_iJ?w{{6bHmnD^pJmrO%MWlQzOs*EDga(@|wp zA6b!=edBEXfeXz4h&Ey3%-Ke9e~QHv#}HcNfFcJ-0-({7!RD%I`(9$2CZ;$}aG2QZ z7hEh>00JgJF0=ylF_yCUNVz)YV_m29KHo1l>-GKX{O0aWRdkA4Rd2;oT<&nO4DzN!|$i1iv;)Qe;<>8 zoaydVi?GO2+Cz|2k`;GzXY3RD%~Fqmj8s{Hg3H}nv}S=Qo>Q*f#a`23qt1i`A6gAt z@4m%S<)u*Q_CEL+!{#KoWCT#$1*;~(&14O##=t+OX^L~$q9CQffA=&j;c2!{z1kO< zhyH5udhFK~p0k#mkwzWEh)$8{Hl_m8dzz91d{Rya3GTf16U>K=fX(jq_Zi#c^hyC8V$M=e!-6}pu9CYn0K}{T@je2w|C&-lsv7Wdex3F7 z#T=TZ5bHjQ&$ZV5O8NpE@9}KkcQN|ikOP{Kc*Pgf1rGC} z{ySsVq$DQu)wkpyzvbGMClc=lX+y+>Vhb|!6l1}MGkDYxI7zCJsbpH>D5%x!!A)5FV1VS zsm-QgpZNhZiiAOe;AoMzj9du%nAvME8^6Ay_NlNnVJIA(r*~114i~%ptW3dr)sjo+ zG%Sst!zVvE68ukY|3vuE?b)6ILbn+jgjgnp z1~)D@xdSORATT4?Th9Owk>^rI$_@2|w*mX-XVtq}^g~un?kLqEB||9=Gpll@B#vVA zc~+!hEx%*xRrgP*{phAwTX$`|fcEEQSF}tU@TEpM_{c^&6`7ZF$VHC+4Jeh%Ei7AOA?r#qhWID@& z?R)VFKe>ES0rirX>B26p;m09vQ)0pxj2R-x9_dj*857%6e|3*W2*%zo!(`k1n-yyT z;3VDQUQh%_5wzK;10hFseV0QaI#p17#i|!|UHi3T#@`P1Y|jAlY!Yh}1fQ-eYyAgn z6eQIsi0(_VyPS2IlAlrv5b;Zi)8jw512o8l$JjsJ9>Y%a(%3&7s~__LIslEc zjT}f&?S@z)WRl#A*gwSGiIn6IgW(t^0KJ>nh=PQt@1ONu(T@Qi%?xI8cPoHK8{yss z$*P2uh+c%A>ORfeUr3~7QU3nnf=^fDa zou`%Q?e9NL0?lX*JeYtn$rzKQMu4yT7C7`czxe5E=P404mz~1 zSTGaB5pH!jGqNpMd_~rTNgt(;rofLEsaBP+DqZow0}>}tl!q1fT(Y7Lp?=I5f_g+d!Kql1 z335>-e5bsLVEa%xVlSuwARI6mDfZutl(}KjWamQuVXe5}Qs6E6LGD0k%(c+&>wGVKU^9F4F+>9zb4erz zfHX~$Fio_aky3J+MHIxRWkJTyz9;zDPduljvDYrbp(g^&ustAla04G+3*+`ZE4&LV z_mk!MuCR#SJ)?JgxCqdCAZ7r&Kt#W)ellbDWYtJaDYfJNJI_i{l3?yK(s!+QJ%FQn zZ7QY79SR!_5keyf7a|R0a>;yGucZCg)m~#lPrEA96wtvk&-9B6=Fs7woKm^}i@%s| z@oaxxmKR%2+46FcE>=!lN-if%IHr=0Ui=8+-puZtew6fb|GVa?ouUNk&hJ)(-FNs< ztYSqoW;0mnV8SYMcPl*=>(VaxDg}OgWUo@7#LHBdUDzcP5)1}-?|fZcZOq*gQ4+tf zixlsK&zwVc$OrpZ^1t${4vM#ru$>~cP^!iUWpfFvfXV%?9Z4y&okdx8D<6rDcusR@&r z!<_QbxIVm%ct($+q;ZZKe{#=zC%KH;5BPw_33S3_UZp9oxVXA-{B2{;_6#7;rc(6H z;lNMUHM(F$Er{Rs;e>ak=tZ9kQulr#VFsvo2Pvq&?}yc9wEUU`c;0VM;u`bg^-~=a5Fd3-cO0`J2K6iV!{p#YL;0EmUR*vklE%&qWHZ=zdrsnXVXH})QW z%r#$p$;WbBzA9hcE;^Opo^pk4Ydz%(1|+r+siQ!I3fnml6pMfcBn9q*aKS_Rk^S&R zh0gg9aK&kwvDL)b#L(1xn>?&)a(E7pFm>%xa9!?*Jv2sRo&nF{1baxNaYvXzFp6_OtmbA$cU6aN-&+5=QW`_@JcW-!^l4Hm z1^#`fUcJwBCJ}!>d_4I*UxDN!(BX)V?V=y=iiQz;+s`KH$6*ZXQIBMXfVMd$o2s9x zug48O8lW(hqLFT8fX1=8dx)W)b|rUE0v^Pw$I*`c_lMrk=_zS%k`@w+8BQtjWXAAI z_3Jd@2;lDa7T^BKpZqxE12UnJ+gCuks7h7KfBC=s|7Y*N`Xot`>|XF3Q8hDn50a#K zbyxGA!4P0BIQBANv5(6MKE^)p+mgp6pJ88@legu&16+QGV0svOW||jI9>j^6sfw(s znW~3ZRAgmVRd=I-3Gv@4F|+jWjP#7kpZ#6rh&VkvYus+<~juU)1@A{~* zhhj%%FHSZV?ZjYm)%E$g_GD0@6NHR1YI?*amu<_|*TsRsWXJm%5jp1&veoBMLw zoVu)&=GZOWN;=nx=k$PsBMoIYh3Xhaw~&SIxYL%c6ytFn{cGQDzJ2$-;k}D5eZyzJ z{7G1~)vB7y4{D(-zitGjOqrrk*r4%QrLCxr=yhUA5GgBpWRki4JPkF2S=5wGO=GMZ zJZ#?n5j2)+#}G1nOv86$KP31V&*S;-5|@Jk_PnY`3D*5YObLWJisYlxUWF{CP%4c1 znplJ-MXMj+EV%UfXQ^T1JiKe@oiIOAAZ3(lw1Gye+sIX^mLDZBJxZu5_7m8yD&!Q9 zOex8dN*Au8f^%7rSwVzJN=&jWXHLZ=pQB+DRaCKn-2%uhfZWDkudr8NJ$f(;o3R$MQagm%((f{kGgWB_-|I&|Kxw_FaF6tX_}w>q}%-N z@A~oo`TuP2;fIJo_uk3LA}$s!+tzQKYt8${&d)u_7WeLXcjr#iblpO>+s3{BzAy8g zX{sJ(#9`!fQYjM4o1c#ZlJ>c2#vL$7+xFzLm;d5_v@_K~zPGw`mzO*J?c@Ic^qary zf46&r-xcWlg7K-t5Mt;Z-82zPRWpR_L}X{tI)goLmcolGhcFB}glBvfd-fp%NQce~ zgy(wRx6-;-_{#BVbH}~X-j_S=UAf~=<+MGOqvl8!ZYiB>rC~ew^Ft^muoNK%2`*SK 
z{e&k4JhEVBj211U&+qu(dHoN=AAXE~bK(#_B+5Kim&SN}Xk zDB0$8cwXY>>c_PyU_9(lRuQABFlQ0rT=dtJnE`~}n|E$9Bu`b4=X*h{mdU(7x3Ecy8AyvM-NiWfV3;up zl5&z&LwS;Y>i(7l>PD>sK0JV6nBidVi^9y=^L|FmYOZ66ZJQy3#2Vl>O9V;z7@HVz z_UC`z_rKbp@xSnw|Ky*viISK7INmkqmgB|6{YUX{9L1mgtnEMg$Zv-sj?1MbIUC0o zsx3lTkZcfR!*TTa>*$njOJ#v^y(*DyA)@_-jWf2Gv(S0Td z6B#)J$qxCL?9nt^HqD04_5A(y=IcNC_3+<*d&__C&YAo0=o#Mg>*lYp?taJtKcmHG zv>}LP7`dQw1w(7Fmub7=b3w!m8{3RM`mo`~HE1NCujATYZC+j8_xG6~Bmu}BcelOk zj&t?HFD0vCBaLgAAH@TLd5xUfz`;DygI*k5L<&QB%A zje-wz6v(GOle3i3m~_#TH(nsFjYag z9LH%I)~0R!I>wLzWRg8Y>C#XfKARg!%YMnp-I8Ag=zpF_-hgL9e$ z@sn%PV%x&c3jn`7eB1;RhvRF_ezDuiUSK?DiffQ$cCgVsXNu4z`KFnmBj=$j)w%MKuLDAssrsrw2ytJHQMH3Ms|qAWGt}dHUXEd(gpeG1XgbevxKqW- zpo1O24N&0&J&BDO*0};g^5W4WJO4lYAMOwM6^hIM{r}zb#?v?qtJwG5aCzD8KK_{74?m2scuFJctg zZsAp_-h+TBDRu=NH3TJiP=6{T*AyvAjwAz;rYtgo?CIPV&h;X)F)urNAiM5e*!@$p z?sR3Z9JlthqZNOr-8COH+vWqmZ9e4Ky>Fv^OzSsh4dx(jaJy;wv|1oH8rFGmT6Q9Y z0Rw7$wf*YyZgF!9@X?r58<|xC*)PA*aiwQowc+w~^SXqvTmaX>eMJN#lr>2Na zv|vHTIJ(~VF7Jx?iu=k*0gzvP_%KfSdL6Xh>}Hi4bXC*64N*Of#y|nVHUL zAyd{!rSa;2=={(WR;wR#rs$Pqs{JCnaBierCvuVYW zS{Z>QO%KZ;8?&e~;RlovQ?@d)7b%=lX|xC+2qc0S396$DBm;$`Qe)rtBi@GMHf{mr zHvU@1*>c%F>3cg;jf)U0S3y=yBiodxrGzoc&U@@az^YUVK68XB3B36D3mMPL(*@3% z6DJhWx-CjLyX(Zvs3yYb$g=o+Znvr5yos49>x3-K%8~MOr!-xIj}r2N)~lNUMnIa1 z7|oC8x-K~$2ELd<5^AT0;W_VWV|3~tN?ERxJr&B%J~YIHGa zEJCdkRzOi@jM0Mk*qLFQv>{QdAX4B%E{DM`{^*a|XaBm|uMBkfpZrh!4pxP%xoTE(r@vdWb*-ewY%Vt;42&@%FqngL zwa8p-(-`AuLmaGky>_m1tGH^@tB2RZYsK=k0qjBNI!IkzTdE*4ghWI!m;+EJkU=PG5F3))lt+O? zlB0r~+#Lp4WzR>~VH2Zl+7_F+`hl!X*$X0d zN!+zwc)tAoEkBQi@p|WTJ z_4Wafn=2V5NHG#GY`wZ(J;`~Vie(g;WZo-dtX>eM_oQ>`nue}z(S(5h=j8jiamUKf zAqdEvAP4Yu9msC3a*=Wq?9j}lyx`>TXj}TMrr`u1=~@OgN$1YFERnKjLzqGpS%MN3 z9A?4tc`>8Y?9ecOJzq(L1}L)TK>~11qncsSdyUS;VtwJH1CUD60l;O3k1SHwq-|7n zZ6@ccxOn!=p1$?g#qi4mj#Ih(XaB5gE-n`1c6;a9{r~P!{Lzn2HO7+=!m);M6jhJa z>_lTcHs>;g91}P(@;FyNKrYQ}p)q!092@I<&x;H5XJ-Rr?5$~{0!DH~QYk|bS>AUO zQZ@}TaXS%12`f^3I%3x~<8s-Ji$$mTouA1ba`i(qn|xHG8^`E|VUUzL452J~ab+YR zFaQoo7toflWdn_k;og$^)}rBhC0H*VFP0t8yCu&Sfln8~9ydWBcB4LaQ6IPv9!U&a zKyMm)q)>YignXm>#!-R~pMYb6kE~_0oB$+yK)R-r^r-DuBuG_7lnAMJv?)|of(DBk z*~ebG*v$%E#+T5sb8qmCqhD13sY?X`@L*MMn0 z7eXLTshC)KejqZ|-5&K^?W*db^`$;h^`Msyojc}ndr$5jt$5UR(l(uITQ3<#oSVU8 z5)E@0=SoM0k8%2Q46)ZC1PfVO6wM6#63u?)?*u4_h&E!3Nf<2&BNgfiHV)2^BIQW( z!qM%7GGsaDhA44#bYshQ1TfNb{L1Pp`B-rqw*Yb*e?8+H-@xX>59Ok5?IcFms8S5+uC!x~ zF;c%w>($L>cGl`c7R&3~=dXUe>_oF`)sGv2bl(jvBwdzs89c0+!GpE+%B3vMv1Ca? 
z5~~@rdOYfqL~ELyi5O|#6HQ|}41ig5`=1*a4*=*8K=vKj&Vg-f(lqLl(6)-EiRQh_ z0Ac{tOiYy;BUQx8hwipDldUS&fL#Dst4h*zd@+u8_VLH=8G!H$1P-@Jz@ZNwEUjsd zp7sC1Y5e%(lXVCuN*>|Sqoerv@o5H*sCr~E9!KCLXUpeiO9Bgsv;Z8P^M&rBsBYab ztg+i!Hw@ausJd9FnrS+Ung!T?*-F!R>AI6fL>rf41`|UGKIcbIy?5jB@oIN^dc5np z#o(Neuc|J_nAMEq;I`YLS+BP#_pGI_8VX@bKEnAO(OwWIG`u2$vGo|1 zhTS428P+SsdZBW@3^-p#d%PIi2i>R-+fhGg$MCU)@I(*;&2EB_!|?F(t>#;IUh7`# zGH4vNM{e0JXZUDWk^v;0vvVCXfOz^V_&@{?!~oIqs1L{d_k z+m19}7m$@0v=rQm_aiC92-T=#3>LNeJ~p};F6?5P2M6r14GObQf`?59r6ejKs8k3G z|Imyj4&b9M6|5Bq)|RUZ9t(8tgr|$61RX0`ws}Nm_1-TXO9R`vxumfF=yulr45VTvp|SV0Max}f{)Qg z0QTkKt)Nvv&H zA}F^TgeomY8i_hy(nO%Fw>2gT%vse~l$^5}W;Ux+5GlTWlkK zMWVn5c;iPrR|^PX1w<74q63u{i*>qW1rRpa3q-%R(HdL@ZkKZ zJ>3l$B2qYWG?=Ax_3(leBgS~Gm5`3kw`eFdB>;J{!6?@Ux-4eY1n)RXxC{EzSz$pORKmXc32biR!8jJ74Oh5=Kh!6)A58by0 zy?pBxLB5c@N{fy_WcCxxHx8C0Hc!J6G>wY)>CjuuSp=ghei6YWJFHxoecrDZOKFPQ zhE$MhaHe;=mKnTcquo{#zd+hWq_ z9CjkuRl~U%Cjc2oP1Q#^J3AYn{>5L&Qvk!yqsozh<5-W6kIwOrk0NjncVmTEJED9+u_cgR!&Zulp8FUj%`b{t-v{2={e(e ztFqZddG>6OhYxpr|NTpxo%N%cF$6Qzdgc^2&vHgVV&DpjofDhMy-C^8jMvxkXDP|C zz({lsW79N4({ydyc5%5}Xb7=2n-{^1W5jOPW3$;M_31o3ZVmP-HA!X)@oIn~6A3Z&txJ|Bnw9@Vx@x5~{S475y)-g6Q^y~v04nsGD z+xDy5xCM|YZlmjtKW^KLyCFyd5Vsgdn<^n8VClUsW8}_zPL&Ze_?v-)_>7x#1D@Ze z^m%s+AO4;~&^dx)U}*-fEi`5>$*Sr&&m087oQ+lr<8tT)xqeUcqDfuP`*kZv*S#Pq zB$5UZqjM_G39Au(u7ps=5w2~`k0X8G+xYWqy|R+)P8VZ&ZUG7{=P%{a;U~eOz7BJ` zUj4XkJTG1k(m6EVO9l@=k0H*&Q~`1h89tI&0ECk&d}I~)FaQ%=q?AKrM2yjN7FBhY znM6*Zb*Z5wQN%bGr)E1r)0peJEK1wb+Eym`XqpK=tW-eKu__==6){Go2;w*zM64I# zkb$1$T7;J|;xYtFlJoW2&YnDxX8@jj(JIIKpZ&9?J$SIPrdd51|M5LFyQ8WZG)`j( zr<$uAsyS!qxSQc4A0y>YRKl*qM%THt6mThTy|t7#-ss%DdyU+=OG@QUg zglbmHKZbF%xY;Pr&m%wnxOac`S1bF|KYeT;eY759G&@|la@g-yLXxy()?$Pi7~=#D zVx}%hNmr^epYo1~S6`B#?cxu*aeUX=*s+t>o7Wdhx6IHX zwaU@@c2a?l`P{X&^{T>0<+prZbJe^ngaUEGe17nND{VTzQs9w-MmjiVILcaJ(d1^d z2F-rMx0p~MQva#1J1v5Q=>|8O!+Z4O#x_H5eN48h?O;P(THmzuJK0naJ0#y|#dMAt zK6W<0#@5#3fS`vJkDI(65ac)nk9rLiKIZFIfnD#x}eZvOH;!nYuLCGIq7=sH~5@kFpU-i41TJx zane{?ueKtR(OzWw)4$48IuWuTa<92q)d%(`PKYf{ShRrT0Z7c_I9DoNEK zfj8b*CD>T{Z-4v9zy5mT<|TnXfrn!TkG2(d9c`v%XraE88E3+?7y>oZSKn*AxKO=& zw*g6iem>aa$6F0S_TW)T0Fad8XEkw(_pa?a&$e~uoI#}W2ICDA%=7?ccm*efbKa5l zlN6H>0?lCK7+lkMILGJmhZ*DrV>aB#Bv^Ohara|~H(TKAcN>1^)z1CKy`}rn{oeoi z-JyGbdDk7u%D2+G$zbl9tQkUFOYeLZ>ha;9uL6gv*5f%-vaX6tIJgE{{iX%-D+M3U zoGh{Q3oiT-j=Hj_aW00!Ns=qmv6yo;pht^A^jVzy!bCiluX}%IxNJvrFb{dihfS?#b zF68a)_?)3sjfla+Lm(=YF=^XM+c&Q1b51~=s~^L$KOR@jDh_UlKWKii`{(-4M;yd$ z+ycmL+{RCrOHLE9LdbhTLf(wN4FQv!i|HI2RV6$~-V0-Na89jh9LLd^ucdS6{p<%B1`GJP#^It!tDC^97vyPH2Q z8$g6mEuBw-X{lh-xOQ$ibM9cYuJc|cHUZ`TKF7~~ownw}8!4G);pa19W`lVW{5_BkV^3?s)I~u~>xZB;& zu%Uf_5)*uAjJfi0SJSbjV>4UAtWA>7EGHfvHSX7c{Z8|(GEW> zA4>ucm!ShQan02Z62);(RV7qmF-}4Vz&OH^V%J&9Cb?ZJwQaekac&${hryszn9YJZ zcjLbeY+KLd-Qe-@lB-phd>pN$Yu-6FBzG~QSH+m5hY&&{9*)LKSyM)m=Hu!s0Vri; zuf`a2#&b+pOp1q4^rEp75!P;jX6JeG(Bt*Qa$kA1k#D?ieE03w+Kc9>X^vZ|#_^&( zeB|t$JA{p_S_Qd&y~=6^Z@hc9fMEZ+rd&C5LRrv`1@#w~!{#%*}tK5g6nCt{gi#t>PLwR95kaHf+?H z;9#sSvvih(7#XuvQDpf%UDs$PfrOQ>#r$WU0mOS+*X7?cXG7Pi%>cqANEsB6;m}gH z@_ncoN;kP_=`R`q?nISeyx_jdW{5mE&N8ytk;E7R1f z@!q|=c}#Bj_~ZN0>|Pv(J1O6ZA>7k(%^8xp?QDwt4l{3?9y(o{C$o92N@)=TJi{<)?fYW>)hjrD|~;VcGxj;fDi1_+aOL z`qPd3(T~oNLY!j?FcV0O7H62DrOf#+mXfO-ckUd?$;pz-Wjnz~+xlv9kn#_ZBzY2x zq|prF5I~CT;Cq!M2Uw$OK7>*FzIU6=)?Hq1cy@Nl-EN4*BLBJX!OiU8*Jy?CV^XQM zz*0g$Tt0;P^!}WzA4(tYE+X!Gcsh2aWl97rA)JnHRF1>*ihnl8l(jyf77=Z%Sx^)^%<1&Ia~RaPmN-JCA}9XJYMYK zLvs6BHk>=>rJX;E5idDQQuTYJJo7w7I32)L4Mx+lABM#IC_(Um%hjg97l=AB%wU1Q zRw!F1E^FtU+jXwX9+2UNy3s=A1gn{m_=fO2iZ)%9w|jcrQLfBu}FUkRZCCaQI`Ds|3KRk9d|m826A zwl`*4kxWk%f<~hn1X;8?4NFMmOmg2v{zc~v^Eu@ZyM8{m(d(3FTcKL{yr 
z1VpnhwfVgGJ7{x;m1}!oQ?+77&)M+F5Y(uBRCASrvRsN^EI0v0ihNxFQWY``z&IL* zVSkdKjMb<{3dL6mAHR^cP%P;v}mQV0P#kK?GLDo3-Z zq=2*vznt5yYSkly2GX%iIVoku?9HJxjbQ%1L`Yhhx>W&CMX*cSh;RegMFx;@ekDEDN`Xt>Wa;AD91wbPG|zj-J0{i{4XyK5r%vT0mn%;0ff)3K!t zvm;Y&>Wwa5(DS&zlR7pW>;G7!@y$o2wqIXV<|4 zG_jnG$j30u`C7%}pc-PIeA!bTcc*f?T*x#7ZBoOgsqoRzPgM{vr8YK;b>Vrfe}LHk z1b}r1uc=x2MZI38tmqx5b?s;Vb{0PY{j{ai0s5oK9Jo)78yE1(y;W)T_@dc z@WbwTKCj!j1(4gg4NH3e=%~3Dqpm`jM)58}i1~OGBc@g?@0s&L6&=Z+Ly?|_&kR8g zRhY;pK5W&L{oWKviHE47(j{yF~_&l*Ptzu27L;0Qa3aGk73rhL9wrbDO;$ z`!*+2%?FZmR#bH{1gPpBJ~RZ$@R7tYjy8^?3^B$0dq(73h7-U!O4lhC3$Sg$u4B#_ zKqe_;RT6v(p{LjGgVvwl8$i^p6ca)?JxSx{~CYL zoZ^4@+E0L22H^guvkiTTKEFhg|V^LD)6oy#YKd4g=!lWz@5?s1F~GNbrH5{&XvU z@fVl;{`a49vl)?tWcgg1j8U~fNIDV~@p;T|f{)|mKUwnVXo0qEAR^bn<9Wv@o)35M z50IECVpNRdgrSnbW4q0jj~z37jAQkE?0v-X)8$-y4w*65VOXkE09iSi2n@CNNljBK ze^!Iemk|Q+gP%OH^Nq=)v-{1{?Q-!q(6n4o?k|DX07LDI{8ZKBFbZ%Wmj8zIh?=qn z+Pl z@L*2YfBP1#yUV-FjuV76Q_B@x0gmeX@KbhhNi%@(pyjIkY*g6LXw{e>H0sQ|`wEyv zEuNXU)$KI47&Ttxz`Wtwh;cB%h0$`=V=E$?*0)1apDQ6=`~)V^4ZHVN?``o}aT~V) zavQhNb@C+Xr0;F9Sa3UzZfeJh(I!8Ls;px~H;&c-kS|yLpdDU+IUk6rDnwOn_P)|Y zEFTv?!2H_W!$(oECaM9VpkWqa#iweh`N|8Tc!3%Hrq-)N_~3Kk1NEF_y&0|mGG!4} zkvipcehCpW#xDaOhtDNy6=a5we6Hm@M>T~Q=3o)5ZQ)*d#VzmN&F~@3$%!<}rAXTf zouj0H8K7zs0&Iqm3?IOHEt()iH=7IzG?W_*k*v}_O*0Y3xsas-Knieje_Lr1RizG6 ztoXV$4&XyewIyZC05p)C z;3MgrtBDo8Gq-`-dlY2@(* z!t06T?l<1ACWDM~>d1%`cA%Q{21B&f2*K?B{cibR{ja{Z`Un5u`1m{D=@tn-kPPN8 z10FZ2wq(Ujb-NwH`FR}ECdKgN$(R-$4))%Az5V#d>jWR?eDGj%NHF^@J?9+fzc4q( z=)J0oUDqWbYO!24Nv&o0Amv5iq2;*9G-uWM8x0|H7zP=Jk$pcV@YrQB!^bcLnBk^y z2k)!RiC5*0!?sN$QfL7^BQ0m7OifW+(vnivBz!8yF~$!+e7bx5cr713xDy{dIpwp> zs{QrvHujc--C2On-PjgXk89Nmo)*d#B2&q>~4IO6X+M$VE!SZW2zz35~d9pZ?qpo3l1sZhpbduz<%C+g& zg_1>P6{NA5o^^r_#GJzn857-W)G+;8h+%Km&l}yJ*|Tk#X3Wp~@nId_9$mZ~Ij%<; zFP*rHMw+eBb`;w|u%Y9Y!p(WRKKnns@_!idg1C)a0J)9Z_@Hf@du_{w>bU4SS*vFF zuw@8X#28!eu~p5-SOBmWEhfC_lYhKa2oA$6%S?@SRbSQuB#d>M(_HOh4|Frs{{!Iy!Q`>qKVX3MfLF5vs-zfMEdo z9+sgaeZ5^=AT|y3@=`J^7`X>Ay;_QBlB6+Gs!==#u&;h-g@9UtNXMFiV;rx;2Xuy- z(!Y_Sj2)(PDqUwj14q?tHV5Y8EM{40yQ<);Nu}aqBZ-Z389s1EX1T|VJX>tW?N`q? 
zeCyGJyRZNF2gi3G|4G#GKf$H@ljAXt_ebFV5aXLc^>wJeLe-pTNTsF(N1TH~;3nMe=(rzxAyydp=&K>Ty$hmByH%17qJ~+-xvjTyT8) zG=oP74<7X4z4tcy!yjJAkA8G12|P^IuCKeV_@N?Q^u7yb#}-28RHYSRTWECeIT6_d z$6;2R0!3!3h8Q(vWrmJ%9FYJ+5_Dt$NqNX^R_S;|j0V77w4GC`k$%oJK!R$85A1Pd zKTHc&A;nZ-6+jNb<2vv-Cz@K$LNeQjr%%`6cYpT>{f~cqZ};wduMHnRymR-re`_h< z`nGRhd(F{%;#$-SRM1eNBg2Lc`4Ol8TCaV~IY}?|a{LeVC|9ih+F{k>;I#^T+#D@Bcf{k)vvrOlj;>a2%5lx&(ouzvnLO&x8CCaP z$SFs}5HXvpGH3*ye=^3H9y*3WhjABo!?W;gv(?EE&M!J@z-U91co8IQGJtscO*-BV zGHgc1e7rVq+sI+_vHSQEfZ;`P8@B**8@I7q$+Kb5lNjB$X>2o&*v#9}r#7r4*?Mo& zmUJ`l2B4TPvZNH4!t%+Xu^i8b7rE)Wslr5c%A{G0IYUKEQm(Dy)uK+K!bi?RQpb7{?4Bu*HJf zb!JUNdJiQT6NJ<*mRR1qEY9$vWoQy}cC}3`NzHh;T(DL@;tU@WGJJ%9vG_7_Fh(3y zL1xfuTPtRFlUZPr%pwmqt#jV92UBGM9%u*}%&^VyLGp~?Sqq#U4cl;fv0dFgf7af; z{18Xuk8OGPhw7I<)!3h}!sgXG(vT-}@@@S{OpbQ>eCocmYIujOR2#CXgsnW68U|6p+c+7hC z%);ZxW{)1l_}+VI!`=^`@z4J3Ly6Ix_g6+4H~+PvhI5S^9i6(x;*N^kv0>n-y6rcc zq1o-|a=R7@8^>tg`-7UKB-M5%%%+n#S2>b$mT?S7nyMXnyZLT6Fa$-6@pEtStbPtv z#sWPG)I~E!Kp}zy@E~joXbOvQk0>_=3B7;+X#DC|Umv@!i*q%F;gA3Lql+K@@bT`4 zKYU~Nv;WHLcmDq0Ke_W;zqLq}D=*7s^_t|Mnj%9-(v*idfkw{xYRbL#NxSs8jF;i# z`0?iF+OW${!bgi%YSlv^C}wiBY+e4Of5FAH+oHfnV6OQU!@fb3A#0fkmt>b!lqEB`= zct7kAyvo`0!lJ}W6}{})uUod4&bdo3ejVMg&DT2V)&qylS$lc$KYQ{&+u}BE;}$?} z<7Fa*?or#uJL4FaD(%CO{a<#8pdJ1>dX+ zAg1&06(fM16J9v*qKo71B#ykdTi-p|oShtBKF0Cz2uI-qw(Jiy&7TPSAe$li% zJ*LeRV^KAK;~MkvFa$M|bWX%bNjcL{NOB_HIcGkLREU@_CZT;4nl@EKs!k+O2>nMN zjSxWs5ndx^pQtFBW~qM_(fCHv5$5(QPd5RGWLcRln=_`E(@YpNQ;nu&f(|)0wA&ocvGi}gf zX;iH>$T)lKtrq{2Oifj1p337G?RXe; zq3XJM5|&+N04XDRJJqQ@t1+_%*J1{asy&G8LzOf$iIG7S3fK#(A9Gj=HE0T9ghi^Q z3}rrsG13e$;rvXy)(XWb=42@YlFtRmq9&oX`jF47A{YVQ03p{6+t;Y19Nn~0UBsD) zMG>X^BL;{8wsUk{Jh@V?jKZx0N5UFu;!sg94{UU=$S{Koz7A${!?A&TK z14!2~Nu*Q@K%xK$Y#d?L;O)IK2e2Qb%ek6B3jyiaAOflIVMeFwOvGqLlFdG`$_9(`1qP@R6 ztnZxmm&d35Sv=|=+fjIol|FQhJ*Jaq0zWv2hqWiH#^e;4WHgqW-F%$HP+FccoFFM@ z8%HWY6=8IaL8NghJJOs;^rVW29`vh0|NPIN_Gz2B^}Y6Qzuo$DoN19w16+A3B+hT# zl*eIk6M*ENxvcftraTPuUnb^YHAdHo32o~{b!zFCvOh!)2oNIT(?UW?-vduhBsfPV zfZ1-h&1kmVK6|FN+ier6j&0jUCt`&Jg*F9bt}ZB4vuu*toL??xV^!18?%e4teLr?| z{2D^ zoa3lWvELB?!3USy|Lgzt2gCdCzqxzzB)0yfQSMV`E*>+O=b|v7I!wjRuWvn~j|`w(YcycWfIwwr$(ayU%mZ`)&PziFJ=T z#x=AN2WGy@K*fa1LXO#c<*W!Fo3)S5qRrM~!^%}qpBcQ}WD1S7(};xz`*N}3Z z9hPNct`LYK2m!m+Hsjd^*@S8eT1OSBfRsS>yWrvjxnu^GA>W=;6@~@BK|T1$!E4OF zcPf&I_%tAtP(})vE9ez$@tQ|qGiU7u1*Sh8J{-}-owvmpIs9SrVzD>05+|R=z<&0n zHKM|AkNFD})GD6>m5TmnyEqY)oqioZ`Tt7pAgzmUB%pH-hJ8~QQY?iAAP zAdo*XoRCz{16%yP3wh#*Rj0z#?dH7{BirH**7y(p0z+SlNgo^(8gYP360KY&Tctm5 z@5wa)xm+i)sL7Z&;BlA0w*c>KGA0GJx3I~elh9X9+DU;y(vm!EHVpcx!>jrYI@IqM zi}*vZ@5m`|!av~*bPtD9K5G?2uS~3aAI$oJT!%C+&}M9Zqn{dW2_jf+x&Ag+TF)6V z#rN=thnO4nxXTI(OToZ0BEsrxKW(c`jN~@wVqa(koo%n0+Z~%o%?t@Exc(wH_lN3%H2z1bS-X*L&vm&2vb9Jhc z)LwQ_&lf~w#XG+#`TaWH_4UDyNY(`>@Yv%t_f!E`c7PS~_f2!P7JeC-Q&g{UzW0my zy7~PM>QqMM?%x9%cDgv_rAhhQ;$2p(D@dIWkq06o?@rMHrin3sB4Up2!rJJ7^A4D_aQ&%!Z;ogFGGZY82r9-h*O}w9N0e$Up=8Z zDFWFWsxlYI@VgX`Q&&@ywcM@B+?GlYe_)ucOOtr;Me;%N*xF3HA?1GpT@DmX*tcfQ zbIE8exKhoRe7fU%p0ycN1Xqn?z#;^dZAkNSPAbbp+Ioh6W6J!(Xa$Kmoe&SIXGa99 z*4ygL3`Zkr8i&R^62PP-vt&1PNcM4|h ztyJY<+lQq+Dl+nNW)8hJ_&**(8!8iGe3;>WQ6?}8lbyYGBx@R`n%i{Sfc%1G+fJ_g z2gk#$uArpH)>j&K1q3NH8>6J`lJO=%PIFqvz-??sHIrZdB<^;^g zDQzphP>3xe+}}G~gUt?_B9Y&qP$}pU#sYATzOv@la%a5rfj&(IlU2)N%o^2vLB&IIprg`GT5fj ziAY4#g%WmmHqmlr@_R04$(0(hjf_xAj~_~0eyB_r``c6gO_^2vQW0o(-@Ik7YuwcWHdR~F)rpP(|Oc#;x^1%NHL&IXKgl2t1usOEkd3zs zo20lo431_YpR_5Na}_d+{izPS$4si1)(A~tpw?Fxk(Nlp5AwRA43UJ;Hzdf847>^ET6wY=?nmqa4?HZ`tT3=bo0&6 zGSVdr6J$wR(^zpM*DtBt(0PxcgS)xCfl#U)@%(%}OVW8PvDQoqG;O@4_0f#maL(+m zMk4962j*0Yv{NBXzKud29A>#D@g6WVF 
z3VX}>=bUC7h7}GHfuo^Ra>sz_R=))lz%Z+51i*TdjC3F(Auxm^TmER@MAQt;X;O55 zSS_&}Bg+VC?E;EIjyJv-OT_ZXrDh#wpNZMWc1@NNtah-E{iuoY)5v(V4i-mU942SN zqM<~Q`@~JEgER(H-K`9QydYB*ovXUT1W!b7Df6+j$j~-9tM&?~b9gioX;<%BMt;dU z?`@r`wW%n~XatueyuO|l1@acPNKq*Al7GuO-SEqMuJfbGwYrnq`Y2NRvCB1AYi@p1 z8hbe-1ecz*;4`7-@yIJ)k0{dPs_Au<-d-ndxi)P`NbyIpwJ$^g)offcZb2BTkSG3@ z-xCrUuEa0|@sg+n0R8LT_xKC-E1YETuHGZ-?zba|6w9rV?SuH+xV;S8mgrkR3=XTY zlqZ)E0Y(54>Ta<9qrXHcyeNBA@LWX16hWV2gWg}cj--^Qg^+&e*u2eDvOSHe{`sm! ziMFKZ{NB;@Q{Q5~Efd{QR+2Je|5@Ro0G=1Zp%p8^b65U~y|(W8F@O8BuFo;DXAVzH z+{{8XmLD4{%Bz*2usbAce#v(c6Ig$AA8^0*8nq!uW1P-_`$MV`c!|LzwIJ|f4wqAt zOy5BW5|;McfRx&HT3aN($zm6RJvHdri*y|UDv1E_wl?N0PUiZ5`g(dLK{GU*)#qey^o;ugmc)(LABx2;4tzpIzix`vtF}?hj?r zJSHf#o8WR@-fDTIqQ;Xs>?#Ul8l?#SL|ZSQ%7%D-(PseV@xbqR5HbcPYA@o(QsKvvi;)X+?2B-Bj=lV z$5uw;(2&8==!gP_f;OUc*UhKe*LU;ucTpDI37L1DJn`dSVa1n(2}^a0hCZeSV%6h= z>o(nW(tf5m8L~%EqHhP<=DpF${Iq9W%8305nDBE5qqN1BS?TBfWq}wy!vNLz zpE)}&dm>gP21w#yRYV1Fd8^rfZFW@zSKmfXR8u=^<<-0zt>um z0XTFfJuRdO;mnh=7#eh2Y1=Rx+nf#Nkx8?Tb{=Bfo@oQTPFNQZyXNy4?z7zR;N;g= zlkm$BcZ=_xJcEH>s$pzgD+QCt~JXqOq?(y53=)AKzv=P`9}689tbrZp3sd9`0qsC_~hLMYV;?v6aNI*K+m zk>8pfmqn-h6}^_4{kjdRJUpQ(y7d0*^Q*)$b9VqFWGx{tVJy^1n!~ivrQ+) z=r70SlDb+@i5;AXffW9Df^OQH~XEyb)3%}5rc+e#w$#e&rDeNEb|Lp&sF zLK5i4jF=w%Xofr3)oVqY$;gVympg>O9fqo*5+_yxLcbQcZ_QEV+eZNJ6wD#zaO3HO zUQ!~+gR4#6o6YFS+#3}Obi?W-H1d$Aw4fsU@+Bw2Si5aMxmNZg9SrLfOmyLxJ4R$&OwliuvEi+p>M1MgpEd89vM^6j77>Lcn{VH`zc8=5d?M2$nf|s41Uk<{)l>y<;Fg1 zr&hGDwA_19`*`+z_k2=*H|iX8#|)xoDNV50CH?1E>+Pu1X$jUe<0g32l^eBO|QU736 zDlq;cK-Bg}aZvc)I#-x8y8FQs8eDX54(xSpc;YOvWP!Cr2MX~E8FH4)%ANCa8Z~Ah zI0ev)8#d^W@Ms}1E&E8v9@dltgH!c~Spz8H#m-hlT54`@^rJy*+jRW;ulV|Lli^w0 z6jnSkm+5C?b{eu!^#y=%1BnDVuTK#xL4FiZq|W_dsp*=wdCnNn?M=T@Q%J`nw0!dbu3*B`scVY5S-5fo{;kXsFfy%## zjE(82M($mz-Xt5Yg3Kto9zq+jQa$-OwiFeA(7V zTDY=Y0?^`WGVN7aS1a8{zBcCN2T~f}_}5s6*q8zw8fMKh@WdN~!aQs>ObShAvl^ETwVtRdiZnI?!Tv z!mn@Mn^}&r6m=yq{@KBUvIB{(X-xHFVMQXZM746MKV#Kay05v?MSf3jE$SEk$dT%! z0b2}V(D05Hx8~#@o3v7k$Fm{UEhyVLGt!$=y#o6)$v?(RFBy|SH0+Oq17!~Wp~1mb z@V__y*ORf6tyuIgkA3oZoN}0Ya_rI#Aj|hNRgGMLZ(lhEv_)9JRMcr62X+ zeJ1qzT=sD+MWPE}dD!Ha7eyy6!mPoqT%W^mNdUNOAt%npg>Sv3qpsetRfKJIZTTHJ zczAjAK>D!>z>!=C&BA6AcP?h1A|UkT&=wtp4`r8=LV&AsSQR$ua@rMY2-whLs7OQy zfMdxr{%ANkdNDo9*{#P5;1sKgos*oEMnD}EPD7Uu3g6+wn+^J zpS|vGex{#yJzo(2;4`ItNcEfHNbNfkHJS)1j2JmZ)DxrvVVV{fqV+}){Z`FRBFH)J zLoFF%6@hD~hI&Q*uh_QO!YInO68GZZ&o`6eu#s$I@#C3z$gl)@)t3jRdtZVdZhw{e z-49nNc@Zetfa(}EsQ{%F50a84f{LjE*jKv_LsJa$e>zcT>hUTmJF-oB3w!9qM~*Ea zggxQ?4l4CaT-M*OS$B^x|0k1uj6@0$!E_^#x!RvOQ<>qJ>XovYYY&v#&n>$Xt|VD$ z+0HWXS(P1PHgRL|!&++WKu+>5cCA*}Tplib$6ytk5TWlv%nk-fVz`;~L7=EXm}s<1 z4fi3uD0cy~o&qcA^8;4MBGgn}5Qv+MH_YUOIck<{nc*f4&@5R2g%vJXlnUJM{eN_5 z6zsVGg}wl+F{DTHoqk^FSN>n^acs=^<6x9U_y>G{JBk`BJy; z`a-ZrcY5$M7WILt4ipxtGcDm573T1j!w@@cj${-c z;&N}ggl6VkJLHy<{t9^NOW8aNeBzh4_5qD+C?l0wF|_9&_(I}^H%TRtC*j#oh=@2N zKk@V9*h4rXpR>8@)qEtxvRg&1>z7P(q9yuuTxJic~R4ML2;EpT3;*|HK zAOn~U0E_Rm(P+-JFC1Ka|D4|mOgfu*+qLwRgHOeT^AODZWY6w1V`lH?f}_yEh@(UH zd*s*s$nUUtBqXs!1QHpBY1J(NSI;ACUf4u_By zAsqgg=jI<#3UWzAuGI+2GgXb!z8daz0=qT~6FBT$i0I;_&3IhXl8uZ`I)ZaCrG-F! ztCcvsWdXk=jJ^exj}W4QP@eAUx@Bu}9#}Gzhst@12{|j6@6!_F=9b3oL#js{iZXBFLMvfJapao6F$nff> zb+ZL%x=7@Pm=~!S4dxp=#wFFYuTC=lv$+8vmu#-v{1jtH2Qwi>#fGJefr!c+Sss?JNCeS~#jElB&{Y&5fOCxaxb5BrC@-TbuhZU`? 
z;%mzvf`}n_gzt-yYbIid3XkP-9TZJ5)mHs=un&sASfm-d4J4 zj4!fn$6!gzuPH~=*43KCBmth%UTNHd5Me;%hP<3g+ zq-y#+6SF!qGjN%MjS|gzR-(kIQDw_*)&gxrge@5EI%stXiX70x1Jk^5S!mrc*@af^ z$GmgPX4x{Q8tSsAr zJ)F7yx4GY_XQy#SlE^i}27r#<`LLInXP>Uu)JU)}Zp9)Y>E`IoX_mGQIs87H=sfRu z;4(ep>L4yXT{{+bN90}*_3sBg79T>uwGmducQpi~%w8y{Z+6eBy!KbEJ{z@!&9d6B zp$vL7?7k^s>?)(45A>y_e>i)Af3zVzxU?kHmT4Y?SSH788TwVjqSyHj2jOGa2R9C& znb=}tkO_{qsn>6;^tb;Y>qcU$qgOVQNvcLiXka5`5iuEc^BTYrgN7W zf1kz@{qUzSsr>Y#0WCsu_YeMUWa%F#Fp7n1yuD(j418<=cn0%((E4!P?<5ZVpL8sGR^(roQm=A_>@kvf#%eK#;#Ha;LF&sB+w zt?q;kcJxXCnM%5L5*E)-1yMb9F{^qbmuWTmoPvP}D|FY0w%M7@2H6e=OHV9Cr}+5u zIKd%S;+^l-*754o1W0es=@K%jN&mFew)M)41t}x?(?-GXYt*czuU_Hpk1Kzi>(4tiL5F#Xvi<<4_QsnN zRf4&uXQ5cS&5x1%9ALwlc>PlUIrVZ2uSACYp&k=Ic#q{2m+$>Cl44$tHUxSukVJOn zxC*US5-Nr@Iygd`55~GkB%z*MQm;DfoD7OZ`Nvad^jLK>!5Zh7!-ngn4^ZgwP0>$@ z*LW)nbAq~uB?~-Z_i9;_N5JpFXa46fnI~A-NA#>!Z(j&Z@WeIiY74jQFZ=9p$SXK8 z?aE_7o&SPnZBIi@{wR_uEu>WHh@;YBKMZX}@K$^;`S^DXsTh%{iRNq{{ra%% zBO-=24#ol=!6852gw+ZN!j)G0^CGqx4`H?dObYBT`r}gglvHvko>IWdGPzj-#XtN|ON*g*+{Q5hi)-;3ULR_FnO{|Ao ztfo`((|P6wkRoT9f`15-VILkphd|yI`YgI9d?&0Ft*>7ews+R{ID3e*_1@*_vTsP4 z-A)TGs$Z!wL&OXawcgzg2L#&V1lfcoRJjO^bRvZm9{shqjFR)J)sp^v6^Y`L3UN$X z>pefW%eoJl#~!-iQi5Vj@6iFruG{e5a4-g2T`~mT5d1PqMc&a5XVu>-It4!7>~-`o z_XAA{!f3GfsKESoJTDmPtp|Q7`5juoMV3{LlsL7~SmVWpKMlxA+Lw>E+GR-{`yib? zK*?BN!Ksv|hCXdwF{<bqDi%v`^U2q}hV3O4CYMmfoM2y?g0$n`&D2HC%sMsA^MC_ z4@!GSvRMhhol;6a#p)#yd|>39mN$p62%sUqNG2$mO;61G%Z~kp2FbzzrKg~0;h{+< zI5I0rL}dKkV|?!s%gfvixOEb@o;by*DchY=c1l+1nK@Jbw*t=?RNnJ}80k?O?)Z zWu^^uQADe}v}Ax4mMz2w1H#9EGWvW5MZTG?|Fgg##$1omm!pR7-10y%C9M`Gn!`24 zDPsd_*^`@kr1+X7v8v(}IJ4C}!N^s5dr?YE7vNLG>YAqHz`jA)L=4GeM($ zfaHfl2v&dNmCa&a2%T7>4H=;M=P7X(XTl!+U>hUQp_z+1PWJ~_`JdRZXo_vkYO&iK zsl-u^SGCdt1gS~~X(rfU<3$HfVFSU)k-maf?;e;?4&0zmX2X^;gPfO;Fj&0zyWVq) zf9`xItTYsLQoR_IBU7Ib7W*`>Y2<5Xs@;1bjzqFm22?5%@gC2iAJJ2P>n0xay}xf^ z$?Jf^Ihn$BVv{F=NyiE%=<$o01PTe(7|aQ+^;Oy52Z;AmLp&(z-uj&>UboX9Bd6C) z4x?b#TTlmdvNluv$);h7bWx~^{q1)9dKT?^w<+D_1S;Q2T3lW;Av@+{J<6c0veO-3 z2v~kfBbC@orh(k_`4v7NG(hrk%%Co%fvK(H7Ko;E?k*=1u-Cjbbb7gAmzS@xQk^d} z_zJRbg5p_n)el-8?NV?%>*GV&SJ`=pa;+oftEFSqj}3?9)9us9@D35vM@Um5A< z3l~a@yli=n9tO!Q7sLo$wNyYaFL%T#DOpSr0t8OMh#tIgoPlbMYv!VjG~7=Slnd-4 zF^F=sZ$>4*F`K)a&Ntk5vc^2{pXMy8&=&K%_y{+mGcwRbD)OflEu6SiwhJ^;@fd9c zH3;qen^yvaPD{D_fmkg$dmSWhT*l0z0K?a5RnBx3lCrf0l2dS}GvGbJ)}*RW5?MaTY5dxALw{vGZ#{#bKe9mL3U zRc(Y@yFv7rA%*}RLC1e9+I52I<;_AOO=glkQ?K;2n=W3bVobU;c3Ja&7>oE12Yao> zYVGmFxmCBUjSH&EwBd(}=B=1IH=q=4S2c}IO;W;$GJzc|DkIy&Fe5F^ollCQ+;mN4 z94n!sX#x!bDcFe6SR2qViei4Bf`;OQNSv5lOH4+LyAO}9^V4?{rVMhr1<;or7_QqS z3lb9`N*$H`c6B7j3^M1&+{*2BilRhQ&@$=JB9tqv0q_EvB6_1)PmV2J@wE+c*~8J? zg@&;4;?Lg^@dkM{6`*C_&B_-h7UFTBwY&X)B4mJNv6+RTWrIPH=86-k)qH)TAksBH z*!(U*7d!7$h<}^#tQICQz{?Bu*|BaRAX%;zoQ5XuH8quxh-5}237bO45~`A7uWklP zY;VfWxe2`qfgm#pbi_?7F*@|*7RekRs|i(^0Qn^e(pH2d?1NTKL;L*&P5yMQ-!#uw z?2e9`AiInw2l9lEa$?&@ds^J#0mB~L+1t6O_K?S>_Q4Wnk2x4ieJF}}`QCRXb%8iG z-}UbJ^S6k-2)DMs-OtAqXZb=CVRusS|Gq`WOo7E9hzP%QB{qKDvcue1!3CzLA3bje zJ3h8|XOo>b)^|TNYn(hetU{5Mc}viSk<(wZqVyL|KdEwBw6uuiI|u!I70fZ_p*qcv!%E_d)3<~#lNhq2Nghi*0_lVV#WDzlbBkj)CA{s#;iO`Qwt>8Qq5m0stv2?4g& zbMTL_^|gJ=U;R+52#rX@(lbBXrGhBA+f{R9{}Bp^Y=dQ;rfFNU?9%~lFB=Q^*)#>? 
zHs;+Aa2$4Yn((X+7wMNs%_p-J2>ra_pdGqVrHp%mlEh*~>QO}*ooTeRPqGNv zLv{`itWGF$b>$1N1Lstk+(oF66FxmYHe1<|V0**36jn9j?LXUWg%daXy zMC8~)q9&u6a|}Bp-la@SrWoUyYT)3bGtX)+sz-Bt_}fRpjlXZH$9^r*2%dy@%5zbF zIJoc3Qrd`yt7aZr`dUR_rw`e=JL}^Ir96uX0PoxUp+0UJ_BB!KmH$)A$78eRSo79p z61qU}u-eJ6BpCy?#9TyV3Kj7mz+&arOQAQ~X`QHmVl395^^??;_t{=EBoe>kjczI^7`T%^%*(U-E;?emdfA4pl?MQY!GlEjBN z_2|*L6&PM9r8c*?mW`dV(G`8|#8~Ils>F7zV(rP!49>J1RdaKj@c)Jo@qL6T2~p~@ z<8(C|1Uv>r@SxlOAfG$I7OCh2Xme7`@dz=H}hQ$d|5sb z+>@FyIhgdJ6?kRWi*T6VCT~KPZ}Q~|qTc*DH7??`F?>;)6W{8WR~9vUP3g{>!~%rt z&Y#)N|CAha&x50?Cn*xr^G=YMSZWGYC7pC4#%bQ4Edc=OdZm2!i$TeYx&(Ls8aEdv zEHoY`_Di*4bkWQJPEEgE7J_+lIb)@4;$Z}7ygwP!a*Ql}V z*iw7LKbBg#B;BmljYgSF6Hix#} zE$U~j-}lOdss|Jf&GFwD1Vpxk+3U$5rjOoRFXo<9s9$QJQ18wWPxjufe%p;K6MgP7 zU;bR%M8B$6IQFBFl9IbZP6V*KyF%$Q^b~{VW~-g+@>;5wWVCgWfc$w=Q%fSg2?iGa=%qhH(bU~u2n&y~YITz&*{B->_0VwLKHrr2xM%H1d0l&^ zfjkmyNez8q2)C_hD!DcjigwSdBMMCMYXz2L=6C(6++yDk$;P!~G=#EPSp&7A;F;#< zMQXB!Rhi)6zt21G>upnz|JbPAIes+MtU|2$Y7>|K+k213S0yzRq|k;$A_v59<*0HZ zXZtG+jY<>8`;#?g<{ytsmGK^qUJDY5*Q|P#x7qt}r6!2hHrz`|dmWfke`^a;Rh5s; zHr*&GWykOcEm6a>bWG7}zi0g8G{=Nat*WqL+j%Fh_wV2kF0OeIQ<(a~*?ZZH$AJ~T zTN)DdZGjh>j6U(z-ziWNI^3%wtgad2cECZ|{6CgXci8iT;g!&9fp8QJpo=})J|@YS z!rB4IHizOUvkF3*sTLK=sVfaw!IjyswWr!jis1*I=pqedWvmL zE0hRJ0qG8;PP_XpbAMQd4H?dL$ZgqE^Q%F+9j z89=#tP5v=_*~E#r%=&V+6@w_uG%vTYl_6pQLHh)^>da_UkwxW5_|41E4YOX9D;v@O z{l-+t%Nd^CCoeFchDPYcnva->Hzm;z)?BqtjbBLwiZv<`rJfSLs0$X70RoXd7(#m- z50ECt>to@aaJ0Kl0jr07J9V@EQ+o8xtu(qj=7am{*T>)q5x4K&a6YH}%S2=fa#+?` zM#72S9R+L(X^Q0j_W-u|)kaZ@e?+exIlPMq2%^LZMereSM{Cgp5pc6psI>m5nx~{L zjY^|+!ryp<*?rM4Vbi;dwHdc3A(uZ1 zG;+0}@rQUIoD8wpq%;phsWMAcVKdX=`bvvo%>U}o@h!FI9Q9PpIv2zAA7yEUPH97z z^~#86x~ICq8~Tx$?-+%~>wIV#WpjuR@994O*Gl!<@2`CQtM=S^T*p$Cbpw%(>>p^h3@cmeBCIi~@Kke4(EPm3=Da zFMk}lG!k5F73Z{83jQ)|;6sDNimFID#?TN_t~iYCVV>DnVw<-ii?;@uknpEEjo62! 
za#GB)skyp(ak`M}(U>?w_fAGx2?qE{_zqAq-u_JKF=+p7N(H(W0&WO1ADYBq0NzXM z5lz-4O3-!u3i|9#2j@c`_$GWVD;dK@3nd;JkJey-!#@3{$P^fo`NCtx?*KfwG|_Wl z_Y^oywd#x`cDI|{FsrxZVhd8R9Ym~ zn9|^d+5~Jgp^Tfekv7((;=@{2azJj4hu6f1k5F)c;WcVy5u>bdITa#H2+OIrpMzib z928ldmfS)D5;@XN#F-nU3MoWND7$vjf?KP%=^PYny%6|tk&sW*=X{t;5}p9%n^9e& zFgRRRSa^>F`yteV44e2c1GP zw2g%t;M@38(5i1i#k7rhW;yLPX5&9}bVez>;Ar1Z-i~<&?#HUclf)fUzN}TuqW3vy z6-slm@t9)s-hVB{rbnPa`XG@m80gwDeAN6V^k375;cq)l`jK|gG=SZNiRlm%l(l;0 zg-<2gY-tv^Y6)G8vi74DYx(>$YcL!F;dIh62NZ42uK;asGlIzAKg125GTM@sjo-F& zs53cZb=I=na{pRqgcRcWgVlMeGe9l#u)#avBJHcWDXZfO5iKB+^EcGX|;9XpYzG@Smc)S7)933SRr~$CtTSlZ|dswB6lZr5paXN zH{>chRXO8RP8yeP+M8DhB-jvcgoy(gad;586buLH20)`Qc!8lAo;Vws7akeBptK)T zn%Bxb6ii_gEvvcm+qE6O91K%J%SLt)OL5| zbuH@A@@HZunt_=4#J&1 zWZal{sMZoL7`cC_?NYDp62)0J%>_uRsbj!w^hM zkAoGV12mkHCB=DRLM=e?b$qu5E9FMJU#Qq7<6<14hqm~KcX~1iFTVC?WaL8KX%5fM zy-COj;riCCIyAXgPrfbU&(b!vRYP~YG(=S3;D&4tdt8KFzMyPVPW1Q~{KWTqO3U;* z`nTnz?I@gq#DxHZwR+^u?EQ5PkR1xi-ZqKt2NHP#U*c?q!d3^scO390tONt4y}?Na zV!<~JRk1fQCdAchk>>Uv-XZZXZ|z5;QPwfYIBQgP#|s_hd-Qj#WLKS$>Pt0i?VJA8 z=-T8+VK|~ASoZgMan(Vw#VUfa9&debb*=(ng;;yeu!tlEsm?Er;C{$A7sQP*XQ=iT4_q6l4tM4&W0I~9bf_T%5=?RYF~kJXinrcG#GM~ zSJfDT6Pw6l8Q3%XF03!SzQO*wIe)aUWZsn6FG>bv6C9I)fv3Kr2F zj>MW~(Anu~)1+4C-~+cyW=Ui>9fia^+ahV6)t06ID6Tm1a_j5qD*rwCx3gs`weU;z zZ@<7nF(GE(l)W>4R=ap8_eErd@ug^@sE1&eeaFWB?#&RcIh2mSC zH5CcA(A9+1`k}EgS)wp0cp+lY7Y%k_53iACck|8|Y|*8gwD;wQb_^LaO9L9$i6`VA z8kQh63Vkx4EIJ!_MRB8*5p=kid!aTpb!X7TpbTIGaRoO|zfF6m%f?Ki%kb96{mWyg zrOk0M7syd`9;OqFH81Lp8a6PWg<$!@j~+E*3~xG{W=wC=sXX7m=g7&)actF*QQH_q zBOiZ+`6TdW%>36jzddPsj^B2D>QvwAv0HgDFq3)}Be0A+sWU&!hAX;nN`q<1LFXzQ z-Yi$Ns^~Pj(z|dP3J37h&4Az~sK8GCg;YGeHXQ%1oSj{`dOy6Kc(7q7q?j2_YQj=d z{K5b!TjdWrIcD-+=conWYlR(V`-vK*wCglI2F%;dmR#9-Bd&_~hPYoI5+&#zI^l9X z;%Q>J9OnZ(8=;9h9~Xg}O>Oo1>nm2G$wO9TP|>VLs1*1t%xdElVIIrKpsdCrZQkXh z(a^tqDMt<7MV$-j7Za6WmxiJX04Uls?Y@w@Sgr@{MRj0E^}pYlHKhmJqup9Mf&iui z2+{;fSUF(c_|ZKWVI%^!Oj;ntk;%`c!ZwnBOt0(-?lrni3ci5kTq`UgutEWn=u!^( zus8;Jh%5_;c;^W%0kUCdrsy=6R;gitqsT%hMe>qvLJqsdww3O1#}5ILWGudpe6~)h z&UhK{w5LcwTHVNk!?)~PUw=*rx*@(A8WP32*4z>2BI+#k97u-n$q7{hM6(pdZ~7-H z=msOT4HemNwR#E{`sZzU0&Q`2_{L>@Mg{}OyX@pcNF+`{@ekhPytKu8NEC@UIOy$e zO57+ro*rE5`ohbbygt8uo^IloJcQowqLWF!K9dJYzPg3(LCaMY#S(l;?3<%&b$8d# z&5*&}0%6?&-rT2*MgK^!&k&BMMOoofht6 zknyfj_$68PvzH|2F8=cvk64IsXrlP!&41;}T?&6M2`n@u^p8{h$sUVHFLQF>1e!`e!%%?J`U#JNoSUf#YCW;YcSCw@4N26J^JTYF zLm%YmU%mzUzIWNtOr1;c0>uCz6yB!hafY+{N+~UtSdzf$G0$v)b0}OAc`iG;a(Y~+ zOYO+5!5nMO)Ac6%*o2aa%nUD1&J(LXNn0_z{?ec^$T?M@&u*emM3l$=gTr>Q;FUkR z`1irUl|nqB1Q=vVMzUET8kh?c5XLkVP3j16MIIUF;1K&wb4r^#IsFd82&+26Y9>z; znwDTBJJj%5u_yeAX%M8(grkCQ&(EW~3JD{2X*;hJpGsBvf`%h7GnjblUmwz5P6J3J zW%xlq+PCYp*?M8KivM~Lle5hcAtMZi5Mi@e#Vbpp@ac?Z+yx*J^KYzBJGc>WwZ-~< zrh-;SSbR<}_Mm7uE9CHEQ-5pT6UlDBOTh{=g21R3!$KT=w^TNVrcjgPf>qvpTE{?D z&oFjLcTj5l|6t#SZ=5sr>=$*g14v9GQF9Ju3z6NSL6yarcle)C+R26b7Y}91pQs@?FnaEe{zPN{Pp*Bj*w#|ZJbw}rp zklxyZVCg9ztwp!SGItM6=4l)gP$b$pSC(?p2w)`$WfspWqa)h4UV?dK(#Z)NBvV~c zIf2GX$bsmj7A$|}PpT$cea}0dsv302ih@VO%a@Qy)qdn+RB4by7lJrZqi>Y1moI2+ zaWzN;*%KK7rr)s=hM`E@omh8w?HYD`307Slk9PMia_^qFJxD$r{hoq_1%*FnNCZy0 zzL!dH00I?vA)QUNwbE|oO%0)Hri=!?(e^g0t;Cg3uU)!5y}_n7}BGc-6~U; z(v0CUf*lpua;}f9{+^tj-`_7^uJbOYJX~&2eA#w4>3lx^@|fVi;xKf)Yqo}q$oz?M z5K5yc#+HJ*1M0#qirupP=oc1f-g@hFdA}ja-@Dl(W?jVqXY0mOd z=z@kNN`#`=aK?dXDL-KaJ8DA4QCIpA%nX-8dogvs*h;LHTabw1u~&->x!~b^&ver{ zdtr%G(^{?Q8Bkf-(+6T4kh9|c$PUjCzj;kbsYv3>)a)cv_74Gc&B-){u_uCJe{JVL zhO%T`^cL9hk+_2Dyj%ysUj%GRIxlw>xQ&w=CU_tDLLyi+1)UdhrDDi}rGI@ijQCl$ 
zllcuM(2;nV`}af#<*SB^BEeoeB|1O&R_xHZsx1#64cc|}S6hh32go7S#VC^KWmHUjQZ9l&LxiVRIr7jFPe@Hsc4$Ar}^4dnf;@$x8<`7L+ld3RHNDU^u4y1X|IS>}5>Z`=Q(UG%5St zx*ikzxA1%2UcO%TlT==HJpnC&!q3da&0Amb#4WJah)BkN> z+V#WYnO=mmD4&K1WM5vFG(nu%i{+2kYLd6Dmt55+f^H`UnfKT&o?VZwda*2O^R4$o zmF~yJ4#!_*hA*RRT{lBwb7u-4#;s;9ouDAM6-<=c1}=sxG~CR?$F@n#q8rh-8q)ruTz;D5SzAh?q9v`NK@+?+E@9vs_ogE#mZb7i zo?8+;?tPXI2`)Q!=*|ThME%+Mcmm!4g)hUq-^cX7I{h++-jwJ|x}?)KidmGCy|A@1+1jd{D_sJR~Ga>`|eD52?YwMHYO&s+lGP zOJyso7%f8C!qwgIxeDQ1Z$5LS`eaMSNc_aXbRK^)SP>CBKmKKspjD8)O(WS4Sr} zjw!@5qQ!rFdH4CLN;R(I{=AQIpAlkgbNkz!7KaC9vFG5vyu4SrtIc=r zwtflwyykqnHFlwSn2<9?9NR+QMBABsV}ljalG>|3YND;1B=*mTERMg6Rw4pn$YQaH zqAdOoqS}v#{sTjo0=^j&L9aKKY9yu>u}b{bvlfYc1d4d|=3en3v#-0jXf|+iLlVm0 z)Cm_S>9KjVd4ZvGwoD~*nqaG=82sMyE7l*0PyJ8LIHO$ly5o)NdR=STbk=-TT3?8X zdvoz2xdMd0;2DgZ5VY(uE19@$volUXjLU3CTHV=eb&d@oyLS!bz0>D9YaP;XiRt`p ziJi}lzg5S>F@d&KK}YLBysH2t5{Famp7^XcZjp(r#)OzCQyBnSKb=vknK8Z5(#a@& zxZ-&CO0o$C1W|1K_bo3g89erXgNyG-LusX@9Vn9cK>KFBBA|4sQoRtYLaHBRDC^S= zcf?mxG6fb5ns0*f+%b%&Qms!5&#Spsh@?HfGeQATKx&FloN(Dya@mrQ$eE#6e>HWQ zLG=n3#m+eb5Qd^T{naKFqz>L4NoBlUBNkwQvCDkm@1|=R$)Lsvu&{bZF3yFGpbujm zd$QN$bc`Z#r^+_*HTc>f7lLb^zqT-JZ3UBEVZoO5d!*9RczaYZ*%OAM2I=oS3iLkL zq`v*!)aVFmhjc6=016SH0>IXLw$6kR#{8k*QgxWQV*%7rlAUXa6L`r%8#FZ>Q{q8{ z`~C8zy$_ppPyL03yy8z2+z)A6{ut+Xz7tzG9xea=w&WG$~ zxr>e4i~S)WQ#kUfPEEFN3jl9Yk3XdJmsE(gC)r4wHew9V2WDRwgWOI8jJO6PwX5Pz z*giYrJZ1Spd#_3t6BT0178P@`h;7UGznc|pU<~%!gFPW#qRLs|>dkjFD5UQ2mWBmi8kKlol$XT!^q5+*XEpYxNb ze#Q2#65^p%-OO@rdA4aIgrZkLli@jQna#c&hxuEB`<~wflqts--J*X`bgq5)$e7-M z8v`7og)vMeI9Ldu7G$7h%WcH#FIZThALpE-s_@7gVn+J zJNF6giLVPwr%lGm;8-~FkACrlYy~stx=~UZj8%^_k#e)LaitpYV_o-j0~%4X#bpT9 zS5)c82y`dRVp-&#oo3T`5{J3x>H1)DUCJ-?f}du1mMg*Gu(WgrWjUXnOot^mT0i5X zTBn2-aYmO*Jl(xY_?kEkj*DTPx$P}>G&}t`M%TGorCugJduYnwjWINb$7dcF`chN#Qdkx-W*`-O(xj7X#*E z6!6u9R-UQ|2n9r|#ye`wfekF_$vJJ1D#r7P!q{ZW*vUM~inYjWk#HZ29^@bqPewy% zMLy}jnXs!~q}9Qc5kyYTnsBkY)Tls+D^GGth{Z)uGO0BPhV<7lRC%Mdp=B47vf4&3 z7;R=%lsH#&!84`-OL_a-Q-O#jAFk2??w2BCt%*=L?08^YuKx6w`1qwO6D<~ekLhIE zTLlwOr_&lI?JVWHK}v8elZMWGnUC)LQ+`J>!$7HKik2}xBia-@BT?0{#>M! 
z+mj`%Sz_H=Z}v-%rKKQM^2Ga7Q*o1kmo@pg`rux}paT{sBrNZTy0o;9J8sZmmw>Lz zD>Ge~Y|OQ+^||k+8wyvQj{I*=XZvsC8R(E6L5Ago*+{}-CgddyaulnvZ~U6Dlq(Sz zM*Kp$s^dMbtQV?+a!|pOL@I1F*Eb+vVBh)gQ`Ni*joM|ykxC?q33tb7e#D$~Q3_O% zo0e)zUPHx&%9KwkchvDvaEarjO7#vgxHW=N(4pK&`%DsnQ=3j>F{MH9(j{gBh#qm) z9))5{i7^5nz4Tp7$w3TNFGy=S%eb1d%qlbv3aIGxPo>uCqaVmIj@Rmyl z{`fk_rwRBQn%@4CE1|}3&u#4L59&Uzji1w(0cPE%POU$oMg(6f7~gF1UOhvoB4C*oM5S7 z#c5`gwrH#fxU7c|6tWPhGb3u8?m}ZQ*|od3sDk;;usQ zkw5*pm+WwGEz4TjXyx?Cl=M~a%6XyqTIZE0vn?#jr$w1Vj0TjOF;wsY31!VC5c&c4 zu~DU!xz-ug+ zIC<{1c$Yix`O$XC9-u|TD>pc6+1TAM{M_gTp+I`rwvm24Hg%5bdt9)2-kD){v>Mpc z%>+kWFVU{ow-GLF?@b8AuyBaA&Co)I5hMJxy(QvN6*$DTl) zHm{B}?x~QU#tEryw*kJbftVGC4Eo~vdUVGgf^~|OD^>nX-O3bWg_Fpdaw7a&Cnno7RzN0C?Aw%sE% zZBd=q7{Xm6bRv|$SEVUkS0=I@(J0o0k>cL?GzkD|me)x4yo5LJu$cBP zh>Cc*HENJRkW&S80WBB<)J7|)?0~GMY5@R|YB8(?Pcz{mA|3HUac)%p6e|S@f&qUp zg_d|z2gUC^OLk96tFg}2c>BsKWoiuE80D2Ja2YEC9mT^LLkzKJRJ0uN?VRC9ViZK3 zph{w=BRvsH(95_yO1~Tl1In0o(kroIE-bABUw-`^mmL}VUXgS>H(f5sXj@vIL>eji zul8QKiSx{vqrY9&9>>tFr0sm0kLYh%0z?r$L^1o}QXO&dQ;*ay;;o+{P|e_fg2usV zG_6L#JQ&3ZpGhcb8wuwN_!rzT2ucqo(6twa3q$U!vs$YP#W8+t3 zdHhul*U7QNM>Q9-D3YK^zXNw$#npB;fARK{8In#kdc>fa2L)o--9F55 zIlW)}xNl{D66sN{)zPa(pyt?w`A7Yz9ywy`MrC z7%Lx@_?g!xZm&{}wKjkwj04Y>)V^3Fxu5;Q6z z^x%NsSh%4v`a07e^l4RO+~~!KqD3>y7nwTCXXrkulXu43%&?nYg-;Id*+!iw8zYuN z3HsHG-v!EUZTaRwl!qF?2NHD_QgGAD%J6*xMIpzgPwgGW55*000wfYb>|&6bN>4>K zR?eYYf4707T8>;eG%kk!R0))ksQ|GXc=U&9+VV8lpd}zdq1jVq06NIe07gxE1?xI_Ck4N(H*nQb3L;|spy^{FF zDc{})e}TVn#;~bv&iwD~@c#`k@R8T)bma$9`~?nF4a|_trO>il!7dInS{xF= zx$z9kyZUq^bEkpkdcUo+UR)Y#a+ysxw^Pxh_7b4N8m!(q@m0o+<1W&|JErCsgd)37 z&rMQVAgI71<1hw&SY$h=GUMvbg9VTH%15iANzz5cxGsj%g~ zou(yiQJxcOc6z$h`ff|Au- zhG7vAM*kHDTLF+baMBkO5~h$&~PGv6Uq0OHtqlr6`1 zyK7;IM>}<96&;`raHakgPJ{#@Tt00~_pJAu%Ktc2dGNw%a=-Hz*`~YxN2_D7-~M&l zLGy-=_iB?9S%KHtP=EiRa=09^pX=Z0e(aDA4KG%ccvLP8oeWeFi`2LnB83so3~#MN zjT$DDT5+L#_~<9%2nk9pMvEZZ{ByaY@czruG?E+73J@Vs?eN+WjpO|XxZiCu2=lZy zfh~dy)&0P)Ex}M`mMe%KW5KOoHQS)>uSHKx)^VdERClkb$#!MV9q`35NB;Z}JgU<^ zNPJ2~7H#^cjHM*{MP_2jY7XQl8ao%@-7pvMfchx#*6!55{l9`NoL__YrOT!i1Te1{ z#8==eC<2^W<336~Lq*@BChi>!Ezf_YQy`jg@WlR3R33j|gI|T)cyp(sUj#-8rB9_3 zbg!mte#u6Mi{Wr$3l=#;S2IetXY#817xXN&Mv}Jg88r@Z?&P~s}XiW*{;a>_c#Lv|{p2u+@Pa#9rmV>># z?^<;UrXIHbteWn;-FCZqKgIUA8D(Bql@9h?_Uw45KqBfeuR;A^*x9Som7-7cvVY6!~I&v3Q0y!F$8|tey8CUbvNZT zb0yYhITgkU=1w2K#>3xS3JjTU#~6p3we#n_l;t~2WSmu#i5hKu{SLjKrRXl}K6-bG z_`h4;V!PZ2o$9&jfk-oQ&MYgWdL?-!F&h+Wcf5X}9IJxxpFVRg2=Uw@7^ON?#I%$s z4S_qN!!3z2I~=5xoq`M~S<0~23G6q(!lc@t-HY`rq^3qxP^Wi+6^v6^O0jTTF0*ZQ z<8F776@bvYUZ~L6@xjA_lwz3G4SO`Ea{~y6g-aB6)&n*K7dk~J9TsGKDAcPdsmD7@ zaK8Mb`35_80`g30suGYj&J^`_R2W~DR+q->=#lB@Y1z&n5YAZc91-@RaJjMx_#i&q z`ww|85?xa{RM02GNr@J7Fz_}{ej@m9k{UtZQ{F^NNWJXB)x^)}%nd@<=Fh6<{F(Gd zJ9DMLjr_>#9chd;r?s^6mikgHQ4}r4?7TH?A`6~^m*Wv@q@*9GFHRcH(ZmrEhk=w- z9zqYdqZ$@@86{uR_k@i5-Mqka!-PgC`lS@3Vxgl5lw25+78uKXU42@F+hHdbTIW0n z^&w#ae4)`NC=$n{`73xQS)UN%xnbos`Sq^gqsrzURypLLchZ}(80e9FNbZWBaeVi| zy@DiWj1}%3Z#d$!;e2QS+^;G*P<8KIy2ZI`bH@wsy6+1~$HU=ThxoJwuLI-o2i9_^@=fEzz}^4|x}a55dqI>~Q> zkUyRuGAE$2vKcIjovl=k{2`b45UP<_CfAA*gnz9}OI)uBMW$p#BH4lq z34xdur$xhd0Y;81t>UDVaB}=;!uid{j;|5=JYJdGEa|gD@M7 z2g9eB(r^ihWVF`RAni=FsJ}fv*z5!4CQ-LOVqY4L<{aLL%TQ3j*{|F{YJ0zF!q06z z?-n#?q0+HirENw}TspdF0cWBVis!A0x3i_!nG zwV=;bm?yZ;*iEM%{r2ZCy3U6+b}J^fKQHIo)b`e%d>0-nW>}m&G}@@C_@#O%YKpbO z;(yZ`xU=9ia0i9;1{A8gm6}d~kc;%^-cpqg>XVjg3#E2nhsUKh`^yhsGHGGUUmU`a zwWYL^n{#aA6-wYNXhGQt_qk29Q2gZ`R{xW>6oA8b6x+=+XHWZ8%_~~ zLib_|l-Z5xU9W0icpsI+o1Tx-;{SYRvBU_k=H%ZxOx*QY6TL@AH~zG#I`IlQw#EWa z5Xg>y5-=Vq@s_A|Dw;_wJ%`J!ID5Mzo5g~Txv}Nu?T!nCS%=k^Xb;MO($VpI`Dp<{ 
z>Y8G6eq@)ej&t%cohL;quFBMgGE){bZU%D2kNl1NHbUJ^uAq1Kvd@tnx@(c7atmJz z^84QC-Ci=EF~#^Bf9jkfBvln9+D4x&#G=eb+r-M)mlefVed2$j+QBXRptF(`=k_Uy zhL!e+vqJC93~$QU?yJ*X6NfiBfhyJY8?2T-b(TP`k&_UIapCge{l1n8dXeIT{OlaS zqqNtxLMa*&(j!~gv4mPKXKAF(@-J74`_G-2`)}RwbiJ|h_`R@}=2>%;gh?g-fMJ3W zp1Ev1C5DQbQZ<0X`cQUm+s_*^X;f#+4isRV3#Io=|Eg0*O=T~>@>)XcKCuFs01`R% z;t0eAAuZ*~IhNwUvhkgv&O2@;SnV{zkpJo?)Bwf*YDaJb_yzkyakH*D87<{pOkd>X z$}Ci=`^{C^sgptNjwVNy`7fRTK8PsPi(0YskoAqh8~3Sm-~QwJv6Iqzi}`f?8aaWS z#N$jEkrCfJ$llmIc)Yq1t<55|e9*>yK`{gJa%4QXGwviT42rgXPb5cUF6)i4(J68+ zroowc4T@^jeeW_JL?!&fhetW2`7C3^sS~01n#K*U^cA+v{|y?QbW| z{0QX95)C?8LcY$^%2TPD>OOyUGAaw94wIC#b59d{1mIz9vIqm2FF3XWgV{vzP3cXs z$)hCe#<8ZHh`SqlK-2z9G+{qEARBLzIeb&f#l_gHC9WQeV`uAk9>ADED)n0j8M;sm zM_{Ph24pxf8n|#i1Jm*LE0MsD`nPTRS7F4nekaH>|7-#;uZIDG*HACLTl6ANQC&i& z^(?o1uQp_z;T~C@UHPNSMyGuv+|55OvBSi0O*u7wa+5Dt!Fdr0;LNXLl5fPcZ0FrT zCFlO0GYz^pNB^f{AF5Zw1mbTj&XW@D_}w=wo9-ZRFg_TvN&~F

fnH+BA# zU4E_<5KOu#Xb*`zCq<2Wg+6+gi}+QO`3P-ct*m!79s^9eK|O$=&A%-&@00h;*=dP_ z*0`6|YHiDvsHISLO=iaK$nsaNg|6U-mlrAu-_X}RakRHJnva=l1`Uj!6_muYp&-37 z+;Dpx8Vn^H(5GSzT8d1X#Fn_*c?4p+rBQ%P3f##wX{8=&)@I>U=0L6<@p!i|G@P2g z(;DYdA5$8>9xH*MvKl>k@%u#tKTF`rDK}D92l6Fd%(9b?+@i&!^-}hhPL$Q-O8mx~ z1pZ@5Y-?7Jnzyux)_A&q+i|X;(-2v(N zZ9k|$w;8VZs<{^!fYVK%q~(N567M+i{0oIoZO!{woicLfle44MfXXS#Kv{Vg`5Lb{ zc5GKGXmzpcwIY>?Oi%dL%HlwvIj0SyfJC`ze>(e+0Q`30+!;uB6Rc0XceTz()zN4h zg|RhTln&QWQnS$zL(Oyn!ZKj~znBbsdExu3`U;nEG`r%dl|?%gyk28HmOaxf?C+0$ z1@m}?0^$zYe|h)M z3b$ZB$&+bNud?+Fh;O4kl;AR)+{LjUZh1T~y_e*-T%n9;(HBy3(W_1U?cayT zv+8^QVvI5>wPx?<^`Z(WUeV{_h#J;L;+A(pm?`lw8jl7*=by(g@bz_+^^GL|U@oLG zQhBCd$S40_guf`%=E|; z06!zwTcOF=(%!B{2mf?d;yKQCKBPQ|^}f86h<|a&dUUr`k8UV1?cvzWj2EZe&5&ZK zbeTo^hgwAw<$8N$hR?-vkS<$sAOqhs-#it7=AU6xrz$`?*ZIryIR27 z(LIS!-`5oWwlg`EZNn_Y>qAO5#vE$0xcV!&piJCcVo5uT zyaqi6rSroo#k_`Pjd|W#ZA#Kc8_z#9u>w|~#WwAQ{RPVX;l)IR?y{0PLjSfra= z=4QCUkVP{p&qGsvs_FiM3#2^M9m{6jT(b_~;w?1eq&qucd2B~mB z>4$1JYJQ%D>EgVYh01=1=aYXHt_2Zn=il14e*);{=do+;aPD*EKjqf_k{%YDm#BY^ zyQ-a?`?@8km%w67ahf&lj(AWRqY`JLNCW_F2V~wUXtM(oEz+Y)F{c0hEU$e#azB3! zIT>x)!|2NC4klsVjBc_U78Ds0c6WBntp3l5eB0Al*={VtB3Lbc2&(1f&Dt^atY8W) z`iHNZTsG(J=qh^k!9WsXGQlqE#>T+HUX4WSRm`RN!Wm{}+&K%F-ONjQl>q!UeZ7qN zK~mSL9StIO3@0~!TY1QnHX<10QM?9I)HElVLOi*^xNBKr3O}UdZ6xU83C3%D>KPcX zB=uSTecwNAPu9V!)tE#S!1YPkgQ-37G9~{XVJc}uD&lKx@T3DVXM5_n5zXX^7vHQr zE6JN?x}d1t1JCYPc1Nnj2QJnkeJFyOk3SNPc!}Fy958qp0H;!meN4|P0`6f+Zsg74 z5Ay{sY&l-D>9H|ET75Eqr#rS&f_9uOk{_6tjoQ8TU)PmMkn$<)M}3_yQdT#e6?z41g6YO+`pKl~h%Xo6%BqjrJC*-`1cUk1H^MbmWx9BHY= z0ALkPmR=U{;bXgM=2|{GCO%MdHp1Uw*>R0HcB?0~xjH79(>DbOHzCTQLYO&1E?m?p z_69XqwSk$=ZoCxdQpsz6A%E{|mzk0E%z_=g%cKEi6{8uIEv%B*lY;J(JvPX-=@-me ztnID?xAR+hpo`aFj=0JzT7bAh{eDLH;gDvv(0QT`n}1s9wdLKLUUP4w9iT?;!e)2l zuN%P&5~!<$1a!nzuo`%qKD|1|h7qZdmrpC8(kmFpw;wMk?DKzxO&bQYtEqNpFk@WW z8Mm@ykvbW!j7`fPewg;&R62%B{-X;{ENc=^$5It@PLCVuLb)=5Yx}xp9%!;CKBY#E zc`V(QMLDX>zk5Z~MH_E<*R#%cb_Sd7Ylg<+d>zy-0}SB~boZII7DZE8=5@KjGhZxiVnatU@E+FJ4c; z%h?FWz9CLWX(m5rA@=GyBZK5dsZnVfT>64eYCNh<&BlbT`x(8P9eblHJVl&&doOJ7 z>xsf>v%qN9;m*fC(dKcjL%xkxNE`+b_aWqo65~>=CSQ;4#3Q=GFFWF0j;U_%mi;PC z-0EsaiL`cGQ+i*FoRhOskw;HQ4&H()4?e0XUDI;TBdcyxcaQRQhD~r~C;^es-wE9# z`l!CJ2^SJ3$bjX6HC2`yPA$`;m^MiXZ{JHW--C`2e60LrIv(T{Mm_nje%1VM1J6d^ z;?C=V@q<~ZHMzb$u=I~1-EN)l{)WTLZMSp8YXUr2abIU^migKF8p7FVVmj}dh=2=x z;3_{27CmI+t~$FV&;4EQ5!xP*;EFEx^|tmcne77w!MSujrqTm##HD~&Pzi7a+G*7k z8Zzl8rat7S1CoF_l$~@VLZkriE_%T*kI>_&E{5e-p?fVHjO1(VZ6N20Sk`5`*MDU)8z#Mt1e{%n~V@cwH&8!6EjFXhNej% z*{yzl96~)427-f=X;dTu^jxYaabGNe27X);^DqOi-9LB+L;zUkez z29gW$Ik}qe;Bq}OOPVc>H+v*ea$SbMSbV`-UdU@TJTNaMm+rA*S;1uYChhsT^T$F! 
zu7A(QbKs=UC->sTfOdPM!}!W|LBH}1e`aB1BQ5$GN%3ni`V=zCSasdzS*17wgEtY( z#B1+Sr;A$VW}=oLojJna!4)M!80zB_G*J@0i?^%RpZ}P}inavZYoQ0#KJ9K!Kt1&z z?4=MkxFo@`*XxH0j>|OPP%^+t2SMaqgGdX=0Ku&bOaRLekdo{in+wa<1J^&cW_7el zgpHFfJZ4*VX#EI42V=G2g;6@A9t7!@^3qG}DpM=;ITlF=2qb!aSHy<$-Dru?jt$d} zj(ocI;jpz)n}4=_nrz@~QSP z)hJhUFYtF{x(>hc0)FWnMsu2}nXO=U@wtVeC%+P+{Qmw|l|oNf(%etR=YF3C(>x&{ zOeejxjE5hjjWv!zTl$e{vwOgXnB;p8(blQ;u%|a(VlR`~^8^0K4T{ZgzvrjHB~#He zexJ2}6PKA1V^qp%;>T83K#!^Q@rcI>Is0Z_a|cu^dt4Q3WfLWGFH6=SqPYKRYp94O+Q+hZS z&W^9p#oP#-r|w2-ekci2W9VJ1iG>LvRQs|xKdnr*F}48}0w5i~BiE*CQCtTW(BB`M zv_%OLYAC+Y5}Z=0t9S|#FzL#4cQ>E27?2IFQ*$=+6!Ku0Zb1BEqR=@z-2vor%PZ-C z?0VMjmW)vbk)$Gw<2?2Kf73x9mQwEdAIGV7Ywo6Nsse=Ac3gs&qC8)nEaNrO0$XJ%6%@o=dB-o;^#?BEYWPCsC+D0n&Z`}_KtEIJ8)+6e zRVltCez2#K+}0xXa@wlxEJ%7J8-SYwcJl1VI9&O0tajRC6w=9~&jYvzgQ7l1Txc3W zbklkh8_b`M(k|XR^4$_U6btvwr)ANg3YD)Z{f5gHS`_AqJ5>s2AlF%3>7tv06d^}Ej#(NbU2%ZufXd|3#)N1fcl2;I z{|M>X(Yg0nxkT{+OV5nM1SzHX?Nn#HO|(tgBo7QA!Xe7Yc& z4&BuPJ#Yq1x;`p8_J#RNeLd&7+z^hl&e1k1-U#MPAL`!8QfBzwq8;kJ^n|sGhf{)n z6|ErwP7a@c{ZU$5?NAehY?ONCB@c9F(--uiG(Eku#dy>~3 zJlH}!gBhJP@Gr_YHj#~4CX!#n9a3h6nXAUmgjFqPH4pF)!NdaD7V}b24iW!&@E7{M z&XjI_M>M@cUB@4{xG?l$IMt4^Bt{6~@%jEheAn<<^=T^oY1)6z?LWQfJ8W#}NU7h8 ztB%)T;`gYG7j4i$w>gZi#Y+1EW?@V!IiAtOZ!;>AUi)es_CU%0jEH zNE>$t)bP`oA(KR(&7*?pQ@NL*xR_3V;q-WC@Kcsp)LGEvDW~tcxKrPbTbi(6|KhMq|8PWq{NR4Z&?%-S z_D#?*W_v}g{iXncGD}|j2#5;%_n*Z6Slkhg(8_6xT59X>$np$K4cB&)YbM`9nAkD6 z5y6i-8e`wEhtfsIkN9$it-95n@LeS`KnkMbyeB*wEsA|fnwS>0tHeIfZo8Zr&u+$i z1IRl4Vi&OSM_E2q2MJak4|XW@^lz-JD{22++U7OrsJl`)HEX1{b+y-Sfgy^eT?*UA zaOMpCO7%Ae&I%u;$!(~WfeSI=bg%$*sK+4OM&wE*G4-GpLUda55xi{ zcqFrs%=kz|2+2#+*Ov`^s0jPHSP5k`3+WE*8X(-m){Tn(*jdevv)(Uz1N3%>&J8}Kj4 z*WGRF^-9^doGG4m$Flq_#gJYgq= zI33HGty|=THD!#WhZhY<%hmP7SL*^82-U6yNdUpDI zjP*~&7RTu~4C)Lw#8B`ezt-CCt*`Suplk)-1|!N39&881M18C`{Z@TSnVS*0ZNHIZ zMH1#-D4}9?HeKGTm2PgLNiwc?P9|Mi4Jq@_Gg>TZjGbc7 zvPp77ireEB!tsS#d3J)i6YV^`iI-4R9fRkEH%4q19m(6j3jo~Fj(&Jhqw+nV-4Vd! 
z5}UMU_{dGaMunT2lnLcmT9{iE%95a9D#j`M7`CM210sQNoIo+-oNNO$mOlN*!}UKV zzko1uQm*IO&#Ue^vnfC`RBWk1d0Gm*71=im4k@pV&$L`5W)zpXNq58%JqeSUTqwSFMn>l)~8 z%_waW$Oh(sbqa?uG)kG2FWvplYgDdHiBFr66xR!kQ4J6$uFxEyUD*0V$8>V@cMDq$ z1+X17O1NW{tDv#s%9&eP(qFPyj3nH_qEzss8!fL_b_ydJ@4aa(ffDKg)zIPV4(-Hr zxBYB_Bj^?4_m4R8(6)tn;3&|2{AAmwWD);LLpx+7?a!Q7NOwphbE}d2tA)El zT`v7gC&y26=WnenK?6w794>yY03;*JC^7thrUJq0&Wp!9d}h?Bdr?Q{fQGND++h>$BkgcRPN-xQ zxvTqCQfiYL_=FFQlRR1{d}OImv1>bi$b_&*<+KQQm!@<+*@%I}yAK&Qe|*TAIkU&D zAU0-0!u-elMA8{}HX;m*82FV&nuHnN7G)2Z*&PioDN3`&cb%Or?gU4Lc~V*#7Qz7i zM_d1Ty|CEr?Ah%kL~Dv1@N1{u*BMRfTnGe4rwXi9^D1zy6H?C46?m4ioBol~r8UTr zKihnziB%XqW56s3Z$nRd7VaA`UxAyHymo@FjJy5i+&M$oEz4-XWvWsYfVuI4mp^)) zX!KUb&-s#*XD{~}*Ma}j_L!tcWy0ebY|>Qytg6qr7Bp$p_#1QffW~yVVv9W?*zQ-L zr+r@N8}Bx?XEc1>G&KTQSq<)Nm-hmLa05l+87Z0z`%l*&EdL^}?M-yXpAastv{g4+ zt~w5W&0wnpxS{U;x~KAGR1Z#@%hZ_d?t*;GK=A*6IJ!-oIQj@@@q%MsLQB3 znW-A@*)ee7A`Eyw8t`5Nw=95Dy!<+{vkm65Ca)$}x*h&yH^_i59xnH5yVYO$FuV)H zD(x%x{7&yYsN%+NQH#VrKQ~Tia3yizUZ|>(*@l&NEy5L7QW?y77S_JDFkHBOaYn30 z%DYhak>|1p4N0ZNUDi(=IhbUv{iOhp7EVst-25L}lta=m?9pAqeYUF|dd$MZBVUTL z{bN%16-|@^!qo+y2(iTnXb{`ytu>Q z&Me|FhXH<5w(in#cRz^#ky6na3j#yD*dg39oKt`ERA~8#_^$$L#>aH;FoUIiYqxK# zsMH4dGx9?P`JYcFVptu%4Sv9g$B9lV`#h^M$y}sC<3{oEOxZ1?9g_KR0@xT&T!ikC zYFN4`fAU1LmL>qFnTWyxHz~!##Yi2$c6GP89-593eOopm_wDuKs7o}EXc?dfRmfAK zn%S~=VbB_$b7F!N2XO0r#AGRaXpYuZ3ERn^+`^l#9f-wb2-+1h%y-^l6Dmi3 z_tp^I_UFyp+5Nlt)v^~Ejkhww1++~KM<>xG5ouy%?|>g_FNRxOz53}n=w~aVOe4w` zU-+wUHbLdq7JxP*HI85ha-X^>!t>;z^KD>q?L}@5P71astw@w)`P}OMIT8Rdml5)} z+8Ell;gjd2$>MXC0;X8?Zcbj;t|#6$Jch1>ru&Cvh-(#oS;6>biQn3R*?(FIsc(1| z9)3{z_iov5Z|Fr{5a9om%U|exs*F5ps5nlwecPkdXbt#ItlBY;whSe8@w-r$o-nQ* z>d@o-HLt-L_9K(%K{3p3x~p%!bCP(p9F_3W;WM^u{Arur**>8Qs_#Y{M+>r_Jft`h z)Civzp4Q!H9_PtrST_Il%-8wMZRaRh?sy_XrbM;M3PnTEw+((l@o^ovT6KA_U8z$J zlGS|q|GK-*zowRG8=8Q438GR|5)e>9nj%#anhFE~r57n8Mh(3vJp=`#f+kX>7m0KN z(mRNPQF;kQAViP|!GN>?0rJAV?|1L}8{YYH&c`#qnc1`U-fOK{dfmXKyfe?3#-2s2 zLHd7-VdxqB(Ev+Ydq;CsDg%U>hhT5dIrW5JE9LtrYD#~fh&pN0%A-LTk%QE%RIo}a zvAu^TI@bzDdBik-rf1f!ypaqWW(3c#k$s-GnJ*KVGkkyx|0Sc`1{Xu`(Y|@gd^Sv} zlKrBi`z7zI4=jcQM0*frCXtINODBh=SS2@Fat^l~ZQ7QrG#jcJ-%X4*pH@v6T6%Ew za*d`~2k?dvOt{@+TAy3)ny16o7-=`4*(l8eA)=SiW}hVm8CB#1$F*EeH(ZYBq@SU1 z&VkfXG|s#gPzwtojz1D(0WA(e9bmMiTLW}7N_VXvq^0%AnvQyIk_eHyg>V{I%S3PL zd~oFxqpS%3#!vlL&(@pphso%52JnJ3^3e&aQ#IUfFLq?gf+wHyh#l@TI|>EO2ftn! zNODT#k&Spn2XA9s{_FTNl{8K>w;tNjoJJc=Y3E>ks1vji)F7f>*z3_F9hoQrHZ{Uu zide-rSMNMi;vzR_DoVL|ZN8`m#0{o_w489IYre-cKQ#Uf%Nv&tc#DgXbPRLV_Q8*g z^|dOw`^&IgsMw!N1cCoTI2JJ)+0~&}1^*nCQi=tWu+USym$`Zh0V`*di)wtz(O5?u z&CgknD~=i-2hcYUjAG{w_HEXxE#{6@rA#k24^^zs_#z8C+&wjmd(ch9dG}>v9mj*u zu1WS=&u1=b<1SaP_>vM>TGqoEt~9c5Dv^@zKiNfj*bs#o3qr=eqNsfAawf3_{hv*r zJ@5gV@x_lrj#piOf*oCT9THwB&KDy|GX!Low4! zrlEg^okDtKeiQ|5w0A4~ATGSsfu~C6B{;m=+l_%vo9)57Ic7!<9nxDpYs7`qQuiqe zb z+x9#Uy&G3=hEH~M{jpsJqK&tL{_`1lYU8~>aJc05XxGy!GNYBIBPpT9IM;&A1 zHT(d&+^@y?u(c+f|I`FRA4d8r6Mh%rh!J3Q7BomvU=bK3Hj3W^_I*`0PwQg-3pfLb zDGWXPQ1?m7@B5yd0sz=P63;1KLKTjwVm}|kE1`$py0~7Uarw)l#D>Xy%foPIW9cgE znIFRSSw&UDE03-`m8ueZ5~T3ci|stGUU@{{snD$g00M8br28&VYbWT>o;|qOk#A#W zpeY(-0R(?*nM7PP&cDeg)=a%f@`eGwf4C5O1haEGR4cP1D|goSwYcB&{ySzzK~ zB9H2vt@I>}2tiZ%lgld<8z%zp4pl|R)jbpN9}e%(*o<%SpJq3-fW63zWYc84JWF2; zMFZZ}MWec2>a@?hDDSE-?GOA#H_cda3|Bdal!SoULI1mPc38yeYV zPbEkTW^eYBTY^b0v^4AWEJA1?2RJ}CYz@c#y1!=f>r_@pmR|nJwZvgPTDzp$6gs3! 
zaw_d7x|ABp2w(2HoI&&3Z{NENaW>WPCf`kUGH{Uz%tG$ic(ZB!hXvcYf6Z@_BAiBN zkJ?sNJt=|Ms()z?6_2wT6OcKt6c4Z)dNewVQ)a*O^t*eYb`M;w^xFVEtQ%tK;ZOFY z`IU0Lc{{6``QUN$5<>3*5R}v8tw~Bg&*sCmWSo&YTvm@COS+OFTfR7}d)EN$n0iew zudLym$v*TswwS}%hN0P~(nMVv6r83No+$0Sn+3xeU`o%$mwpG;f6=2kXB}G?{&n>{sHzc$6-Lv-q@pZa-9vz7l0b0brjEE z1_N9&M|GAoC_fkG&ype--G27AXpgUl#M&4P4xOx!g#$ z(?^a2V29gLXLEF)ys}Q|$x*X_F)}glKHs*81 zuq_a@iD>m58IxO}Z#?YjwZW>~6B~_1bkMsGwp?0?IGW2bDEo`o>&Ivk;$hqS@n;qW zaOlC7QTYey;X_6CX^PWo2^VM&S5-+(3HK9M=W-?%?ZJ2V3%$NIo5oo7N}0mt1I9QA zU91c=&=cP!q$l^W>wUp;w?K8S)AIWT+SKK}#+=%t&{d1>YX)=Q1JNC=q1-dE|7MVf zw4*5xq3Sc0mOOH2uF1q+)3E}c;)HQYp~TbP#VnDZj*W@A4MpvD7P~zz2VLs+ptB^l z-m@NTWhC$KFt4RD=p_AW)Jyb^Ic^WQkvm^q^JjY%9bD>sd}awudaiK0$2Y!#P6g{A zwZg)z48wGy1CrLuh&# z(s#dM0M@Jw`N$zr7K&HelHGkCmSGVxr=`uIj`k{-V=?wQs@==-(8WYeD}dD#j-{f? z#y$pNF$kk?f1-@~lsB}_SFncbdl`pR9Q@%6GW;S8^h{@zvyXIr_T8g_vJxSN6slj*vyVwS5}-A-3MjRguO{fWr%iGcxvWtKa-(~&B_buw=XCO z4|cdq``3j4-k@#bx;$xuSL5r&w}%lSiKXfiRk7cCpxJDQnOo%<<~a!rghwE`ce!to zAo0H^G2YOywKOJeZ26!kqCF~Daxk`G8@#r5qHBp2p`>Zq@@*-$D4qtm`T@-XYG>cf zO8`nuC7!Ug9X}ikSKEul6fQ4}sbcdJcHRaBzbX>|H6Rncen+|jPna-@;W6G*n7zc> zY{M5ak z9#Z&4lPqnvzf14`Yf4P{M4HMNo&954J&0m?_^=lMP*;m!D=Q$u8}Q@UfD_mCL>1G% zyt-(bbN-yy*CYMTIo}}VZ*@^Tb*UjMJovGu%jl}2Z&NYG*BDTiM^2rhd{_`4n7Shn z;?PUcj(E&$^arZ_#kONu&_8nB-_g0z+F)zjH@t*SDsjRMTg zk2y{LWXY5B(3nTSGvwr61J9PSEX8Uhb5}Pzo0DuT9h| zy3BT+x9Kn#g1UUd13xyE?3gh^3OW%IyWO=vUu*@o>m9zpsb|oJM=?sh1q~J{p*v5?(B!9%EMo!G%h9>{i4O`&rj8-NooM98OL3$~$3WsF<)aW48 zW2F08Tj!aZ)-bm`? zIb~zc5&}TAeM*Km?KI<5d5yHKeAE(t5>nl3u%-Wk#sV#Ybee%dLmBdMR8}lhNOQRH zqy*7A-uffN5;b2^O7nB!1OIM8mSRNB*=&0z!slzl78b1 z+4DwfH~gHP#Rr%wt|9t3;*`US5b3=$**y|hxtpGG7K|uZ0b-WhKQ^zwnm6`OCWcQ| zFYl?#u+k$FY6zG-J7xcsl#$0+g36KUJB+%&L5-m8o9)keXGP!~4~!{A#Qq-A?f`htEMmPjf^pv%<_(jP zgBM~;=H@i|aCcPk)e0Bvgez5Or7@wJ2_Ha{RNW$rU&p`F=or*%04DA=?YDhO&+8nS zM#@zd_3y@q0#SaJXgP3M414m?!23!(b)2rxAXbab&>z2=vEvJ>3T=3!>p!Oo z9csQ2FRxQ1vn}X<5idx)7@VD zZKpSe*}vQ{J%&|IxzvW0)-ifwlxw*#U3CVLu)0KxK|mNa+q*5qo$`h(*aKdxwg~$z z0p#h;wi$>=R9W)%3d3c(=aQ}~P$cwxAtXDf)rp{b{dux`n{_-WH?2C@+i4<*@1!+q zK9r|$O1>#6!PX(eTppE}?%*gS{<->IPq4jTrts=|fcoh^nZ3rCE4?g6-T8ltBwO`l z(ryp)z*$LYE81U)G3vd)>CMdA-K=G%Mc7hJINyXgP&M)H8Vz`~d6NFtX@71bv^`|^ zQl=~NrDo%30mSGnDy_fRm`8##`Gg0XFOSNT^JP!aKU-gaWh(h5uF55JgCq)F9vh_9Px-C)sE>8Ii$dvc9+df+{ zl;F;tZHg`P9rIQ~Mb|;NL|SMbkg1f)!F#{@UQ>5^%qE#LL-yXzBa3P{EfDcha{y#A zxsd2z;0Yro+1A$r{Cca)N=<3^5DLLbBGXU0?y&n=u)haEMZAQ$i;T*r*(7dOjNHdm zoFtd=S)Fo!6d`MVl(#t@s?ha}4#=f?8Gx7XysM9I7`B<>Pu^rK+zHQ=*vPJk7E~TC z-7L1D)|_k!R55fyWLqn1`YgB&BO}&Q{r^JOPI#2+qmWkABnYi`azi^o7UoE{$yO2045k${`Bc^P3Z@ zp=Q;Yetc3i+#}swXg(yiF7!dYdbW=nL66n4x+CaU%=T>F?un6UV7?Szv`w8c>RU5R z)>BG{tWd~>wP*wkrac6TL8M&xIF7$~$>)a(;RQS4Ja5qMywmt-Ri^)?1)N8+sAae> zyS!b#!%@ESL&mQ4{wZ%qAfIA*CInkTCO-Xw`+h55rnGHHvoH8!Xf+6vm6^YgL?Hb+dczO|+^V?701P`{viXTmEH_GV;1sqyuP2~ zh(~!N{h)X}?w(UpBae6>+H-v5Qlqv*RGgH&OsBk_(D&ix8%&^mF-oPJY~HpEALOE9h}IRLnRMO6u`(4hNp>!Qn`?q)N*`S&C)1d8}EY2Q)Fb#W^iML3SHK*mhQ= zvFw$ZISTj#X!Q4e6fRgWXrDl}H^XOUWdUix_y9FLRnwTV;{Ui+c|jx5ywXz7y~s** zr65rXcyS)KhIQ~q5t_NMHpM%*D+UKq&%9BTqfrgeLrNy>TnM&dN#bE@A(V zZzgzkVSPuur!oOfPrzBIro}ZzBc4l`xv4&^wvT_)b#< z*69Mjh*eg>^xTZy;2BQAM8S||)#V(HOU5m)gah-_nn|?-i?S((#qd5A+#K9uSZ1+N za5iqu(m3V9HeM0$)Si`j)AZbfrbNWQ8Dl#>>3HO)_X}Xs+%6Ut45{ynuewofI=+YP zv0r%`A8N!p#f=BSpLUncq+DaO2b<%xMA#QtG_HU`!w`NZ^4D~aHMc4D*RQuJQfsA9 ztHR(inUuzJKX tuple[str, ...]: def _image_equals(a: Image.Image, b: Image.Image) -> bool: - return (np.asarray(a) == np.asarray(b.convert(a.mode))).all() + return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all() @pytest.mark.asyncio diff --git 
a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index fab44fb6062d..13c37c979dac 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -13,7 +13,6 @@ TODO: Implement CustomDataset to parse a JSON file and convert its contents into SampleRequest instances, similar to the approach used in ShareGPT. """ - import base64 import io import json @@ -33,6 +32,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.image import convert_image_mode from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer logger = logging.getLogger(__name__) @@ -259,7 +259,7 @@ def process_image(image: Any) -> Mapping[str, Any]: if isinstance(image, dict) and 'bytes' in image: image = Image.open(BytesIO(image['bytes'])) if isinstance(image, Image.Image): - image = image.convert("RGB") + image = convert_image_mode(image, "RGB") with io.BytesIO() as image_data: image.save(image_data, format="JPEG") image_base64 = base64.b64encode( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 66e78fcc4e80..f68513553846 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -23,6 +23,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, @@ -77,7 +78,7 @@ class InternVLImageEmbeddingInputs(TypedDict): def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD return T.Compose([ - T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Lambda(lambda img: convert_image_mode(img, 'RGB')), T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 91f6c7753c68..eefadda918f6 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -24,6 +24,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, @@ -78,7 +79,7 @@ class SkyworkR1VImageEmbeddingInputs(TypedDict): def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD return T.Compose([ - T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Lambda(lambda img: convert_image_mode(img, 'RGB')), T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index f6ab72f4e9b8..a5a4dcd0b6e1 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -10,6 +10,7 @@ from PIL import Image from vllm.logger import init_logger +from vllm.multimodal.image import convert_image_mode if TYPE_CHECKING: from vllm.inputs import TokensPrompt @@ -35,7 +36,8 @@ def serialize_item(cls, obj: object) -> bytes: return 
np.array(obj).tobytes() if isinstance(obj, Image.Image): - return cls.item_to_bytes("image", np.array(obj.convert("RGBA"))) + return cls.item_to_bytes("image", + np.array(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): return cls.item_to_bytes("tensor", obj.numpy()) if isinstance(obj, np.ndarray): diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 939928bbf108..a63ec0bd8ada 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -22,6 +22,25 @@ def rescale_image_size(image: Image.Image, return image +# TODO: Support customizable background color to fill in. +def rgba_to_rgb( + image: Image.Image, background_color=(255, 255, 255)) -> Image.Image: + """Convert an RGBA image to RGB with filled background color.""" + assert image.mode == "RGBA" + converted = Image.new("RGB", image.size, background_color) + converted.paste(image, mask=image.split()[3]) # 3 is the alpha channel + return converted + + +def convert_image_mode(image: Image.Image, to_mode: str): + if image.mode == to_mode: + return image + elif image.mode == "RGBA" and to_mode == "RGB": + return rgba_to_rgb(image) + else: + return image.convert(to_mode) + + class ImageMediaIO(MediaIO[Image.Image]): def __init__(self, *, image_mode: str = "RGB") -> None: @@ -32,7 +51,7 @@ def __init__(self, *, image_mode: str = "RGB") -> None: def load_bytes(self, data: bytes) -> Image.Image: image = Image.open(BytesIO(data)) image.load() - return image.convert(self.image_mode) + return convert_image_mode(image, self.image_mode) def load_base64(self, media_type: str, data: str) -> Image.Image: return self.load_bytes(base64.b64decode(data)) @@ -40,7 +59,7 @@ def load_base64(self, media_type: str, data: str) -> Image.Image: def load_file(self, filepath: Path) -> Image.Image: image = Image.open(filepath) image.load() - return image.convert(self.image_mode) + return convert_image_mode(image, self.image_mode) def encode_base64( self, @@ -51,7 +70,7 @@ def encode_base64( image = media with BytesIO() as buffer: - image = image.convert(self.image_mode) + image = convert_image_mode(image, self.image_mode) image.save(buffer, image_format) data = buffer.getvalue() diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index a35d32999991..f1c6407e1f3a 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -33,6 +33,8 @@ Unpack) from transformers.tokenization_utils_base import PreTokenizedInput, TextInput +from vllm.multimodal.image import convert_image_mode + __all__ = ['OvisProcessor'] IGNORE_ID = -100 @@ -361,8 +363,8 @@ def _get_best_grid(img, side): # pick the partition with maximum covering_ratio and break the tie using #sub_images return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0] - if convert_to_rgb and image.mode != 'RGB': - image = image.convert('RGB') + if convert_to_rgb: + image = convert_image_mode(image, 'RGB') sides = self.get_image_size() From c6b636f9fbfd0308ee8d883afd8ccd7ef823eb25 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 23 May 2025 03:05:44 +0100 Subject: [PATCH 071/192] [V1][Spec Decoding] Use model_loader.get_model() to load models (#18273) Signed-off-by: Mark McLoughlin --- tests/v1/spec_decode/test_eagle.py | 58 ++----------------- vllm/model_executor/model_loader/__init__.py | 13 ++++- .../model_loader/base_loader.py | 3 +- .../model_loader/bitsandbytes_loader.py | 5 +- .../model_loader/default_loader.py | 7 ++- .../model_loader/dummy_loader.py | 4 
+- .../model_loader/gguf_loader.py | 4 +- .../model_loader/runai_streamer_loader.py | 5 +- .../model_loader/sharded_state_loader.py | 4 +- .../model_loader/tensorizer_loader.py | 4 +- vllm/model_executor/model_loader/utils.py | 4 +- vllm/model_executor/models/llama_eagle.py | 6 +- vllm/model_executor/models/llama_eagle3.py | 11 ++-- vllm/model_executor/models/medusa.py | 5 +- vllm/v1/spec_decode/eagle.py | 38 ++---------- vllm/v1/spec_decode/medusa.py | 23 ++------ 16 files changed, 59 insertions(+), 135 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 7d93a44c5059..e000d955cfc0 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -117,34 +117,13 @@ def test_prepare_inputs(): ]) @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group') @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config') -@mock.patch('vllm.v1.spec_decode.eagle.ModelRegistry') -@mock.patch('vllm.v1.spec_decode.eagle.get_model_loader') -@mock.patch('vllm.v1.spec_decode.eagle.set_default_torch_dtype') -@mock.patch('vllm.v1.spec_decode.eagle.set_current_vllm_config') -def test_load_model(mock_set_config, mock_set_dtype, mock_get_loader, - mock_registry, mock_get_layers, mock_get_pp_group, method, +@mock.patch('vllm.v1.spec_decode.eagle.get_model') +def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, proposer_helper, draft_model_dir, target_attribute_path): - # Setup mock for model class - mock_model_cls = mock.MagicMock() - mock_registry.resolve_model_cls.return_value = (mock_model_cls, - "test_arch") - - # Create a real context manager for mocks - class MockContextManager: - - def __init__(self): - pass - - def __enter__(self): - return None - - def __exit__(self, exc_type, exc_val, exc_tb): - return False - - # Make the mocks return actual context manager objects - mock_set_dtype.return_value = MockContextManager() - mock_set_config.return_value = MockContextManager() + # Setup model mock + mock_model = mock.MagicMock() + mock_get_model.return_value = mock_model # Setup mocks for attention layers target_attn_layers = { @@ -164,25 +143,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): mock_pp_group.world_size = 2 if method == "eagle" else 1 mock_get_pp_group.return_value = mock_pp_group - # Setup model loader mock - mock_loader = mock.MagicMock() - mock_get_loader.return_value = mock_loader - - # Setup model mock - mock_model = mock.MagicMock() - mock_model_cls.return_value = mock_model - mock_model.to.return_value = mock_model - - # Configure mock to test the attribute sharing path - if method == "eagle": - # For eagle, test the lm_head path - mock_model.load_weights.return_value = { - "model.embed_tokens.weight": torch.zeros(1) - } - else: - # For eagle3, test the embed_tokens path - mock_model.load_weights.return_value = {} - # Setup target model with the appropriate attributes target_model = mock.MagicMock() @@ -204,13 +164,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): proposer.load_model(target_model) # Verify common interactions - mock_get_loader.assert_called_once() - mock_model_cls.assert_called_once() - mock_model.to.assert_called_once() - mock_model.load_weights.assert_called_once() - - # Verify the loader was called with the right config - mock_get_loader.assert_called_once_with(proposer.vllm_config.load_config) + mock_get_model.assert_called_once() # Verify the specific attribute sharing based on the method if method == "eagle": diff --git 
a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 92a0b0923b6e..a443a652d8a3 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +from typing import Optional + from torch import nn -from vllm.config import LoadConfig, LoadFormat, VllmConfig +from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.bitsandbytes_loader import ( BitsAndBytesModelLoader) @@ -47,9 +49,14 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader: return DefaultModelLoader(load_config) -def get_model(*, vllm_config: VllmConfig) -> nn.Module: +def get_model(*, + vllm_config: VllmConfig, + model_config: Optional[ModelConfig] = None) -> nn.Module: loader = get_model_loader(vllm_config.load_config) - return loader.load_model(vllm_config=vllm_config) + if model_config is None: + model_config = vllm_config.model_config + return loader.load_model(vllm_config=vllm_config, + model_config=model_config) __all__ = [ diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py index f17cab05c25d..010dd515784a 100644 --- a/vllm/model_executor/model_loader/base_loader.py +++ b/vllm/model_executor/model_loader/base_loader.py @@ -18,6 +18,7 @@ def download_model(self, model_config: ModelConfig) -> None: raise NotImplementedError @abstractmethod - def load_model(self, *, vllm_config: VllmConfig) -> nn.Module: + def load_model(self, *, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: """Load a model with the given configurations.""" raise NotImplementedError diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 6771c128c5a1..0d83c8d53419 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -569,10 +569,9 @@ def _load_weights(self, model_config: ModelConfig, def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: device_config = vllm_config.device_config - model_config = vllm_config.model_config - with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 21eb7d8a75fb..ddbd60940e9e 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -264,13 +264,14 @@ def download_model(self, model_config: ModelConfig) -> None: fall_back_to_pt=True, allow_patterns_overrides=None) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: device_config = vllm_config.device_config - model_config = vllm_config.model_config target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): with target_device: - model = initialize_model(vllm_config=vllm_config) + model = initialize_model(vllm_config=vllm_config, + model_config=model_config) weights_to_load = {name for 
name, _ in model.named_parameters()} loaded_weights = model.load_weights( diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py index 5047a161f3f9..0e2f0be1ec26 100644 --- a/vllm/model_executor/model_loader/dummy_loader.py +++ b/vllm/model_executor/model_loader/dummy_loader.py @@ -22,9 +22,9 @@ def __init__(self, load_config: LoadConfig): def download_model(self, model_config: ModelConfig) -> None: pass # Nothing to download - def load_model(self, vllm_config: VllmConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: device_config = vllm_config.device_config - model_config = vllm_config.model_config target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): with target_device: diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 2766c9787b83..806004bf9604 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -92,9 +92,9 @@ def _get_weights_iterator( def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: device_config = vllm_config.device_config - model_config = vllm_config.model_config local_model_path = self._prepare_weights(model_config.model) gguf_weights_map = self._get_gguf_weights_map(model_config) # we can only know if tie word embeddings after mapping weights diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index a695ba03bd1d..9f1022c25925 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -100,11 +100,10 @@ def download_model(self, model_config: ModelConfig) -> None: """Download model if necessary""" self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: """Perform streaming of the model to destination""" device_config = vllm_config.device_config - model_config = vllm_config.model_config - target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): with target_device: diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 913bda7e007a..78bca89f0015 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -100,9 +100,9 @@ def _prepare_weights(self, model_name_or_path: str, def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, vllm_config: VllmConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: device_config = vllm_config.device_config - model_config = vllm_config.model_config target_device = torch.device(device_config.device) from vllm.distributed import get_tensor_model_parallel_rank diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 
ac9ef6164388..26f8c0946b0a 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -93,8 +93,8 @@ def download_model(self, model_config: ModelConfig) -> None: with self.tensorizer_config.open_stream(): pass - def load_model(self, vllm_config: VllmConfig) -> nn.Module: - model_config = vllm_config.model_config + def load_model(self, vllm_config: VllmConfig, + model_config: ModelConfig) -> nn.Module: parallel_config = vllm_config.parallel_config self._verify_config(model_config, parallel_config) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 68b1f1ad74d3..39e380f07297 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -42,9 +42,11 @@ def initialize_model( *, prefix: str = "", model_class: Optional[type[nn.Module]] = None, + model_config: Optional[ModelConfig] = None, ) -> nn.Module: """Initialize a model with the given configurations.""" - model_config = vllm_config.model_config + if model_config is None: + model_config = vllm_config.model_config if model_class is None: model_class, _ = get_model_architecture(model_config) diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 018ecc2a8c0f..172dc8b5ec06 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -130,13 +130,15 @@ def load_weights(self, weights: Iterable[tuple[str, class EagleLlamaForCausalLM(LlamaForCausalLM): - def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) self.config = vllm_config. \ speculative_config.draft_model_config.hf_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) self.model = LlamaModel(vllm_config=vllm_config, prefix="model", - start_layer_id=start_layer_id) + start_layer_id=target_layer_num) logit_scale = getattr(self.config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.config.vocab_size, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 2302d1352de6..96e666a3543d 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -175,13 +175,15 @@ def load_weights(self, weights: Iterable[tuple[str, class Eagle3LlamaForCausalLM(LlamaForCausalLM): - def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): nn.Module.__init__(self) self.config = vllm_config. 
\ speculative_config.draft_model_config.hf_config + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) self.model = LlamaModel(vllm_config=vllm_config, - start_layer_id=start_layer_id, - prefix="model") + prefix="model", + start_layer_id=target_layer_num) logit_scale = getattr(self.config, "logit_scale", 1.0) self.lm_head = ParallelLMHead( @@ -193,8 +195,7 @@ def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): self.logits_processor = LogitsProcessor(self.config.draft_vocab_size, scale=logit_scale) self.draft_id_to_target_id = nn.Parameter( - torch.zeros((self.config.draft_vocab_size), - dtype=torch.long).type(torch.LongTensor), + torch.zeros(self.config.draft_vocab_size, dtype=torch.long), requires_grad=False, ) diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 588bcb628f8c..95ef1134b1bf 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -51,10 +51,7 @@ class Medusa(nn.Module): needs to have truncated_vocab_size (=k) as an attribute.""" def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: - if hasattr(vllm_config, 'draft_model_config'): - config = vllm_config.draft_model_config.hf_config - else: - config = vllm_config.model_config.hf_config + config = vllm_config.speculative_config.draft_model_config.hf_config super().__init__() self.config = config self.blocks = nn.ModuleList([ diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 19fb2a2af7dd..460d645a1a6c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -4,14 +4,11 @@ from vllm.attention.layer import Attention from vllm.config import (CompilationLevel, VllmConfig, - get_layers_from_vllm_config, set_current_vllm_config) + get_layers_from_vllm_config) from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger -from vllm.model_executor.model_loader import get_model_loader -from vllm.model_executor.model_loader.utils import ( - process_weights_after_loading, set_default_torch_dtype) -from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.model_loader import get_model from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.triton_utils import tl, triton from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata @@ -280,51 +277,28 @@ def prepare_inputs( return cu_num_tokens, token_indices def load_model(self, target_model: nn.Module) -> None: - loader = get_model_loader(self.vllm_config.load_config) - target_layer_num = self.vllm_config.model_config.get_num_layers( - self.vllm_config.parallel_config) + draft_model_config = \ + self.vllm_config.speculative_config.draft_model_config target_attn_layer_names = set( get_layers_from_vllm_config(self.vllm_config, Attention).keys()) - draft_model_config = \ - self.vllm_config.speculative_config.draft_model_config - # FIXME(lily): This does not handle with distributed inference. - target_device = self.vllm_config.device_config.device - # We need to set the vllm_config here to register attention - # layers in the forward context. 
- with set_default_torch_dtype( - draft_model_config.dtype), set_current_vllm_config( - self.vllm_config): - draft_model_cls, arch = ModelRegistry.resolve_model_cls( - draft_model_config.architectures) - self.model = draft_model_cls( - vllm_config=self.vllm_config, - start_layer_id=target_layer_num).to(target_device) + self.model = get_model(vllm_config=self.vllm_config, + model_config=draft_model_config) draft_attn_layer_names = ( get_layers_from_vllm_config(self.vllm_config, Attention).keys() - target_attn_layer_names) assert len(draft_attn_layer_names) == 1 self.attn_layer_name = next(iter(draft_attn_layer_names)) - loaded_weights = self.model.load_weights( - loader.get_all_weights(draft_model_config, self.model)) - - process_weights_after_loading(self.model, draft_model_config, - target_device) # share embed_tokens with the target model if needed if get_pp_group().world_size == 1: - assert "model.embed_tokens.weight" not in loaded_weights, \ - "For PP = 1, Eagle draft should share embed with target model" logger.info( "The EAGLE head shares the same vocab embedding" \ " with the target model." ) self.model.model.embed_tokens = target_model.model.embed_tokens else: - assert "model.embed_tokens.weight" in loaded_weights, \ - "For PP > 1, Eagle draft checkpoint should its own copy of " - " the model.embed_tokens.weight" logger.info( "Since PP > 1, the EAGLE head loaded its own vocab embedding" \ " weights instead of sharing them with the target model." diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 14bc9c9e0d1a..fdac2ef64c3f 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -3,12 +3,10 @@ import torch import torch.nn as nn -from vllm.config import VllmConfig, set_current_vllm_config +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.logger import init_logger -from vllm.model_executor.model_loader import get_model_loader -from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.models.medusa import Medusa +from vllm.model_executor.model_loader import get_model from vllm.v1.sample.metadata import SamplingMetadata # Initialize logger @@ -49,20 +47,9 @@ def propose( return [list(row) for row in zip(*draft_tokens)] def load_model(self, target_model: nn.Module) -> None: - # Get model loader and config - loader = get_model_loader(self.vllm_config.load_config) - draft_config = self.vllm_config.speculative_config.draft_model_config - - # Load model with proper dtype and config - with set_default_torch_dtype(draft_config.dtype), \ - set_current_vllm_config(self.vllm_config): - self.model = Medusa( - vllm_config=self.vllm_config.speculative_config).to( - self.device) - - # Load model weights - weights = loader.get_all_weights(draft_config, self.model) - self.model.load_weights(weights) + self.model = get_model(vllm_config=self.vllm_config, + model_config=self.vllm_config. 
+ speculative_config.draft_model_config) @torch.inference_mode() def dummy_run(self, num_tokens: int) -> None: From 4b0da7b60e35bbeb2cbd656c54966ee2ceb4c4bd Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 23 May 2025 04:12:08 +0200 Subject: [PATCH 072/192] Enable hybrid attention models for Transformers backend (#18494) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/contributing/model/basic.md | 2 +- tests/models/test_transformers.py | 56 +++++++++++++++------ vllm/config.py | 19 +++++--- vllm/model_executor/models/transformers.py | 57 +++++++++++++++++++--- 4 files changed, 105 insertions(+), 29 deletions(-) diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index ad31995f76be..1fa56dc4728d 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -117,7 +117,7 @@ For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `m To support a model with interleaving sliding windows, we need to take care of the following details: -- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model. +- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model. - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). With these two steps, interleave sliding windows should work with the model. 
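Note: the two steps above amount to computing a per-layer window from `sliding_window_pattern` and handing it to the attention layer via `per_layer_sliding_window`. A minimal sketch of that selection logic follows, assuming a config object that exposes `interleaved_sliding_window` and `sliding_window_pattern` as in this patch; the helper name `pick_layer_windows` and the example values are illustrative only, not part of the patch.

```python
# Sketch of per-layer sliding-window selection for interleaved attention.
# Assumes `config` exposes `interleaved_sliding_window` and
# `sliding_window_pattern`, mirroring the Transformers-backend change below.
from types import SimpleNamespace
from typing import Optional


def pick_layer_windows(config, num_layers: int) -> list[Optional[int]]:
    windows: list[Optional[int]] = []
    for i in range(num_layers):
        # Every `sliding_window_pattern`-th layer uses full attention
        # (window=None); all other layers use the sliding window.
        if (i + 1) % config.sliding_window_pattern > 0:
            windows.append(config.interleaved_sliding_window)
        else:
            windows.append(None)
    return windows


# Example with a Gemma-2-style pattern of 2 (window size is illustrative):
cfg = SimpleNamespace(interleaved_sliding_window=4096,
                      sliding_window_pattern=2)
print(pick_layer_windows(cfg, num_layers=6))
# [4096, None, 4096, None, 4096, None]
```

Each entry would then be passed as `per_layer_sliding_window` when constructing the corresponding `Attention` layer, which is what `create_attention_instances` does in the `transformers.py` change below.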
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 6e38c4c7cadb..1a51b4aeab04 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,37 +1,50 @@ # SPDX-License-Identifier: Apache-2.0 """Test the functionality of the Transformers backend.""" +from typing import Any, Optional, Union + import pytest from vllm.platforms import current_platform from ..conftest import HfRunner, VllmRunner +from ..core.block.e2e.test_correctness_sliding_window import prep_prompts from ..utils import multi_gpu_test from .utils import check_logprobs_close def check_implementation( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], + runner_ref: type[Union[HfRunner, VllmRunner]], + runner_test: type[VllmRunner], example_prompts: list[str], model: str, + kwargs_ref: Optional[dict[str, Any]] = None, + kwargs_test: Optional[dict[str, Any]] = None, **kwargs, ): + if kwargs_ref is None: + kwargs_ref = {} + if kwargs_test is None: + kwargs_test = {} + max_tokens = 32 num_logprobs = 5 - with vllm_runner(model, **kwargs) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) + args = (example_prompts, max_tokens, num_logprobs) + + with runner_test(model, **kwargs_test, **kwargs) as model_test: + outputs_test = model_test.generate_greedy_logprobs(*args) - with hf_runner(model) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) + with runner_ref(model, **kwargs_ref) as model_ref: + if isinstance(model_ref, VllmRunner): + outputs_ref = model_ref.generate_greedy_logprobs(*args) + else: + outputs_ref = model_ref.generate_greedy_logprobs_limit(*args) check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", + outputs_0_lst=outputs_ref, + outputs_1_lst=outputs_test, + name_0="ref", + name_1="test", ) @@ -58,6 +71,18 @@ def test_models( model_impl=model_impl) +def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None: + prompts, _, _ = prep_prompts(4, (800, 801)) + kwargs_ref = {"max_model_len": 8192, "enforce_eager": True} + kwargs_test = {"model_impl": "transformers", **kwargs_ref} + check_implementation(vllm_runner, + vllm_runner, + prompts, + model="hmellor/tiny-random-Gemma2ForCausalLM", + kwargs_ref=kwargs_ref, + kwargs_test=kwargs_test) + + @multi_gpu_test(num_gpus=2) def test_distributed( hf_runner: type[HfRunner], @@ -65,8 +90,11 @@ def test_distributed( example_prompts, ): kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2} - check_implementation(hf_runner, vllm_runner, example_prompts, - "meta-llama/Llama-3.2-1B-Instruct", **kwargs) + check_implementation(hf_runner, + vllm_runner, + example_prompts, + "meta-llama/Llama-3.2-1B-Instruct", + kwargs_test=kwargs) @pytest.mark.skipif( diff --git a/vllm/config.py b/vllm/config.py index 1c916915a046..b1bf623f9afe 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -533,13 +533,17 @@ def __post_init__(self) -> None: self.model, hf_token=self.hf_token, revision=self.revision) self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype) - interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"] + # Workaround for Gemma 2 which uses interleaved sliding window + # attention, but it's not specified in its config. TODO: remove this + # when Gemma 2 is fixed in Transformers. 
+ if self.hf_text_config.model_type == "gemma2": + self.hf_text_config.sliding_window_pattern = 2 + sliding_window = getattr(self.hf_text_config, "sliding_window", None) - has_interleaved_attention = (sliding_window is not None) and ( - isinstance(sliding_window, list) or - (self.hf_text_config.model_type in interleaved_attn_models)) + sliding_window_pattern = getattr(self.hf_text_config, + "sliding_window_pattern", None) - if (not self.disable_sliding_window and has_interleaved_attention): + if not (self.disable_sliding_window or sliding_window_pattern is None): if (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( @@ -1037,8 +1041,7 @@ def verify_with_parallel_config( if self.use_async_output_proc: self.use_async_output_proc = False - def get_hf_config_sliding_window( - self) -> Union[Optional[int], list[Optional[int]]]: + def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled.""" # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in @@ -1049,7 +1052,7 @@ def get_hf_config_sliding_window( return None return getattr(self.hf_text_config, "sliding_window", None) - def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: + def get_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled. """ # If user disables sliding window, return None. diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index a8f30b2f27bf..b22d81d88abe 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -16,6 +16,7 @@ """Wrapper around `transformers` models""" import re from collections.abc import Iterable +from contextlib import nullcontext from typing import Literal, Optional, Union import torch @@ -110,6 +111,33 @@ def replace_linear_class( ) +class ConfigOverride: + """Context manager to temporarily override config attributes.""" + + def __init__(self, config: PretrainedConfig, **kwargs): + self.config = config + self.kwargs = kwargs + self.kwargs_original = {} + self.kwargs_delete = set() + + def __enter__(self): + """Override config attributes.""" + for key, value in self.kwargs.items(): + if not hasattr(self.config, key): + self.kwargs_delete.add(key) + self.kwargs_original[key] = getattr(self.config, key, None) + setattr(self.config, key, value) + return self.config + + def __exit__(self, exc_type, exc_value, traceback): + """Restore original config attributes.""" + for key, value in self.kwargs_original.items(): + if key in self.kwargs_delete: + delattr(self.config, key) + else: + setattr(self.config, key, value) + + class TransformersModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -135,8 +163,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pp_rank = self.pp_group.rank_in_group self.tp_size = get_tensor_model_parallel_world_size() + # vLLM handles interleaved sliding window attention by creating a new + # interleaved_sliding_window attribute and deleting the sliding_window + # attribute. This breaks the constructors in Transformers so we + # temporarily add the attribute back to construct the model. 
+ config_override = nullcontext() + if hasattr(config, "interleaved_sliding_window"): + config_override = ConfigOverride( + config, sliding_window=config.interleaved_sliding_window) + # Use meta device to delay allocating GPU tensors - with torch.device("meta"): + with torch.device("meta"), config_override: # FIXME(Isotr0py): We need to refactor this part in the future to # avoid registering an extra model layer, otherwise we will need a # weights mapper to rename weights. @@ -262,9 +299,17 @@ def create_attention_instances(self) -> dict[int, Attention]: num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) start, end = get_pp_indices(self.config.num_hidden_layers, self.pp_rank, self.pp_size) - return { - i: - Attention( + + attention_instances = {} + for i in range(start, end): + # Handle interleaved sliding window attention + sliding_window = None + if (hasattr(self.config, "interleaved_sliding_window") + and hasattr(self.config, "sliding_window_pattern") + and ((i + 1) % self.config.sliding_window_pattern > 0)): + sliding_window = self.config.interleaved_sliding_window + + attention_instances[i] = Attention( num_heads=num_heads, head_size=head_size, # NOTE: We use Llama scale as default, if it's set by @@ -273,9 +318,9 @@ def create_attention_instances(self) -> dict[int, Attention]: num_kv_heads=num_kv_heads, cache_config=self.cache_config, quant_config=self.quant_config, + per_layer_sliding_window=sliding_window, prefix=f"{i}.attn") - for i in range(start, end) - } + return attention_instances def init_buffers(self, module: nn.Module): """ From fae453f8ce5cf6cb43f464068eb0e201669e11d7 Mon Sep 17 00:00:00 2001 From: CYJiang <86391540+googs1025@users.noreply.github.com> Date: Fri, 23 May 2025 10:15:32 +0800 Subject: [PATCH 073/192] [Misc] refactor: simplify input validation and num_requests handling in _convert_v1_inputs (#18482) Signed-off-by: googs1025 --- vllm/entrypoints/llm.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 053ee55bb6a8..52b50229b8d1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1306,27 +1306,25 @@ def _convert_v1_inputs( ): # skip_tokenizer_init is now checked in engine + if prompts is None and prompt_token_ids is None: + raise ValueError( + "Either prompts or prompt_token_ids must be provided.") + if prompts is not None and prompt_token_ids is not None \ + and len(prompts) != len(prompt_token_ids): + raise ValueError( + "The lengths of prompts and prompt_token_ids must be the same." 
+ ) + if prompts is not None: prompts = [p["content"] for p in parse_and_batch_prompt(prompts)] if prompt_token_ids is not None: prompt_token_ids = [ p["content"] for p in parse_and_batch_prompt(prompt_token_ids) ] - - num_requests = None if prompts is not None: num_requests = len(prompts) - if prompt_token_ids is not None: - if (num_requests is not None - and num_requests != len(prompt_token_ids)): - raise ValueError("The lengths of prompts and prompt_token_ids " - "must be the same.") - + elif prompt_token_ids is not None: num_requests = len(prompt_token_ids) - if num_requests is None: - raise ValueError("Either prompts or prompt_token_ids must be " - "provided.") - parsed_prompts: list[PromptType] = [] for i in range(num_requests): item: PromptType From 93ecb8139c2a13675273835746d025f3ffc61f6d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 22 May 2025 19:22:11 -0700 Subject: [PATCH 074/192] [BugFix] Increase TP execute_model timeout (#18558) Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 2061806e6b36..eb5f9d4bfe00 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -38,7 +38,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -EXECUTE_MODEL_TIMEOUT_S = 40 +EXECUTE_MODEL_TIMEOUT_S = 300 class MultiprocExecutor(Executor): From e44d8ce8c77186e8242d463d14219b405a2e109e Mon Sep 17 00:00:00 2001 From: lkchen Date: Thu, 22 May 2025 19:54:42 -0700 Subject: [PATCH 075/192] [Bugfix] Set `KVTransferConfig.engine_id` in post_init (#18576) Signed-off-by: Linkun Chen --- tests/v1/kv_connector/unit/test_multi_connector.py | 8 ++++++++ vllm/config.py | 5 ++++- .../kv_transfer/kv_connector/v1/nixl_connector.py | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py index 64da0d79bf33..a21d92c52244 100644 --- a/tests/v1/kv_connector/unit/test_multi_connector.py +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -239,3 +239,11 @@ def get_connector_events() -> dict[str, list[str]]: print(f"[ERROR] Could not read connector events for {name}: {e}") return connector_events + + +def test_engine_id_conflict(): + configs = [KVTransferConfig() for _ in range(2)] + ids = [config.engine_id for config in configs] + assert ids[0] != ids[1], ( + "Engine IDs should be different for different configs. " + f"Got {ids}") diff --git a/vllm/config.py b/vllm/config.py index b1bf623f9afe..50adfe8f2d78 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3495,7 +3495,7 @@ class KVTransferConfig: """The KV connector for vLLM to transmit KV caches between vLLM instances. """ - engine_id: str = str(uuid.uuid4()) + engine_id: Optional[str] = None """The engine id for KV transfers.""" kv_buffer_device: Optional[str] = "cuda" @@ -3552,6 +3552,9 @@ def compute_hash(self) -> str: return hash_str def __post_init__(self) -> None: + if self.engine_id is None: + self.engine_id = str(uuid.uuid4()) + if self.kv_role is not None and self.kv_role not in get_args(KVRole): raise ValueError(f"Unsupported kv_role: {self.kv_role}. 
" f"Supported roles are {get_args(KVRole)}") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index b00f097110b0..6303d77ad305 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -537,6 +537,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): def add_remote_agent(self, nixl_agent_meta: NixlAgentMetadata): engine_id = nixl_agent_meta.engine_id + assert engine_id != self.engine_id, "Conflict engine id found!" if engine_id in self._remote_agents: return From 583507d13075783a12ccbd774575974d10ca4959 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Thu, 22 May 2025 23:17:39 -0400 Subject: [PATCH 076/192] [Spec Decode] Make EAGLE3 draft token ID mapping optional (#18488) Signed-off-by: Benjamin Chislett Co-authored-by: Woosuk Kwon --- vllm/model_executor/models/llama_eagle3.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 96e666a3543d..f211bfe54a7d 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -214,6 +214,9 @@ def compute_logits( ) -> Optional[torch.Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) + if self.draft_id_to_target_id is None: + return logits + base = torch.arange(self.config.draft_vocab_size, device=logits.device) targets = base + self.draft_id_to_target_id logits_new = logits.new_full(( @@ -246,4 +249,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): name = "model." + name model_weights[name] = loaded_weight - return loader.load_weights(model_weights.items()) + loaded_weights = loader.load_weights(model_weights.items()) + + if 'd2t' not in loaded_weights: + self.draft_id_to_target_id = None + + return loaded_weights From ed5d408255a11b44d70e497796bf3f4399e52a54 Mon Sep 17 00:00:00 2001 From: aws-elaineyz Date: Thu, 22 May 2025 21:26:32 -0700 Subject: [PATCH 077/192] [Neuron] Remove bypass on EAGLEConfig and add a test (#18514) Signed-off-by: Elaine Zhao --- .../scripts/hardware_ci/run-neuron-test.sh | 9 +- tests/neuron/2_core/test_eagle.py | 82 +++++++++++++++++++ tests/neuron/2_core/test_mistral.py | 6 +- vllm/config.py | 3 +- 4 files changed, 95 insertions(+), 5 deletions(-) create mode 100644 tests/neuron/2_core/test_eagle.py diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh index c0b9dd8dadba..3d294ea5f8a7 100644 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh @@ -53,4 +53,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" + /bin/bash -c " + python3 /workspace/vllm/examples/offline_inference/neuron.py; + python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; + for f in /workspace/vllm/tests/neuron/2_core/*.py; do + echo 'Running test file: '$f; + python3 -m pytest \$f -v --capture=tee-sys; + done + " \ No newline at end of 
file diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py new file mode 100644 index 000000000000..d71c88689a99 --- /dev/null +++ b/tests/neuron/2_core/test_eagle.py @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import shutil +import tempfile + +import torch +from huggingface_hub import snapshot_download +from safetensors import safe_open + +from vllm import LLM, SamplingParams + + +def patch_eagle_draft_with_lm_head(target_model_id: str, + draft_model_id: str) -> str: + # In NxDI, draft model checkpoint must include lm_head weights from target + # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com + # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html + # #eagle-checkpoint-compatibility + final_draft_dir = "/tmp/patched_eagle_draft" + + with tempfile.TemporaryDirectory() as tmp_dir: + target_dir = snapshot_download(repo_id=target_model_id, + local_dir=os.path.join( + tmp_dir, "target")) + draft_dir = snapshot_download(repo_id=draft_model_id, + local_dir=os.path.join(tmp_dir, "draft")) + + lm_head_key = "lm_head.weight" + index_path = os.path.join(target_dir, "model.safetensors.index.json") + with open(index_path) as f: + index = json.load(f) + shard_name = index["weight_map"][lm_head_key] + target_safetensor_path = os.path.join(target_dir, shard_name) + + with safe_open(target_safetensor_path, framework="pt") as f: + target_lm_head = f.get_tensor(lm_head_key) + + draft_path = os.path.join(draft_dir, "pytorch_model.bin") + draft_state_dict = torch.load(draft_path, map_location="cpu") + draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16) + torch.save(draft_state_dict, draft_path) + + shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True) + + return final_draft_dir + + +def test_eagle(): + patched_draft_path = patch_eagle_draft_with_lm_head( + target_model_id="meta-llama/Llama-2-7b-hf", + draft_model_id="yuhuili/EAGLE-llama2-chat-7B") + llm = LLM( + model="meta-llama/Llama-2-7b-hf", + speculative_config={ + "model": patched_draft_path, + "num_speculative_tokens": 5, + "max_model_len": 128 + }, + max_num_seqs=1, + max_model_len=128, + tensor_parallel_size=2, + override_neuron_config={ + "enable_eagle_speculation": True, + "enable_fused_speculation": True, + "fused_qkv": True + }, + ) + prompts = [ + "The president of the United States is", + ] + outputs = llm.generate(prompts, SamplingParams(top_k=1)) + expected_output = " the head of state and head of government of " \ + "the United States. 
The president direct" + + for output in outputs: + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert (expected_output == generated_text) + + print("Neuron Eagle speculation test passed.") diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py index cc3b53a9d7c9..3e651502d1e2 100644 --- a/tests/neuron/2_core/test_mistral.py +++ b/tests/neuron/2_core/test_mistral.py @@ -12,8 +12,7 @@ def test_mistral(): override_neuron_config={ "sequence_parallel_enabled": False, "skip_warmup": True - }, - device="neuron") + }) # Send more prompts than the compiled batch size (4) and request # varying generation lengths to test accuracy related to Neuron @@ -59,4 +58,7 @@ def test_mistral(): for expected_output, output in zip(expected_outputs, outputs): generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") assert (expected_output == generated_text) + + print("Neuron Mistral test passed.") diff --git a/vllm/config.py b/vllm/config.py index 50adfe8f2d78..5653bc79f6e3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2529,11 +2529,10 @@ def __post_init__(self): "Chunked prefill and EAGLE are not compatible " "when using V0.") - from vllm.platforms import current_platform from vllm.transformers_utils.configs.eagle import ( EAGLEConfig) if isinstance(self.draft_model_config.hf_config, - EAGLEConfig) or current_platform.is_neuron(): + EAGLEConfig): pass else: eagle_config = EAGLEConfig( From 4be2255c81528c75dd033b25ffffe1803b20311a Mon Sep 17 00:00:00 2001 From: Teruaki Ishizaki Date: Fri, 23 May 2025 13:30:47 +0900 Subject: [PATCH 078/192] [Bugfix][Benchmarks] Fix a benchmark of deepspeed-mii backend to use api_key (#17291) Signed-off-by: Teruaki Ishizaki --- benchmarks/backend_request_func.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 800d426c6d11..88616e1108c5 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -194,6 +194,11 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("completions", "profile")), ( + "OpenAI Completions API URL must end with 'completions' or 'profile'." + ) + async with aiohttp.ClientSession( trust_env=True, timeout=AIOHTTP_TIMEOUT ) as session: @@ -204,6 +209,8 @@ async def async_request_deepspeed_mii( "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. 
"top_p": 1.0, } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -215,7 +222,7 @@ async def async_request_deepspeed_mii( st = time.perf_counter() try: async with session.post( - url=request_func_input.api_url, json=payload + url=api_url, json=payload, headers=headers ) as response: if response.status == 200: parsed_resp = await response.json() From 9c1baa5bc6caedabeac1a6da57ec79b41e13056d Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Fri, 23 May 2025 12:38:50 +0800 Subject: [PATCH 079/192] [Misc] Replace `cuda` hard code with `current_platform` (#16983) Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm/distributed/parallel_state.py | 5 +++-- vllm/forward_context.py | 5 ++++- vllm/spec_decode/metrics.py | 8 ++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 51c519d8f862..f67c01889188 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1221,8 +1221,9 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): ray.shutdown() gc.collect() from vllm.platforms import current_platform - if not current_platform.is_cpu(): - torch.cuda.empty_cache() + empty_cache = current_platform.empty_cache + if empty_cache is not None: + empty_cache() try: torch._C._host_emptyCache() except AttributeError: diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 5d2d95f18d2f..3c8083e3dd0d 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -120,7 +120,10 @@ def set_forward_context(attn_metadata: Any, # we use synchronous scheduling right now, # adding a sync point here should not affect # scheduling of the next batch - torch.cuda.synchronize() + from vllm.platforms import current_platform + synchronize = current_platform.synchronize + if synchronize is not None: + synchronize() now = time.perf_counter() # time measurement is in milliseconds batchsize_forward_time[batchsize].append( diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 0bb8d602ec8f..4430da26c049 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -126,12 +126,12 @@ def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: """Copy rejection/typical-acceptance sampling metrics (number of accepted tokens, etc) to CPU asynchronously. - Returns a CUDA event recording when the copy is complete. + Returns a device event recording when the copy is complete. 
""" assert self._copy_stream is not None - self._copy_stream.wait_stream(torch.cuda.current_stream()) + self._copy_stream.wait_stream(current_platform.current_stream()) - with torch.cuda.stream(self._copy_stream): + with current_platform.stream(self._copy_stream): self._aggregate_num_accepted_tokens.copy_( self.spec_decode_sampler.num_accepted_tokens, non_blocking=True) @@ -142,7 +142,7 @@ def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: self._aggregate_num_draft_tokens = ( self.spec_decode_sampler.num_draft_tokens) - aggregate_metrics_ready = torch.cuda.Event() + aggregate_metrics_ready = current_platform.Event() aggregate_metrics_ready.record(self._copy_stream) return aggregate_metrics_ready From 60cad94b86ae1cd884a327a68864926b573bc5bc Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Fri, 23 May 2025 13:31:59 +0800 Subject: [PATCH 080/192] [Hardware] correct method signatures for HPU,ROCm,XPU (#18551) Signed-off-by: Andy Xie --- vllm/platforms/__init__.py | 10 ++++------ vllm/platforms/hpu.py | 4 ++-- vllm/platforms/rocm.py | 4 ++-- vllm/platforms/xpu.py | 14 ++++++++------ 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index b1df4fd1339b..49e502d2626c 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -42,7 +42,6 @@ def tpu_platform_plugin() -> Optional[str]: logger.debug("Confirmed TPU platform is available.") except Exception as e: logger.debug("TPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None @@ -112,7 +111,6 @@ def rocm_platform_plugin() -> Optional[str]: amdsmi.amdsmi_shut_down() except Exception as e: logger.debug("ROCm platform is not available because: %s", str(e)) - pass return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None @@ -130,7 +128,6 @@ def hpu_platform_plugin() -> Optional[str]: "habana_frameworks is not found.") except Exception as e: logger.debug("HPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None @@ -148,7 +145,6 @@ def xpu_platform_plugin() -> Optional[str]: logger.debug("Confirmed XPU platform is available.") except Exception as e: logger.debug("XPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None @@ -170,7 +166,6 @@ def cpu_platform_plugin() -> Optional[str]: except Exception as e: logger.debug("CPU platform is not available because: %s", str(e)) - pass return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None @@ -222,8 +217,11 @@ def resolve_current_platform_cls_qualname() -> str: platform_cls_qualname = func() if platform_cls_qualname is not None: activated_plugins.append(name) + logger.info("Platform plugin %s loaded.", name) + logger.warning( + "Platform plugin %s function's return value is None", name) except Exception: - pass + logger.exception("Failed to load platform plugin %s", name) activated_builtin_plugins = list( set(activated_plugins) & set(builtin_platform_plugins.keys())) diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 6f7c5a6d3cae..a8dd7df9f2e3 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -39,8 +39,8 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return True - @staticmethod - def inference_mode(): + @classmethod + def inference_mode(cls): return torch.no_grad() @classmethod diff --git 
a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 1685c65ad0b9..e1dcd9870b6c 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -217,9 +217,9 @@ def get_device_capability(cls, major, minor = torch.cuda.get_device_capability(device_id) return DeviceCapability(major=major, minor=minor) - @staticmethod + @classmethod @with_amdsmi_context - def is_fully_connected(physical_device_ids: list[int]) -> bool: + def is_fully_connected(cls, physical_device_ids: list[int]) -> bool: """ Query if the set of gpus are fully connected by xgmi (1 hop) """ diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 785fb6ce1b79..b2a6ad5d77db 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -37,15 +37,17 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, logger.info("Using IPEX attention backend.") return "vllm.attention.backends.ipex_attn.IpexAttnBackend" - @staticmethod + @classmethod def get_device_capability( - device_id: int = 0) -> Optional[DeviceCapability]: + cls, + device_id: int = 0, + ) -> Optional[DeviceCapability]: # capacity format differs from cuda's and will cause unexpected # failure, so use None directly return None - @staticmethod - def get_device_name(device_id: int = 0) -> str: + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: return torch.xpu.get_device_name(device_id) @classmethod @@ -57,8 +59,8 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: return True - @staticmethod - def inference_mode(): + @classmethod + def inference_mode(cls): return torch.no_grad() @classmethod From 4c611348a7444e3ad6d04b9f741ff239b7047633 Mon Sep 17 00:00:00 2001 From: RonaldBXu <72748153+RonaldBXu@users.noreply.github.com> Date: Fri, 23 May 2025 00:37:18 -0700 Subject: [PATCH 081/192] [V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (#18034) Signed-off-by: Ronald Xu --- vllm/transformers_utils/configs/eagle.py | 3 +-- vllm/v1/spec_decode/eagle.py | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 377523efefc3..31e3172c61eb 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -70,8 +70,7 @@ def __init__(self, if self.model is not None: for k, v in self.model.to_dict().items(): - if not hasattr(self, k): - setattr(self, k, v) + setattr(self, k, v) @classmethod def from_pretrained( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 460d645a1a6c..671b98544387 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -9,6 +9,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.triton_utils import tl, triton from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata @@ -310,7 +311,10 @@ def load_model(self, target_model: nn.Module) -> None: if self.vllm_config.speculative_config.method != "eagle3" and \ hasattr(target_model, "lm_head"): logger.info("Loading EAGLE LM head weights from the target model.") - self.model.lm_head = target_model.lm_head + if supports_multimodal(target_model): + self.model.lm_head = target_model.get_language_model().lm_head + else: + 
self.model.lm_head = target_model.lm_head @torch.inference_mode() def dummy_run( From 71ea614d4ab2e36a953f0123d63de95bac07f7d8 Mon Sep 17 00:00:00 2001 From: cascade Date: Fri, 23 May 2025 01:03:34 -0700 Subject: [PATCH 082/192] [Feature]Add async tensor parallelism using compilation pass (#17882) Signed-off-by: cascade812 --- .buildkite/test-pipeline.yaml | 1 + tests/compile/backend.py | 18 ++ tests/compile/test_async_tp.py | 248 +++++++++++++++++++++ tests/compile/test_fusion.py | 36 ++- tests/compile/test_sequence_parallelism.py | 47 ++-- vllm/compilation/collective_fusion.py | 126 +++++++++++ vllm/compilation/pass_manager.py | 3 + vllm/compilation/sequence_parallelism.py | 9 +- vllm/compilation/vllm_inductor_pass.py | 3 +- vllm/config.py | 11 +- vllm/distributed/parallel_state.py | 26 ++- 11 files changed, 472 insertions(+), 56 deletions(-) create mode 100644 tests/compile/test_async_tp.py create mode 100644 vllm/compilation/collective_fusion.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 017dba3d2d55..6a7d220bbdcf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -316,6 +316,7 @@ steps: - pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_sequence_parallelism.py + - pytest -v -s compile/test_async_tp.py - label: PyTorch Fullgraph Smoke Test # 9min mirror_hardwares: [amdexperimental, amdproduction] diff --git a/tests/compile/backend.py b/tests/compile/backend.py index a21e8eca3a6e..5a02c4e2b378 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -5,6 +5,8 @@ from torch import fx +from vllm.compilation.fx_utils import (find_specified_fn, + find_specified_fn_maybe) from vllm.compilation.inductor_pass import InductorPass from vllm.config import get_current_vllm_config @@ -44,3 +46,19 @@ def post_pass(self, graph: fx.Graph): self.graph_post_pass = deepcopy(graph) # assign by reference, will reflect the final state of the graph self.final_graph = graph + + def check_before_ops(self, ops, + find_fn=find_specified_fn, \ + find_fn_maybe=find_specified_fn_maybe, \ + ops_fully_replaced=True): + for op in ops: + find_fn(self.graph_pre_pass.nodes, op) + if ops_fully_replaced: + assert find_fn_maybe(self.graph_post_pass.nodes, op) is None + + def check_after_ops(self, ops, + find_fn=find_specified_fn, \ + find_fn_maybe=find_specified_fn_maybe): + for op in ops: + find_fn(self.graph_post_pass.nodes, op) + assert find_fn_maybe(self.graph_pre_pass.nodes, op) is None diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py new file mode 100644 index 000000000000..8e4e0ba83579 --- /dev/null +++ b/tests/compile/test_async_tp.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 + +import json + +import pytest +import torch + +import vllm.envs as envs +from vllm.compilation.collective_fusion import AsyncTPPass +from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, + PassConfig, VllmConfig) +from vllm.distributed import (tensor_model_parallel_all_gather, + tensor_model_parallel_reduce_scatter) +from vllm.distributed.parallel_state import (init_distributed_environment, + initialize_model_parallel) +from vllm.platforms import current_platform +from vllm.utils import update_environment_variables + +from ..models.registry import HF_EXAMPLE_MODELS +from ..utils import (compare_two_settings, create_new_process_for_each_test, + multi_gpu_test) +from .backend import TestBackend + +prompts = [ + "Hello, my name is", + 
"The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +class TestMMRSModel(torch.nn.Module): + + def __init__(self, hidden_size=16): + super().__init__() + self.hidden_size = hidden_size + self.gate_proj = torch.nn.Parameter(torch.empty( + (self.hidden_size * 2, hidden_size)), + requires_grad=False) + # Initialize weights + torch.nn.init.normal_(self.gate_proj, std=0.02) + + def forward(self, hidden_states): + """ + Forward pass implementing the mm + reduce scatter in the FX graph + + """ + # Reshape input + view = hidden_states.reshape(-1, self.hidden_size) + + # matrix multiplication + permute = self.gate_proj.permute(1, 0) + mm = torch.mm(view, permute) + reduce_scatter = tensor_model_parallel_reduce_scatter(mm, dim=0) + return reduce_scatter + + def ops_in_model_before(self): + return [torch.ops.vllm.reduce_scatter.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_matmul_reduce_scatter.default] + + +class TestAGMMModel(torch.nn.Module): + + def __init__(self, hidden_size=16): + super().__init__() + self.hidden_size = hidden_size + self.weight = torch.nn.Parameter(torch.empty( + (hidden_size, hidden_size)), + requires_grad=False) + # Initialize weights + torch.nn.init.normal_(self.weight, std=0.02) + + def forward(self, hidden_states): + """ + Forward pass implementing the mm + all gather in the FX graph + """ + # Reshape input + view = hidden_states.reshape(-1, self.hidden_size) + all_gather = tensor_model_parallel_all_gather(view, dim=0) + permute = self.weight.permute(1, 0) + mm = torch.mm(all_gather, permute) + return mm + + def ops_in_model_before(self): + return [torch.ops.vllm.all_gather.default] + + def ops_in_model_after(self): + return [torch.ops.symm_mem.fused_all_gather_matmul.default] + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("seq_len", [16]) +@pytest.mark.parametrize("hidden_size", [16]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], + reason="Only test on CUDA") +def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int, + hidden_size: int, dtype: torch.dtype): + num_processes = 2 + + def run_torch_spawn(fn, nprocs): + # need to use torch.mp.spawn otherwise will have problems with + # torch.distributed and cuda + torch.multiprocessing.spawn(fn, + args=(num_processes, test_model, + batch_size, seq_len, hidden_size, + dtype), + nprocs=nprocs) + + run_torch_spawn(async_tp_pass_on_test_model, num_processes) + + +def async_tp_pass_on_test_model(local_rank: int, world_size: int, + test_model_cls: torch.nn.Module, + batch_size: int, seq_len: int, + hidden_size: int, dtype: torch.dtype): + current_platform.seed_everything(0) + + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': '12345', + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # configure vllm config for SequenceParallelismPass + vllm_config = VllmConfig() + vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig( + enable_async_tp=True, ), ) + 
vllm_config.device_config = DeviceConfig(device=torch.device("cuda")) + + # this is a fake model name to construct the model config + # in the vllm_config, it's not really used. + model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" + vllm_config.model_config = ModelConfig(model=model_name, + task="auto", + tokenizer=model_name, + tokenizer_mode="auto", + trust_remote_code=True, + dtype=dtype, + seed=42) + + async_tp_pass = AsyncTPPass(vllm_config) + backend = TestBackend(async_tp_pass) + + model = test_model_cls(hidden_size) + + hidden_states = torch.randn((batch_size * seq_len, hidden_size), + dtype=dtype, + requires_grad=False) + + compiled_model = torch.compile(model, backend=backend) + compiled_model(hidden_states) + + # In pre-nodes, all gather or reduce scatter should exist, + # fused_matmul_reduce_scatter or fused_all_gather_matmul should not + backend.check_before_ops(model.ops_in_model_before(), + ops_fully_replaced=False) + + # In post-nodes, fused_matmul_reduce_scatter or \ + # fused_all_gather_matmul should exist + backend.check_after_ops(model.ops_in_model_after()) + + +@create_new_process_for_each_test() +@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"]) +@pytest.mark.parametrize("tp_size", [2]) +@pytest.mark.parametrize("async_tp_enabled", [True]) +@pytest.mark.parametrize("distributed_backend", ["mp"]) +@pytest.mark.parametrize("eager_mode", [False, True]) +def test_async_tp_pass_correctness( + model_id: str, + tp_size: int, + async_tp_enabled: bool, + distributed_backend: str, + eager_mode: bool, + num_gpus_available: int, +): + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_transformers_version(on_fail="skip") + model_info.check_available_online(on_fail="skip") + + pp_size = 1 + if num_gpus_available < tp_size: + pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") + + common_args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "8", + ] + if eager_mode: + common_args.append("--enforce-eager") + + compilation_config = { + 'level': 3, + 'compile_sizes': [2, 4, 8], + 'splitting_ops': [], + 'pass_config': { + 'enable_async_tp': async_tp_enabled + }, + } + + async_tp_env = tp_env = { + "VLLM_USE_V1": "1", + } + + aysnc_tp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--distributed-executor-backend", + distributed_backend, + "--compilation_config", + json.dumps(compilation_config), + ] + + tp_args = [ + *common_args, + "--tensor-parallel-size", + str(tp_size), + "--distributed-executor-backend", + "mp", + ] + + compare_two_settings(model_id, + aysnc_tp_args, + tp_args, + async_tp_env, + tp_env, + method="generate") diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 4d56b34bdecf..509593e7328d 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -29,6 +29,10 @@ def __init__(self, hidden_size: int, eps: float, static: bool, self.cutlass_fp8_enabled = cutlass_fp8_enabled self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] + self.key = QuantKey(dtype=FP8_DTYPE, + static=static, + per_tensor=static, + symmetric=True) if static: self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] else: @@ -59,6 +63,15 @@ def forward(self, x): y3, resid = self.norm[2](x3, resid) # use resid here return y3 + def ops_in_model_before(self): + return [QUANT_OPS[self.key]] + + def ops_in_model_after(self): + return [ + 
FUSED_OPS[FusedRMSQuantKey(self.key, False)], + FUSED_OPS[FusedRMSQuantKey(self.key, True)] + ] + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) @@ -107,25 +120,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL) - # Check substitution worked - pre_nodes = backend.graph_pre_pass.nodes - post_nodes = backend.graph_post_pass.nodes - - # static is per-tensor, dynamic is per-token - key = QuantKey(dtype=FP8_DTYPE, - static=static, - per_tensor=static, - symmetric=True) - rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)] - add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)] - fp8_quant = QUANT_OPS[key] - # In pre-nodes, fp8 quant should be there and fused kernels should not - assert find_auto_fn_maybe(pre_nodes, rms_quant) is None - assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None - find_auto_fn(pre_nodes, fp8_quant) + backend.check_before_ops(model.ops_in_model_before(), find_auto_fn, + find_auto_fn_maybe) # In post-nodes, fused kernels should be there and fp8 quant should not - find_auto_fn(post_nodes, rms_quant) - find_auto_fn(post_nodes, add_rms_quant) - assert find_auto_fn_maybe(post_nodes, fp8_quant) is None + backend.check_after_ops(model.ops_in_model_after(), find_auto_fn, + find_auto_fn_maybe) diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index 6152f171705b..2cd7ebaacec0 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -5,9 +5,7 @@ import vllm.envs as envs from vllm.compilation.fix_functionalization import FixFunctionalizationPass -from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe, - find_specified_fn, - find_specified_fn_maybe, is_func) +from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func from vllm.compilation.sequence_parallelism import SequenceParallelismPass from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig, PassConfig, VllmConfig) @@ -21,17 +19,6 @@ from ..utils import multi_gpu_test from .backend import TestBackend -OPS_IN_MODEL_BEFORE = [ - torch.ops.vllm.all_reduce.default, -] - -OPS_IN_MODEL_AFTER = [ - torch.ops.vllm.reduce_scatter.default, - torch.ops.vllm.all_gather.default, -] - -OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default] - prompts = [ "Hello, my name is", "The president of the United States is", @@ -78,6 +65,18 @@ def forward(self, hidden_states, residual): return norm_output, residual_output + def ops_in_model_before(self): + return [torch.ops.vllm.all_reduce.default] + + def ops_in_model_after(self): + return [ + torch.ops.vllm.reduce_scatter.default, + torch.ops.vllm.all_gather.default + ] + + def ops_in_model(self): + return [torch.ops._C.fused_add_rms_norm.default] + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("batch_size", [8]) @@ -156,26 +155,16 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int, compiled_model_func = torch.compile(model, backend=backend_func) compiled_model_func(hidden_states, residual) - # Check substitution worked - pre_nodes = backend_no_func.graph_pre_pass.nodes - post_nodes = backend_no_func.graph_post_pass.nodes - # In pre-nodes, all reduce should be there, # reduce scatter and all gather should not - for op in OPS_IN_MODEL_BEFORE: - find_specified_fn(pre_nodes, op) - for op in OPS_IN_MODEL_AFTER: - assert 
find_specified_fn_maybe(pre_nodes, op) is None + backend_no_func.check_before_ops(model.ops_in_model_before()) # In post-nodes, reduce scatter and all gather should be there, # all reduce should not - for op in OPS_IN_MODEL_AFTER: - find_specified_fn(post_nodes, op) - for op in OPS_IN_MODEL_BEFORE: - assert find_specified_fn_maybe(post_nodes, op) is None + backend_no_func.check_after_ops(model.ops_in_model_after()) # check if the functionalization pass is applied - for op in OPS_IN_MODEL: + for op in model.ops_in_model(): find_auto_fn(backend_no_func.graph_post_pass.nodes, op) assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, op) is None # noqa: E501 @@ -183,7 +172,7 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int, # make sure the ops were all de-functionalized found = dict() for node in backend_func.graph_post_pass.nodes: - for op in OPS_IN_MODEL: + for op in model.ops_in_model(): if is_func(node, op): found[op] = True - assert all(found[op] for op in OPS_IN_MODEL) + assert all(found[op] for op in model.ops_in_model()) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py new file mode 100644 index 000000000000..f651ee6912ab --- /dev/null +++ b/vllm/compilation/collective_fusion.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch +import torch._inductor.pattern_matcher as pm +import torch.fx as fx +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch.distributed._symmetric_memory import enable_symm_mem_for_group + +from vllm.config import VllmConfig +from vllm.distributed import get_tp_group +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.logger import init_logger + +from .vllm_inductor_pass import VllmInductorPass + +logger = init_logger(__name__) + + +class BasePattern: + + def __init__(self, dtype: torch.dtype, device: str): + self.dtype = dtype + self.device = device + self.tp = get_tp_group() + self.tp_size = get_tensor_model_parallel_world_size() + + +class GEMMReduceScatterPattern(BasePattern): + + def get_inputs(self): + mul = torch.empty([16, 4], device=self.device, dtype=self.dtype) + mm_weight = torch.empty([4, 4], device=self.device, dtype=self.dtype) + return [mul, mm_weight] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern(mul: torch.Tensor, mm_weight: torch.Tensor): + mm = torch.ops.aten.mm.default(mul, mm_weight) + reduce_scatter = torch.ops.vllm.reduce_scatter.default( + mm, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name) + return reduce_scatter + + def replacement(mul: torch.Tensor, mm_weight: torch.Tensor): + gemm_rs = torch.ops.symm_mem.fused_matmul_reduce_scatter( + mul, + mm_weight, + "avg", + scatter_dim=0, + group_name=self.tp.device_group.group_name, + ) + + return gemm_rs + + pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + +class AllGatherGEMMPattern(BasePattern): + + def get_inputs(self): + x = torch.empty([4, 4], device=self.device, dtype=self.dtype) + weight = torch.empty([4, 4], device=self.device, dtype=self.dtype) + + return [x, weight] + + def register(self, pm_pass: PatternMatcherPass): + + def pattern( + x: torch.Tensor, + weight: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + all_gather = torch.ops.vllm.all_gather.default( + x, + dim=0, + world_size=self.tp_size, + group_name=self.tp.unique_name) + + return torch.ops.aten.mm.default(all_gather, weight) + 
+ def replacement( + x: torch.Tensor, + weight: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_matmul( + x, + [weight], + gather_dim=0, + group_name=self.tp.device_group.group_name, + ) + return mm_outputs + + pm.register_replacement(pattern, replacement, self.get_inputs(), + pm.fwd_only, pm_pass) + + +class AsyncTPPass(VllmInductorPass): + + def __init__(self, config: VllmConfig): + super().__init__(config) + + # Enable symmetric memory for the TP process group + enable_symm_mem_for_group(get_tp_group().device_group.group_name) + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="async_tp_pass") + GEMMReduceScatterPattern(self.model_dtype, + self.device).register(self.patterns) + + AllGatherGEMMPattern(self.model_dtype, + self.device).register(self.patterns) + + def is_applicable_for_shape(self, shape: Optional[int]) -> bool: + # only do replace for specific shapes + tp_size = get_tensor_model_parallel_world_size() + return shape is not None and shape % tp_size == 0 + + def __call__(self, graph: fx.Graph): + self.begin() + self.dump_graph(graph, "before_async_tp_pass") + count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", count) + self.dump_graph(graph, "after_async_tp_pass") + self.end_and_log() diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index f4d3fd9b457f..07ebd3e1b7dd 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -6,6 +6,7 @@ from vllm.logger import init_logger from .activation_quant_fusion import ActivationQuantFusionPass +from .collective_fusion import AsyncTPPass from .fix_functionalization import FixFunctionalizationPass from .fusion import FusionPass from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context @@ -54,6 +55,8 @@ def configure(self, config: VllmConfig): if self.pass_config.enable_sequence_parallelism: self.passes += [SequenceParallelismPass(config)] + if self.pass_config.enable_async_tp: + self.passes += [AsyncTPPass(config)] self.fix_functionalization = FixFunctionalizationPass(config) diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index f0476bfcb65a..17dded87fe8d 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -243,24 +243,25 @@ def __init__(self, config: VllmConfig): pass_name="sequence_parallelism_pass") for epsilon in [1e-5, 1e-6]: EmbeddingAllReduceRMSNormPattern( - epsilon, self.dtype, self.device).register(self.patterns) + epsilon, self.model_dtype, self.device).register(self.patterns) - MiddleAllReduceRMSNormPattern(epsilon, self.dtype, + MiddleAllReduceRMSNormPattern(epsilon, self.model_dtype, self.device).register(self.patterns) - LastAllReduceRMSNormPattern(epsilon, self.dtype, + LastAllReduceRMSNormPattern(epsilon, self.model_dtype, self.device).register(self.patterns) # WARNING: This is a hack to clear the pattern matcher cache # and allow multiple values of epsilon. 
torch._inductor.pattern_matcher._seen_patterns.clear() def is_applicable_for_shape(self, shape: Optional[int]) -> bool: - # only do replace for specific shapes tp_size = get_tensor_model_parallel_world_size() return shape is not None and shape % tp_size == 0 def __call__(self, graph: fx.Graph): + self.begin() self.dump_graph(graph, "before_sequence_parallelism_pass") count = self.patterns.apply(graph) logger.debug("Replaced %s patterns", count) self.dump_graph(graph, "after_sequence_parallelism_pass") + self.end_and_log() diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index c95e0bce5f2e..0fe73b72b1de 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -26,7 +26,8 @@ class VllmInductorPass(InductorPass): def __init__(self, config: VllmConfig): self.pass_config = config.compilation_config.pass_config - self.dtype = config.model_config.dtype if config.model_config else None + self.model_dtype = config.model_config.dtype if config.model_config \ + else None self.device = config.device_config.device if config.device_config \ else None self.pass_name = self.__class__.__name__ diff --git a/vllm/config.py b/vllm/config.py index 5653bc79f6e3..cd2eb4508de3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3652,6 +3652,8 @@ class PassConfig: """Whether to enable the custom no-op elimination pass.""" enable_sequence_parallelism: bool = False """Whether to enable sequence parallelism.""" + enable_async_tp: bool = False + """Whether to enable async TP.""" def uuid(self): """ @@ -3661,7 +3663,8 @@ def uuid(self): compilation. """ include = { - "enable_fusion", "enable_noop", "enable_sequence_parallelism" + "enable_fusion", "enable_noop", "enable_sequence_parallelism", + "enable_async_tp" } dict_ = {k: v for k, v in asdict(self).items() if k in include} return InductorPass.hash_dict(dict_) @@ -4274,6 +4277,12 @@ def __post_init__(self): if self.compilation_config is None: self.compilation_config = CompilationConfig() + + # async tp is built on top of sequence parallelism + # and requires it to be enabled. 
+ if self.compilation_config.pass_config.enable_async_tp: + self.compilation_config.pass_config.enable_sequence_parallelism = \ + True if self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.custom_ops.append("+rms_norm") if envs.VLLM_USE_V1 and self.model_config is not None and \ diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f67c01889188..10da6ad59246 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -120,7 +120,7 @@ def reduce_scatter(tensor: torch.Tensor, dim: int, world_size: int, group = _groups[group_name]() if group is None: raise ValueError(f"Group {group_name} is destroyed.") - return group.reduce_scatter(tensor, dim) + return group._reduce_scatter_out_place(tensor, dim) def reduce_scatter_fake(tensor: torch.Tensor, dim: int, world_size: int, @@ -136,7 +136,7 @@ def all_gather(tensor: torch.Tensor, dim: int, world_size: int, group = _groups[group_name]() if group is None: raise ValueError(f"Group {group_name} is destroyed.") - return group.all_gather(tensor, dim) + return group._all_gather_out_place(tensor, dim) def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int, @@ -161,6 +161,7 @@ def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int, op_func=reduce_scatter, mutates_args=[], fake_impl=reduce_scatter_fake, + dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( @@ -168,6 +169,7 @@ def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int, op_func=all_gather, mutates_args=[], fake_impl=all_gather_fake, + dispatch_key=current_platform.dispatch_key, ) @@ -367,6 +369,16 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + if self.use_custom_op_call: + return torch.ops.vllm.all_gather(input_, + dim, + world_size, + group_name=self.unique_name) + else: + return self._all_gather_out_place(input_, dim) + + def _all_gather_out_place(self, input_: torch.Tensor, + dim: int) -> torch.Tensor: return self.device_communicator.all_gather(input_, dim) def reduce_scatter(self, @@ -379,6 +391,16 @@ def reduce_scatter(self, assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + if self.use_custom_op_call: + return torch.ops.vllm.reduce_scatter(input_, + dim, + world_size, + group_name=self.unique_name) + else: + return self._reduce_scatter_out_place(input_, dim) + + def _reduce_scatter_out_place(self, input_: torch.Tensor, + dim: int) -> torch.Tensor: return self.device_communicator.reduce_scatter(input_, dim) def gather(self, From 54af915949ac37a9c5985b6ffbeb50a5d9dbdfcb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 23 May 2025 04:36:37 -0400 Subject: [PATCH 083/192] [Doc] Update quickstart and install for cu128 using `--torch-backend=auto` (#18505) Signed-off-by: mgoin --- .../installation/gpu/cuda.inc.md | 40 +++++++++++-------- .../installation/python_env_setup.inc.md | 15 +------ docs/source/getting_started/quickstart.md | 13 +++--- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 06915f09dd51..d3d4b4ef6c80 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -1,6 +1,6 @@ # Installation -vLLM 
contains pre-compiled C++ and CUDA (12.6) binaries. +vLLM contains pre-compiled C++ and CUDA (12.8) binaries. ## Requirements @@ -23,18 +23,26 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: ```console -# Install vLLM with CUDA 12.6. -pip install vllm # If you are using pip. -uv pip install vllm # If you are using uv. +# Install vLLM with CUDA 12.8. +# If you are using pip. +pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 +# If you are using uv. +uv pip install vllm --torch-backend=auto ``` -As of now, vLLM's binaries are compiled with CUDA 12.6 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 11.8, and public PyTorch release versions: +We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. + +:::{note} +NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. +::: + +As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: ```console # Install vLLM with CUDA 11.8. export VLLM_VERSION=0.6.1.post1 -export PYTHON_VERSION=310 -pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +export PYTHON_VERSION=312 +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` (install-the-latest-code)= @@ -51,30 +59,30 @@ pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly `--pre` is required for `pip` to consider pre-released versions. -If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: +Another way to install the latest code is to use `uv`: ```console -export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly ``` -Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. 
The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. - -##### Install the latest code using `uv` +##### Install specific revisions using `pip` -Another way to install the latest code is to use `uv`: +If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console -uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` +Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + ##### Install specific revisions using `uv` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +uv pip install vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md index 00b61ea5c826..911301d68335 100644 --- a/docs/source/getting_started/installation/python_env_setup.inc.md +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -1,19 +1,6 @@ -You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html): +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. 
Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: ```console -# (Recommended) Create a new conda environment. -conda create -n vllm python=3.12 -y -conda activate vllm -``` - -:::{note} -[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. -::: - -Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: - -```console -# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. uv venv --python 3.12 --seed source .venv/bin/activate ``` diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 42468ff73c2c..ecca296b0b0c 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -21,25 +21,28 @@ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python env ```console uv venv --python 3.12 --seed source .venv/bin/activate -uv pip install vllm +uv pip install vllm --torch-backend=auto ``` -Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating an environment: +`uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). + +Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment: ```console uv run --with vllm vllm --help ``` -You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. +You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment. ```console conda create -n myenv python=3.12 -y conda activate myenv -pip install vllm +pip install --upgrade uv +uv pip install vllm --torch-backend=auto ``` :::{note} -For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. +For more detail and non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. 
::: (quickstart-offline)= From b046cf792d49c36cabc5589501fcc1da31945f64 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Fri, 23 May 2025 16:41:03 +0800 Subject: [PATCH 084/192] [Feature][V1]: suupports cached_tokens in response usage (#18149) Co-authored-by: simon-mo --- tests/v1/core/test_scheduler_e2e.py | 11 ++++++++++- vllm/v1/core/sched/scheduler.py | 5 ++++- vllm/v1/engine/__init__.py | 3 +++ vllm/v1/engine/output_processor.py | 9 ++++++--- vllm/v1/request.py | 4 ++++ 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 0a79424a30b7..511d57d405ba 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -19,7 +19,8 @@ def model() -> LLM: enable_prefix_caching=True, long_prefill_token_threshold=2, max_num_batched_tokens=6, - max_num_seqs=3) + max_num_seqs=3, + block_size=16) def test_concurrent_partial_prefill(model): @@ -27,3 +28,11 @@ def test_concurrent_partial_prefill(model): assert len(outputs) == 3 for output in outputs: assert len(output.outputs) == 1 + + +def test_prefix_cache_stats_is_recorded(model): + # 17 tokens will make sure first 16 tokens are cached in a block + input_tokens = {"prompt_token_ids": [101] * 17} + _ = model.generate([input_tokens]) + outputs = model.generate([input_tokens]) + assert outputs[0].num_cached_tokens == 16 diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 2152409019b9..c873ced343bf 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -457,7 +457,9 @@ def schedule(self) -> SchedulerOutput: token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - + # Count the number of prifix cached tokens. + if request.num_cached_tokens < 0: + request.num_cached_tokens = num_computed_tokens # Encoder-related. if encoder_inputs_to_schedule: scheduled_encoder_inputs[request.request_id] = ( @@ -798,6 +800,7 @@ def update_from_output( stop_reason=request.stop_reason, events=request.take_events(), kv_transfer_params=kv_transfer_params, + num_cached_tokens=request.num_cached_tokens, )) else: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 122a5a72cc36..41db99beaad5 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -107,6 +107,9 @@ class EngineCoreOutput( events: Optional[list[EngineCoreEvent]] = None kv_transfer_params: Optional[dict[str, Any]] = None + # The number of tokens with prefix cache hits. 
+ num_cached_tokens: int = 0 + @property def finished(self) -> bool: return self.finish_reason is not None diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index a7a9b0e4a161..293c291b4341 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -147,6 +147,7 @@ def make_request_output( finish_reason: Optional[FinishReason], stop_reason: Union[int, str, None], kv_transfer_params: Optional[dict[str, Any]] = None, + num_cached_tokens: int = 0, ) -> Optional[RequestOutput]: finished = finish_reason is not None @@ -169,7 +170,7 @@ def make_request_output( return None return self._new_request_output(request_id, outputs, finished, - kv_transfer_params) + kv_transfer_params, num_cached_tokens) def _new_request_output( self, @@ -177,6 +178,7 @@ def _new_request_output( outputs: list[CompletionOutput], finished: bool, kv_transfer_params: Optional[dict[str, Any]] = None, + num_cached_tokens: int = 0, ) -> RequestOutput: if self.output_kind == RequestOutputKind.DELTA: @@ -193,6 +195,7 @@ def _new_request_output( outputs=outputs, finished=finished, kv_transfer_params=kv_transfer_params, + num_cached_tokens=num_cached_tokens, ) def _new_completion_output( @@ -340,7 +343,7 @@ def process_outputs( finish_reason = engine_core_output.finish_reason stop_reason = engine_core_output.stop_reason kv_transfer_params = engine_core_output.kv_transfer_params - + num_cached_tokens = engine_core_output.num_cached_tokens req_state.is_prefilling = False # 2) Detokenize the token ids into text and perform stop checks. @@ -356,7 +359,7 @@ def process_outputs( # 4) Create and handle RequestOutput objects. if request_output := req_state.make_request_output( new_token_ids, finish_reason, stop_reason, - kv_transfer_params): + kv_transfer_params, num_cached_tokens): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put(request_output) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index d1cdd2c52750..b4c84507532a 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -77,6 +77,10 @@ def __init__( self.output_token_ids = ConstantList(self._output_token_ids) self.all_token_ids = ConstantList(self._all_token_ids) + # State + # The number of tokens with prefix cache hits. + self.num_cached_tokens = -1 + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": if request.mm_inputs is not None: From d0bc2f810b7a34247154b078c2429bf62519e9ca Mon Sep 17 00:00:00 2001 From: Yuqi Zhang Date: Fri, 23 May 2025 01:41:37 -0700 Subject: [PATCH 085/192] [Bugfix] Add half type support in reshape_and_cache_cpu_impl on x86 cpu platform (#18430) Signed-off-by: Yuqi Zhang Co-authored-by: Yuqi Zhang --- csrc/cpu/cpu_types_x86.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index cf67847b45ba..9a613ba588dd 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -19,6 +19,7 @@ namespace vec_op { #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ From a1fe24d961d85089c8a254032d35e4bdbca278d6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 23 May 2025 11:09:53 +0200 Subject: [PATCH 086/192] Migrate docs from Sphinx to MkDocs (#18145) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 9 +- .gitignore | 6 +- .pre-commit-config.yaml | 1 + .readthedocs.yaml | 8 +- docker/Dockerfile | 2 + docs/.nav.yml | 51 + docs/Makefile | 25 - docs/README.md | 93 +- docs/api/README.md | 107 ++ docs/api/vllm/.meta.yml | 2 + .../dockerfile-stages-dependency.png | Bin 0 -> 121821 bytes .../deployment/anything-llm-chat-with-doc.png | Bin .../anything-llm-chat-without-doc.png | Bin .../deployment/anything-llm-provider.png | Bin .../deployment/anything-llm-upload-doc.png | Bin .../architecture_helm_deployment.png | Bin .../assets/deployment/chatbox-chat.png | Bin .../assets/deployment/chatbox-settings.png | Bin .../assets/deployment/dify-chat.png | Bin .../assets/deployment/dify-create-chatbot.png | Bin .../assets/deployment/dify-settings.png | Bin .../assets/deployment/open_webui.png | Bin .../assets/deployment/streamlit-chat.png | Bin .../arch_overview/entrypoints.excalidraw.png | Bin .../arch_overview/llm_engine.excalidraw.png | Bin docs/{source => }/assets/design/hierarchy.png | Bin .../assets/design/v1/metrics/intervals-1.png | Bin .../assets/design/v1/metrics/intervals-2.png | Bin .../assets/design/v1/metrics/intervals-3.png | Bin .../v1/prefix_caching/example-time-1.png | Bin .../v1/prefix_caching/example-time-3.png | Bin .../v1/prefix_caching/example-time-4.png | Bin .../v1/prefix_caching/example-time-5.png | Bin .../v1/prefix_caching/example-time-6.png | Bin .../v1/prefix_caching/example-time-7.png | Bin .../assets/design/v1/prefix_caching/free.png | Bin .../design/v1/prefix_caching/overview.png | Bin .../features/disagg_prefill/abstraction.jpg | Bin .../features/disagg_prefill/overview.jpg | Bin docs/{source => }/assets/kernel/k_vecs.png | Bin docs/{source => }/assets/kernel/key.png | Bin .../{source => }/assets/kernel/logits_vec.png | Bin docs/{source => }/assets/kernel/q_vecs.png | Bin docs/{source => }/assets/kernel/query.png | Bin docs/{source => }/assets/kernel/v_vec.png | Bin docs/{source => }/assets/kernel/value.png | Bin .../assets/logos/vllm-logo-only-light.ico | Bin .../assets/logos/vllm-logo-only-light.png | Bin .../assets/logos/vllm-logo-text-dark.png | Bin .../assets/logos/vllm-logo-text-light.png | Bin docs/{source => }/community/meetups.md | 7 +- docs/{source => }/community/sponsors.md | 0 .../contributing/deprecation_policy.md | 0 .../contributing/dockerfile/dockerfile.md | 10 +- docs/contributing/model/README.md | 23 + docs/{source => }/contributing/model/basic.md | 25 +- docs/contributing/model/multimodal.md | 803 ++++++++++ .../contributing/model/registration.md | 37 +- docs/{source => }/contributing/model/tests.md | 25 +- docs/{source => }/contributing/overview.md | 47 +- .../profiling.md} | 19 +- .../contributing/vulnerability_management.md | 0 docs/deployment/docker.md | 126 ++ .../deployment/frameworks/anything-llm.md | 19 +- .../deployment/frameworks/bentoml.md | 7 +- .../deployment/frameworks/cerebrium.md | 9 +- .../deployment/frameworks/chatbox.md | 13 +- .../deployment/frameworks/dify.md | 16 +- .../deployment/frameworks/dstack.md | 14 +- docs/deployment/frameworks/helm.md | 95 ++ .../deployment/frameworks/litellm.md | 7 +- .../deployment/frameworks/lobe-chat.md | 7 +- .../{source => 
}/deployment/frameworks/lws.md | 7 +- .../deployment/frameworks/modal.md | 7 +- .../deployment/frameworks/open-webui.md | 10 +- .../retrieval_augmented_generation.md | 7 +- .../deployment/frameworks/skypilot.md | 25 +- .../deployment/frameworks/streamlit.md | 10 +- .../deployment/frameworks/triton.md | 7 +- .../deployment/integrations/kserve.md | 7 +- .../deployment/integrations/kubeai.md | 7 +- .../deployment/integrations/llamastack.md | 7 +- .../deployment/integrations/llmaz.md | 7 +- .../integrations/production-stack.md | 9 +- docs/{source => }/deployment/k8s.md | 12 +- docs/{source => }/deployment/nginx.md | 40 +- docs/{source => }/deployment/security.md | 0 docs/{source => }/design/arch_overview.md | 103 +- .../design/automatic_prefix_caching.md | 7 +- .../design/huggingface_integration.md | 7 +- .../design/kernel/paged_attention.md | 101 +- docs/{source => }/design/mm_processing.md | 25 +- docs/{source => }/design/multiprocessing.md | 7 +- docs/{source => }/design/plugin_system.md | 9 +- docs/{source => }/design/v1/metrics.md | 16 +- docs/{source => }/design/v1/prefix_caching.md | 32 +- docs/{source => }/design/v1/torch_compile.md | 0 .../features/automatic_prefix_caching.md | 12 +- docs/features/compatibility_matrix.md | 77 + docs/{source => }/features/disagg_prefill.md | 30 +- docs/{source => }/features/lora.md | 14 +- .../features/multimodal_inputs.md | 98 +- docs/{source => }/features/prompt_embeds.md | 7 +- docs/features/quantization/README.md | 22 + .../features/quantization/auto_awq.md | 7 +- .../features/quantization/bitblas.md | 16 +- .../{source => }/features/quantization/bnb.md | 7 +- .../{source => }/features/quantization/fp8.md | 24 +- .../features/quantization/gguf.md | 22 +- .../features/quantization/gptqmodel.md | 7 +- .../features/quantization/int4.md | 17 +- .../features/quantization/int8.md | 17 +- .../features/quantization/modelopt.md | 0 .../quantization/quantized_kvcache.md | 7 +- .../features/quantization/quark.md | 20 +- .../quantization/supported_hardware.md | 28 + .../features/quantization/torchao.md | 0 .../features/reasoning_outputs.md | 16 +- docs/{source => }/features/spec_decode.md | 30 +- .../features/structured_outputs.md | 25 +- docs/{source => }/features/tool_calling.md | 1 - docs/{source => }/getting_started/faq.md | 13 +- docs/getting_started/installation/.nav.yml | 5 + docs/getting_started/installation/README.md | 20 + .../installation/ai_accelerator.md | 117 ++ .../ai_accelerator/hpu-gaudi.inc.md | 110 +- .../installation/ai_accelerator/neuron.inc.md | 39 +- .../installation/ai_accelerator/tpu.inc.md | 114 +- .../getting_started/installation/cpu.md | 164 +- .../installation/cpu/apple.inc.md | 37 +- .../installation/cpu/arm.inc.md | 41 + .../installation/cpu/build.inc.md | 2 + .../installation/cpu/s390x.inc.md | 37 +- .../installation/cpu/x86.inc.md | 46 + .../installation/device.template.md | 0 docs/getting_started/installation/gpu.md | 124 ++ .../installation/gpu/cuda.inc.md | 75 +- .../installation/gpu/rocm.inc.md | 72 +- .../installation/gpu/xpu.inc.md | 36 +- .../installation/python_env_setup.inc.md | 0 .../getting_started/quickstart.md | 68 +- .../getting_started/troubleshooting.md | 38 +- .../getting_started/v1_user_guide.md | 0 docs/make.bat | 35 - docs/mkdocs/hooks/generate_examples.py | 159 ++ docs/mkdocs/hooks/remove_announcement.py | 16 + docs/mkdocs/hooks/url_schemes.py | 54 + .../javascript/run_llm_widget.js} | 19 - docs/mkdocs/overrides/main.html | 5 + .../models/extensions/fastsafetensor.md | 0 
.../models/extensions/runai_model_streamer.md | 17 +- .../models/extensions/tensorizer.md | 12 +- docs/{source => }/models/generative_models.md | 43 +- docs/{source => }/models/pooling_models.md | 107 +- docs/models/supported_models.md | 690 ++++++++ docs/{source => }/performance/benchmarks.md | 15 +- docs/{source => }/performance/optimization.md | 9 +- .../serving/distributed_serving.md | 39 +- docs/serving/engine_args.md | 18 + docs/serving/env_vars.md | 12 + .../serving/integrations/langchain.md | 7 +- .../serving/integrations/llamaindex.md | 7 +- docs/{source => }/serving/metrics.md | 10 +- .../{source => }/serving/offline_inference.md | 53 +- .../serving/openai_compatible_server.md | 396 +++-- docs/serving/serve_args.md | 38 + docs/{source => }/serving/usage_stats.md | 0 docs/source/_static/custom.css | 8 - docs/source/_templates/sections/header.html | 39 - docs/source/api/summary.md | 133 -- docs/source/autodoc2_docstring_parser.py | 21 - docs/source/community/blog.md | 3 - docs/source/conf.py | 263 --- docs/source/contributing/model/index.md | 27 - docs/source/contributing/model/multimodal.md | 834 ---------- docs/source/deployment/docker.md | 133 -- docs/source/deployment/frameworks/helm.md | 250 --- docs/source/deployment/frameworks/index.md | 22 - docs/source/deployment/integrations/index.md | 11 - docs/source/features/compatibility_matrix.md | 476 ------ docs/source/features/quantization/index.md | 24 - .../quantization/supported_hardware.md | 153 -- docs/source/generate_examples.py | 244 --- docs/source/getting_started/installation.md | 28 - .../installation/ai_accelerator.md | 299 ---- .../installation/cpu/arm.inc.md | 34 - .../installation/cpu/x86.inc.md | 41 - .../getting_started/installation/gpu.md | 301 ---- docs/source/index.md | 217 --- docs/source/models/extensions/index.md | 9 - docs/source/models/supported_models.md | 1406 ----------------- docs/source/serving/engine_args.md | 36 - docs/source/serving/env_vars.md | 15 - docs/source/serving/integrations/index.md | 8 - docs/source/serving/serve_args.md | 47 - docs/{source => }/training/rlhf.md | 0 docs/{source => }/training/trl.md | 9 +- mkdocs.yaml | 117 ++ pyproject.toml | 2 + requirements/docs.txt | 27 +- vllm/engine/llm_engine.py | 10 +- vllm/engine/metrics.py | 4 +- vllm/entrypoints/llm.py | 32 +- vllm/entrypoints/openai/protocol.py | 64 +- vllm/envs.py | 4 +- vllm/executor/ray_distributed_executor.py | 6 +- vllm/model_executor/models/blip2.py | 5 +- vllm/model_executor/models/llava.py | 5 +- vllm/model_executor/models/llava_next.py | 5 +- vllm/model_executor/models/mistral3.py | 5 +- vllm/multimodal/__init__.py | 5 +- vllm/multimodal/inputs.py | 22 +- vllm/multimodal/registry.py | 10 +- vllm/utils.py | 5 +- vllm/v1/worker/gpu_worker.py | 7 +- vllm/worker/hpu_worker.py | 7 +- vllm/worker/worker.py | 7 +- vllm/worker/xpu_worker.py | 7 +- 218 files changed, 4144 insertions(+), 6808 deletions(-) create mode 100644 docs/.nav.yml delete mode 100644 docs/Makefile create mode 100644 docs/api/README.md create mode 100644 docs/api/vllm/.meta.yml create mode 100644 docs/assets/contributing/dockerfile-stages-dependency.png rename docs/{source => }/assets/deployment/anything-llm-chat-with-doc.png (100%) rename docs/{source => }/assets/deployment/anything-llm-chat-without-doc.png (100%) rename docs/{source => }/assets/deployment/anything-llm-provider.png (100%) rename docs/{source => }/assets/deployment/anything-llm-upload-doc.png (100%) rename docs/{source => }/assets/deployment/architecture_helm_deployment.png (100%) rename 
docs/{source => }/assets/deployment/chatbox-chat.png (100%) rename docs/{source => }/assets/deployment/chatbox-settings.png (100%) rename docs/{source => }/assets/deployment/dify-chat.png (100%) rename docs/{source => }/assets/deployment/dify-create-chatbot.png (100%) rename docs/{source => }/assets/deployment/dify-settings.png (100%) rename docs/{source => }/assets/deployment/open_webui.png (100%) rename docs/{source => }/assets/deployment/streamlit-chat.png (100%) rename docs/{source => }/assets/design/arch_overview/entrypoints.excalidraw.png (100%) rename docs/{source => }/assets/design/arch_overview/llm_engine.excalidraw.png (100%) rename docs/{source => }/assets/design/hierarchy.png (100%) rename docs/{source => }/assets/design/v1/metrics/intervals-1.png (100%) rename docs/{source => }/assets/design/v1/metrics/intervals-2.png (100%) rename docs/{source => }/assets/design/v1/metrics/intervals-3.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/example-time-1.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/example-time-3.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/example-time-4.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/example-time-5.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/example-time-6.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/example-time-7.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/free.png (100%) rename docs/{source => }/assets/design/v1/prefix_caching/overview.png (100%) rename docs/{source => }/assets/features/disagg_prefill/abstraction.jpg (100%) rename docs/{source => }/assets/features/disagg_prefill/overview.jpg (100%) rename docs/{source => }/assets/kernel/k_vecs.png (100%) rename docs/{source => }/assets/kernel/key.png (100%) rename docs/{source => }/assets/kernel/logits_vec.png (100%) rename docs/{source => }/assets/kernel/q_vecs.png (100%) rename docs/{source => }/assets/kernel/query.png (100%) rename docs/{source => }/assets/kernel/v_vec.png (100%) rename docs/{source => }/assets/kernel/value.png (100%) rename docs/{source => }/assets/logos/vllm-logo-only-light.ico (100%) rename docs/{source => }/assets/logos/vllm-logo-only-light.png (100%) rename docs/{source => }/assets/logos/vllm-logo-text-dark.png (100%) rename docs/{source => }/assets/logos/vllm-logo-text-light.png (100%) rename docs/{source => }/community/meetups.md (98%) rename docs/{source => }/community/sponsors.md (100%) rename docs/{source => }/contributing/deprecation_policy.md (100%) rename docs/{source => }/contributing/dockerfile/dockerfile.md (89%) create mode 100644 docs/contributing/model/README.md rename docs/{source => }/contributing/model/basic.md (87%) create mode 100644 docs/contributing/model/multimodal.md rename docs/{source => }/contributing/model/registration.md (52%) rename docs/{source => }/contributing/model/tests.md (75%) rename docs/{source => }/contributing/overview.md (87%) rename docs/{source/contributing/profiling/profiling_index.md => contributing/profiling.md} (90%) rename docs/{source => }/contributing/vulnerability_management.md (100%) create mode 100644 docs/deployment/docker.md rename docs/{source => }/deployment/frameworks/anything-llm.md (78%) rename docs/{source => }/deployment/frameworks/bentoml.md (89%) rename docs/{source => }/deployment/frameworks/cerebrium.md (98%) rename docs/{source => }/deployment/frameworks/chatbox.md (84%) rename docs/{source => }/deployment/frameworks/dify.md (90%) rename docs/{source 
=> }/deployment/frameworks/dstack.md (83%) create mode 100644 docs/deployment/frameworks/helm.md rename docs/{source => }/deployment/frameworks/litellm.md (97%) rename docs/{source => }/deployment/frameworks/lobe-chat.md (89%) rename docs/{source => }/deployment/frameworks/lws.md (99%) rename docs/{source => }/deployment/frameworks/modal.md (85%) rename docs/{source => }/deployment/frameworks/open-webui.md (87%) rename docs/{source => }/deployment/frameworks/retrieval_augmented_generation.md (96%) rename docs/{source => }/deployment/frameworks/skypilot.md (97%) rename docs/{source => }/deployment/frameworks/streamlit.md (91%) rename docs/{source => }/deployment/frameworks/triton.md (87%) rename docs/{source => }/deployment/integrations/kserve.md (85%) rename docs/{source => }/deployment/integrations/kubeai.md (93%) rename docs/{source => }/deployment/integrations/llamastack.md (94%) rename docs/{source => }/deployment/integrations/llmaz.md (87%) rename docs/{source => }/deployment/integrations/production-stack.md (98%) rename docs/{source => }/deployment/k8s.md (98%) rename docs/{source => }/deployment/nginx.md (77%) rename docs/{source => }/deployment/security.md (100%) rename docs/{source => }/design/arch_overview.md (81%) rename docs/{source => }/design/automatic_prefix_caching.md (98%) rename docs/{source => }/design/huggingface_integration.md (98%) rename docs/{source => }/design/kernel/paged_attention.md (94%) rename docs/{source => }/design/mm_processing.md (61%) rename docs/{source => }/design/multiprocessing.md (97%) rename docs/{source => }/design/plugin_system.md (86%) rename docs/{source => }/design/v1/metrics.md (98%) rename docs/{source => }/design/v1/prefix_caching.md (94%) rename docs/{source => }/design/v1/torch_compile.md (100%) rename docs/{source => }/features/automatic_prefix_caching.md (91%) create mode 100644 docs/features/compatibility_matrix.md rename docs/{source => }/features/disagg_prefill.md (87%) rename docs/{source => }/features/lora.md (96%) rename docs/{source => }/features/multimodal_inputs.md (84%) rename docs/{source => }/features/prompt_embeds.md (92%) create mode 100644 docs/features/quantization/README.md rename docs/{source => }/features/quantization/auto_awq.md (98%) rename docs/{source => }/features/quantization/bitblas.md (76%) rename docs/{source => }/features/quantization/bnb.md (97%) rename docs/{source => }/features/quantization/fp8.md (88%) rename docs/{source => }/features/quantization/gguf.md (76%) rename docs/{source => }/features/quantization/gptqmodel.md (98%) rename docs/{source => }/features/quantization/int4.md (94%) rename docs/{source => }/features/quantization/int8.md (92%) rename docs/{source => }/features/quantization/modelopt.md (100%) rename docs/{source => }/features/quantization/quantized_kvcache.md (98%) rename docs/{source => }/features/quantization/quark.md (94%) create mode 100644 docs/features/quantization/supported_hardware.md rename docs/{source => }/features/quantization/torchao.md (100%) rename docs/{source => }/features/reasoning_outputs.md (97%) rename docs/{source => }/features/spec_decode.md (93%) rename docs/{source => }/features/structured_outputs.md (96%) rename docs/{source => }/features/tool_calling.md (99%) rename docs/{source => }/getting_started/faq.md (91%) create mode 100644 docs/getting_started/installation/.nav.yml create mode 100644 docs/getting_started/installation/README.md create mode 100644 docs/getting_started/installation/ai_accelerator.md rename docs/{source => 
}/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md (84%) rename docs/{source => }/getting_started/installation/ai_accelerator/neuron.inc.md (79%) rename docs/{source => }/getting_started/installation/ai_accelerator/tpu.inc.md (55%) rename docs/{source => }/getting_started/installation/cpu.md (74%) rename docs/{source => }/getting_started/installation/cpu/apple.inc.md (58%) create mode 100644 docs/getting_started/installation/cpu/arm.inc.md rename docs/{source => }/getting_started/installation/cpu/build.inc.md (96%) rename docs/{source => }/getting_started/installation/cpu/s390x.inc.md (64%) create mode 100644 docs/getting_started/installation/cpu/x86.inc.md rename docs/{source => }/getting_started/installation/device.template.md (100%) create mode 100644 docs/getting_started/installation/gpu.md rename docs/{source => }/getting_started/installation/gpu/cuda.inc.md (74%) rename docs/{source => }/getting_started/installation/gpu/rocm.inc.md (72%) rename docs/{source => }/getting_started/installation/gpu/xpu.inc.md (67%) rename docs/{source => }/getting_started/installation/python_env_setup.inc.md (100%) rename docs/{source => }/getting_started/quickstart.md (75%) rename docs/{source => }/getting_started/troubleshooting.md (86%) rename docs/{source => }/getting_started/v1_user_guide.md (100%) delete mode 100644 docs/make.bat create mode 100644 docs/mkdocs/hooks/generate_examples.py create mode 100644 docs/mkdocs/hooks/remove_announcement.py create mode 100644 docs/mkdocs/hooks/url_schemes.py rename docs/{source/_static/custom.js => mkdocs/javascript/run_llm_widget.js} (54%) create mode 100644 docs/mkdocs/overrides/main.html rename docs/{source => }/models/extensions/fastsafetensor.md (100%) rename docs/{source => }/models/extensions/runai_model_streamer.md (86%) rename docs/{source => }/models/extensions/tensorizer.md (79%) rename docs/{source => }/models/generative_models.md (63%) rename docs/{source => }/models/pooling_models.md (62%) create mode 100644 docs/models/supported_models.md rename docs/{source => }/performance/benchmarks.md (86%) rename docs/{source => }/performance/optimization.md (98%) rename docs/{source => }/serving/distributed_serving.md (73%) create mode 100644 docs/serving/engine_args.md create mode 100644 docs/serving/env_vars.md rename docs/{source => }/serving/integrations/langchain.md (93%) rename docs/{source => }/serving/integrations/llamaindex.md (91%) rename docs/{source => }/serving/metrics.md (90%) rename docs/{source => }/serving/offline_inference.md (76%) rename docs/{source => }/serving/openai_compatible_server.md (61%) create mode 100644 docs/serving/serve_args.md rename docs/{source => }/serving/usage_stats.md (100%) delete mode 100644 docs/source/_static/custom.css delete mode 100644 docs/source/_templates/sections/header.html delete mode 100644 docs/source/api/summary.md delete mode 100644 docs/source/autodoc2_docstring_parser.py delete mode 100644 docs/source/community/blog.md delete mode 100644 docs/source/conf.py delete mode 100644 docs/source/contributing/model/index.md delete mode 100644 docs/source/contributing/model/multimodal.md delete mode 100644 docs/source/deployment/docker.md delete mode 100644 docs/source/deployment/frameworks/helm.md delete mode 100644 docs/source/deployment/frameworks/index.md delete mode 100644 docs/source/deployment/integrations/index.md delete mode 100644 docs/source/features/compatibility_matrix.md delete mode 100644 docs/source/features/quantization/index.md delete mode 100644 
docs/source/features/quantization/supported_hardware.md delete mode 100644 docs/source/generate_examples.py delete mode 100644 docs/source/getting_started/installation.md delete mode 100644 docs/source/getting_started/installation/ai_accelerator.md delete mode 100644 docs/source/getting_started/installation/cpu/arm.inc.md delete mode 100644 docs/source/getting_started/installation/cpu/x86.inc.md delete mode 100644 docs/source/getting_started/installation/gpu.md delete mode 100644 docs/source/index.md delete mode 100644 docs/source/models/extensions/index.md delete mode 100644 docs/source/models/supported_models.md delete mode 100644 docs/source/serving/engine_args.md delete mode 100644 docs/source/serving/env_vars.md delete mode 100644 docs/source/serving/integrations/index.md delete mode 100644 docs/source/serving/serve_args.md rename docs/{source => }/training/rlhf.md (100%) rename docs/{source => }/training/trl.md (66%) create mode 100644 mkdocs.yaml diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6a7d220bbdcf..774a5df16d7f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -33,14 +33,13 @@ steps: - label: Documentation Build # 2min mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/test_docs/docs" + working_dir: "/vllm-workspace/test_docs" fast_check: true no_gpu: True commands: - - pip install -r ../../requirements/docs.txt - - SPHINXOPTS=\"-W\" make html - # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html + - pip install -r ../requirements/docs.txt + # TODO: add `--strict` once warnings in docstrings are fixed + - mkdocs build - label: Async Engine, Inputs, Utils, Worker Test # 24min mirror_hardwares: [amdexperimental] diff --git a/.gitignore b/.gitignore index 2756c612b82f..8d5af1bed92d 100644 --- a/.gitignore +++ b/.gitignore @@ -77,11 +77,6 @@ instance/ # Scrapy stuff: .scrapy -# Sphinx documentation -docs/_build/ -docs/source/getting_started/examples/ -docs/source/api/vllm - # PyBuilder .pybuilder/ target/ @@ -151,6 +146,7 @@ venv.bak/ # mkdocs documentation /site +docs/getting_started/examples # mypy .mypy_cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5c0c368d578..658de23cf4da 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,6 +39,7 @@ repos: rev: v0.9.29 hooks: - id: pymarkdown + exclude: '.*\.inc\.md' args: [fix] - repo: https://github.com/rhysd/actionlint rev: v1.7.7 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 2781ec223b66..98c3be25f7e7 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,12 +8,8 @@ build: tools: python: "3.12" -sphinx: - configuration: docs/source/conf.py - fail_on_warning: true - -# If using Sphinx, optionally build your docs in additional formats such as PDF -formats: [] +mkdocs: + configuration: mkdocs.yaml # Optionally declare the Python requirements required to build your docs python: diff --git a/docker/Dockerfile b/docker/Dockerfile index cc3499d1f0a9..f28a1618298f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -329,7 +329,9 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 # will not be imported by other tests RUN mkdir test_docs RUN mv docs test_docs/ +RUN cp -r examples test_docs/ RUN mv vllm test_docs/ +RUN mv mkdocs.yaml test_docs/ #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### diff --git a/docs/.nav.yml 
b/docs/.nav.yml new file mode 100644 index 000000000000..c410b6b8223b --- /dev/null +++ b/docs/.nav.yml @@ -0,0 +1,51 @@ +nav: + - Home: + - vLLM: README.md + - Getting Started: + - getting_started/quickstart.md + - getting_started/installation + - Examples: + - LMCache: getting_started/examples/lmcache + - getting_started/examples/offline_inference + - getting_started/examples/online_serving + - getting_started/examples/other + - Roadmap: https://roadmap.vllm.ai + - Releases: https://github.com/vllm-project/vllm/releases + - User Guide: + - Inference and Serving: + - serving/offline_inference.md + - serving/openai_compatible_server.md + - serving/* + - serving/integrations + - Training: training + - Deployment: + - deployment/* + - deployment/frameworks + - deployment/integrations + - Performance: performance + - Models: + - models/supported_models.md + - models/generative_models.md + - models/pooling_models.md + - models/extensions + - Features: + - features/compatibility_matrix.md + - features/* + - features/quantization + - Other: + - getting_started/* + - Developer Guide: + - contributing/overview.md + - glob: contributing/* + flatten_single_child_sections: true + - contributing/model + - Design Documents: + - V0: design + - V1: design/v1 + - API Reference: + - api/README.md + - glob: api/vllm/* + preserve_directory_names: true + - Community: + - community/* + - vLLM Blog: https://blog.vllm.ai diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d3b429dfb925..000000000000 --- a/docs/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -clean: - @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - rm -rf "$(SOURCEDIR)/getting_started/examples" - rm -rf "$(SOURCEDIR)/api/vllm" diff --git a/docs/README.md b/docs/README.md index dcd5e759dfa8..57b1d03deee2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,43 +1,50 @@ -# vLLM documents - -## Build the docs - -- Make sure in `docs` directory - -```bash -cd docs -``` - -- Install the dependencies: - -```bash -pip install -r ../requirements/docs.txt -``` - -- Clean the previous build (optional but recommended): - -```bash -make clean -``` - -- Generate the HTML documentation: - -```bash -make html -``` - -## Open the docs with your browser - -- Serve the documentation locally: - -```bash -python -m http.server -d build/html/ -``` - -This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. - -If port 8000 is already in use, you can specify a different port, for example: - -```bash -python -m http.server 3000 -d build/html/ -``` +# Welcome to vLLM + +

+ ![](./assets/logos/vllm-logo-text-light.png){ align="center" alt="vLLM" class="no-scaled-link" width="60%" } +
+ +

+Easy, fast, and cheap LLM serving for everyone + +

+ +

+ +Star +Watch +Fork +

+ +vLLM is a fast and easy-to-use library for LLM inference and serving. + +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. + +vLLM is fast with: + +- State-of-the-art serving throughput +- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) +- Continuous batching of incoming requests +- Fast model execution with CUDA/HIP graph +- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Speculative decoding +- Chunked prefill + +vLLM is flexible and easy to use with: + +- Seamless integration with popular HuggingFace models +- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +- Tensor parallelism and pipeline parallelism support for distributed inference +- Streaming outputs +- OpenAI-compatible API server +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Prefix caching support +- Multi-lora support + +For more information, check out the following: + +- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) +- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) +- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. +- [vLLM Meetups][meetups] diff --git a/docs/api/README.md b/docs/api/README.md new file mode 100644 index 000000000000..5c7b2ca79ee2 --- /dev/null +++ b/docs/api/README.md @@ -0,0 +1,107 @@ +# Summary + +[](){ #configuration } + +## Configuration + +API documentation for vLLM's configuration classes. + +- [vllm.config.ModelConfig][] +- [vllm.config.CacheConfig][] +- [vllm.config.TokenizerPoolConfig][] +- [vllm.config.LoadConfig][] +- [vllm.config.ParallelConfig][] +- [vllm.config.SchedulerConfig][] +- [vllm.config.DeviceConfig][] +- [vllm.config.SpeculativeConfig][] +- [vllm.config.LoRAConfig][] +- [vllm.config.PromptAdapterConfig][] +- [vllm.config.MultiModalConfig][] +- [vllm.config.PoolerConfig][] +- [vllm.config.DecodingConfig][] +- [vllm.config.ObservabilityConfig][] +- [vllm.config.KVTransferConfig][] +- [vllm.config.CompilationConfig][] +- [vllm.config.VllmConfig][] + +[](){ #offline-inference-api } + +## Offline Inference + +LLM Class. + +- [vllm.LLM][] + +LLM Inputs. + +- [vllm.inputs.PromptType][] +- [vllm.inputs.TextPrompt][] +- [vllm.inputs.TokensPrompt][] + +## vLLM Engines + +Engine classes for offline and online inference. + +- [vllm.LLMEngine][] +- [vllm.AsyncLLMEngine][] + +## Inference Parameters + +Inference parameters for vLLM APIs. + +[](){ #sampling-params } +[](){ #pooling-params } + +- [vllm.SamplingParams][] +- [vllm.PoolingParams][] + +[](){ #multi-modality } + +## Multi-Modality + +vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package. + +Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models] +via the `multi_modal_data` field in [vllm.inputs.PromptType][]. + +Looking to add your own multi-modal model? Please follow the instructions listed [here][supports-multimodal]. 
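Editor's note, not part of the patch: the `multi_modal_data` field described above is easiest to grasp with a concrete offline-inference sketch. The model name, prompt template, and image path below are illustrative assumptions rather than anything taken from this diff:

```python
from PIL import Image

from vllm import LLM

# Hypothetical vision-language model; any entry in the supported
# multi-modal model list should follow the same pattern.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# The prompt carries the model-specific image placeholder, and the raw
# image is attached via the multi_modal_data field of the prompt dict.
image = Image.open("example.jpg")  # assumed local file
outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image? ASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```

This is simply the dictionary form of the prompt types listed under "LLM Inputs" above; only the placeholder token in the prompt differs between models.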
+ +- [vllm.multimodal.MULTIMODAL_REGISTRY][] + +### Inputs + +User-facing inputs. + +- [vllm.multimodal.inputs.MultiModalDataDict][] + +Internal data structures. + +- [vllm.multimodal.inputs.PlaceholderRange][] +- [vllm.multimodal.inputs.NestedTensors][] +- [vllm.multimodal.inputs.MultiModalFieldElem][] +- [vllm.multimodal.inputs.MultiModalFieldConfig][] +- [vllm.multimodal.inputs.MultiModalKwargsItem][] +- [vllm.multimodal.inputs.MultiModalKwargs][] +- [vllm.multimodal.inputs.MultiModalInputs][] + +### Data Parsing + +- [vllm.multimodal.parse][] + +### Data Processing + +- [vllm.multimodal.processing][] + +### Memory Profiling + +- [vllm.multimodal.profiling][] + +### Registry + +- [vllm.multimodal.registry][] + +## Model Development + +- [vllm.model_executor.models.interfaces_base][] +- [vllm.model_executor.models.interfaces][] +- [vllm.model_executor.models.adapters][] diff --git a/docs/api/vllm/.meta.yml b/docs/api/vllm/.meta.yml new file mode 100644 index 000000000000..c15adfec644c --- /dev/null +++ b/docs/api/vllm/.meta.yml @@ -0,0 +1,2 @@ +search: + boost: 0.5 diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png new file mode 100644 index 0000000000000000000000000000000000000000..0838bfa37fe62d60fba9adcbd18c81de0809f253 GIT binary patch literal 121821 zcmcG$cU+Et-#>mLWHii(R8~kO?b0+tT4-oWXlZHhBtnw$Y0*+r+EGeri9%`bol1M} z{e2$ind`c~*YCc6zrTLx{kSjk={(Qlc)wrcIbQGMp@N(=&3eZ5Boc|{+}TqXNhDeq z5@}^Q^-BE9;pF$~_+!mA8R=7`CF1`=Gb3M+NGzmtrz9`i2KF}@SSock3A6@1+)qWV zL%Z!%1PjY1n!6kK>N7nR?z@=#Zsm55u#eYbs+9X;1;fw2I~h?L&GhDmLgEqLtB0vr z)z&F*syyhna-HtVoy~M}cB9jVUr9ge)@jsu#5)X)i90O4&rsejy3i?R?&IUL0++J< zQdl*jxc2W4B$DfzlXKhu{iEUe{~vxj!re>$2KtES&!3lw&BdmrrA0;Q*Tico7Od#i zsCX+|QBmGUHS9qPxeorKH1sXF*^=R;*ZLQuGDWL zBlT|FnCP#w7;epF3$na;=FFMOhs1ka@3Y3ge}BApBk=?M2E0>Vd||vOfJc0$;{`r{ zLgw1F->xfG6^jgg_T`kqwiZY3U;(BEEh5)>o6TG?l%SdGpuHgwHR1efh6_ePG_6&t(2}lj56I z#D_AH28ci)O{#C(wyh#kK6~I}gV_9VUi6E1@7|sCpj}v;$`-fmE*)~7%2>B<-KK+7 zVM-brpDlaJg)IIQ1qxYIMJp+FwYRr(eIEYOV4A&g&5*=UAw-{Zx|;n|$yItKQ1rzkd&3xOlO=N=4hL zfXRl}pnQM*E#ku@GQ%i0zj(8+I!>U)acqnH2}MQ4O~FDVU$@|UCl?pyq<3wMTKZ_* zz<=ZCKN&+=b7LjqbS0L(l?J07KZrd@&Luv|b(RuSt_weL!A^spy5hAmg-`TXM3yW} zwTNH;{`Bn06*NWFaq7&@(`^nznKpBi18o6D@t^+Wu4TSe8KwC5b0o;o=(tjBmWfX1hJa=i5HpjeW`M@9!619PthkwtB?7mADvs z(dG4Di}kP3c3IH%7TvyQk3|tTHbcDQ>B%tJ)x4I-kxOqce44B`k~x3gjYYrW?XTD? 
zAMB?FnXEX8PhmMR;U~MqmuB9)kb5Ldxs08Z%V7K%Bw`b6Q7puxSG>`U#gjoVzIe?tUeF5TSrF+gSO+yc!j+8@@LR2e@5b?M~^n`JA;7OAW6TUjZHp8Y@z={ z$rItNi7rPtI5NA#JQt@6#MelcMk$5|@aP5G{(h+FL4*ovZzy?}C06lU_t*Y-!M2vT zR(`$WKr9P?@2Q-{*w3Hm*NRW)V#V*W=vT+q?K1srK3*m*%|sMB(gVKbYlvRA<=|sx zaWQJQGhU1f6A3P&$gXfXs{W;)KYtdBPBrf2xk)AYG$7#9t-ANt1NEnu-+y@d{cmQ* zx?F}FTG-jyJ-xj{#GGblx9!jOyng$s|D7eQpX zez}S&tE$@Ou4eeh$7i*qSs_PiRP)NDf1W=djTB#ar(5tulK!Y_Vym9`+?D0J;QxM# zTvTXE>hwYu{Icf{P`hoB6Kq2+$eum>KzNf`&P+}ABA@MWYk7J3^z<|?k&cn~Q12$YtR-O}RgYrMr5Z3}%l!hfyW{XO4nS53Cd5)1X{*cj4jFshySCK5L#6(vp{*ZJaZ z)mM$%LlO~DgMvw`KQc0cDthbu`SbKtbSH*$T$biqqPCE0e9}|12IJJzLF69{JCL8u zLbp4LgG6@m8S)}-@xpJeO;?UnGp-X4ElNby zKi*pBcD27Y$z%8Jvz6u~Hx_D>^q#i})HRaFPz6$qYR)i``$?3=a*AwJPc?D+6QJkD za#ltrW9HAR@F%N=#nn;_C)$~p1gYJS0Alz$+zRn^y@8ZUPm?#^Q{n09d6#8}(CzCD zAB~B3xu7JxL>{nEJCD6;`N!Ay?{}&H{jL{m^0W*{e!6hkmBZWO)KY}!`aj%dnf~p` zrR6-+xxAoOGRs?c_18;2EdBvy&xmDa)|PpS$h1yvyvR)8xVY-~6Td7GwEU4-F}+E3 z)7zs;vgVqG_)QuGOUYm5x)4NJ--#yN?pDrpt3Kiao6ep++g%ere!GDJJXrg{kz-I1iALb>Irp+}ycd8h5|?Xhgu33g+D+QOou z4lQ3-QyJw}vtsI{-k*DMux zdN0ppKM@&Bi)wdNI(_<^l8UBJaBy>m&CvY(e8T5hME1QOEw=4mPM^yz$%}5&GAs@h znzKDj+#YG~Yw|LX=;vojT!PUd|NMGoVPWAi9?pE466&tAlz1Ys9OB|iQ1kQgIqgr* z9anofa-fk|1g9D;q`mhvy*c+HHZ~SZ&r1mbQl<98W&HI(OG@g?KZntG0ybOGlfSxG zgpzyYA4n3il>Yu-Kb4`D~*WxsFm5vwkS);xYGJg3^zk+OR_t4rGZ_Ne63 ziq8)xc%zvPw3k>dElf>ym$eUO3~8Y^qGF)qB;mTykhIKlSKnPjqxRwEFEK4<;zC@b zD20-wn2ti|JozDn$z^^d0niUUFv)3d^7H4hjLx9et}C?FaAgGMVm? zcTu}|aWj=9@<|(6Khak+RGYLN$m9=GYGBwdP0~fu4Gc7bMEUZ^E(_tzeI~p6?`2I$r z38!|xz6RcB{sL2H1j%Aa%-b1h4Y05cnF^n$FazuenUBG~?Td9w!UCw$dS zt^l%OO-w$IcriGAKDHs>ignD$n_BepngkBIdxhO64Y?n5)f8f~WTS5BEUvu27` zvc6&yTRj_~GeCKpu$WkuN`h9#&f=dxADufyyp?1aK&e*r0r<8iI?KVvPliE$ekuj# zlT~v=+1hGqjPZ2B-J$3SO&e29Tf`2&CciK02BjfLTz|=H@P6(N@VHW(fJvh&@kSKo z=<1%Ho($&2>Jai$yUw1YIbI5XO`F?((>gL@s(oJbNJ%Qa!|B}GMUe7mXrL^ePe z+C|<3;H{R@t+>t1%puR7ZHljny)q_Sb0zU=Zq}N(Bu~!tD8mInC@X;%AM{G%=g72* zR4GO&-z?{eh!)7SL}gG>R=(fY7oQPdAAo>7u5ncBbDH6Fp_a?S1fAIQ)ezn4%F_&* zl~`soQM=I&)B0yN`Hy!J&8p}k88;|+{!QZj`}ZRp8qEusv|S7)`|IRSjGRi_bL?u4 zKeujl=9iwxkUg9L)gOSed)O09Y7^UjvL~GA4!vv0HJS>D@=DPBT3Rzw?0}!fMP=nl zpMa_u6>Y03B@hg5ndK#+bXn3~9<$aQo?^jvhV1EY0eZnRvwaB++qU`q^Ur52pg^Np z9==XXOKT5RF004xQP2`rv8-7-9+%;}%*hB>4Wc}GpBVkl7UK9GDUM1d|B62jVC3-O z!{T#wCD*3*tJ*#OA$a87%ytAtJ-;REme1c$``D8;XFFJl1BT!I3?Y4=gzhLAn2DQy@qw&jkdPZ*C%6c-lt_^4E{Wr-k0^f7brRC zn2wa8p&>n$jPFsQu_7Mz6hoiq&&xkqbd9tZAZ2a;6a|37Dvd47P0?{#{o^jokz~0xnw5pp>0Pe243T_c9371X@dricVQkDc_dxZhG35f;V!UFHivT zY&@EL+i%cxZ~k>TCYO<=*>7o4aHnmVvx!oZ<6Kw(l?yb%=E{{Te}QPm2xzc}Lkpn5 zGetdee)c~7`;}eALAwM@c2nh!b{2*guN?MLj#b^nX+PP&(|9xSNuhESn4{1F)Vv$; z78o0S8QI|_Oe-+~^u@qAJKnS9|NiY#(#BoUFqMMZ!(Tdr5fULkYg5VJ4q{_7HG za?qUezG*Mb)&S66X6ND>3ilRU@79{u{=}$0*#H~>uYbYcRZo3bLVFH`3afHxp2^C9mpMViznAEtKO!1ejqzJ2?aSwqt@#Mgmj z)+E#wxP>YgZJHqU#JjxlIZ(2AG_>tHeah~W7ysI_aI&TyVJ~yx!bZ2&V28oOM~*zg ziqcc%LfRSm!4^U#X@(mbA8Zm?{-z*GPNKzus(Vt{A^mcq^PUYyLEtmIOiu-LK`13- zV`JdkyAPRUPE>;eKq-N60n&Q&`t?&mWEk~eSt*&kXl_1&x}RpzRUEbQLZF~I1W!qN z$B}OgdG~4NAP#Dky;`|87AE?z{opV)sOed=7BLVO_8WcGr9Y|QOP=;ZuL^dqg@J)fYd4kT`SU+fzQs_T zGc&g4)EJ}xqm|ir`lz_EJn-}+WK44GkO3&maY|PAXKK&?X0T~Amp0an1MSx~>AX)X zHr?}ETVoWrs~Gs&LH96B9DX66t4DVgCg1YH{%rYw`wR|sOaD`W*0 zz~N-&{3MM%pk!XtqFnanqi+Z{9TCF60nsw$(*bUKV}r_li;M0^VB-vlB5*8*bjb z2^yB|gdLlVx8FsT3k;fdHP`Lo7B+t4dR}TbTzZq!R1=ekHQ^Mf#HuRB=-gS5K6ma1 z^e^$NIj)j-*C;8O0_)Uv7W#n|i>qXkZzg(K(Q_J|Zuj=yB@nO7>ErKjTcxDZ1T+b$ zD8^?)yms~?!rr;}T_l1lMvlm{NDyfhv;~OG0QqX4HkFz|T+T$QK-CT8GpYt`jV`w7 zs~&*IaH22v!+aRA`J^l;0cfcq2M^D zA_ciZOSZ}B*NkncqBMjOFd3Ilq}mFAzo@gKLk1lKL&6{51KFJL48UG-r`K} 
z4s&;RkGf4RUlYsJ%HbxG9VUv9kPuiHUI?jXF|9=GP->HP17D5CrO4B%9O@r00AT^* zUP-rnSMt?0K}KF)9Jk^;oySb5=YM2{-Fgen zbpH4()N<-E5}yF5D4ZY!%=q~D9#Q+^i=2{G0Jg2fBmAFO8Zq(lJL>v<6>cq>%_0H=fBLdZRv)PL`56uDK=2e`Sp zH8wUTnrG;jg&|Q@M5@6c0s2+=OtP~wuA>zjI#&=#l@OCrRVRh{B_Q*x->{+U*DC?o zJbv2FcGzb0+jEt#iIsF+buW`FRMN6QTXmQlNX^SPmXj-nGIe=C71^h(T%B`g75>qC zk7(A!Pl($gPLp+mdRb#ZLygET#@iADM~)s1giwFOPqP|}ziid5r6QL!*M-L3su+vg zx4+~MJU<|VK6+(9_2$h#p^|sMy6tW)B?#wZ~3_Xy~+mL{G-9 zrTDzo&UV%oufqpJYgj6m;qvnIjEUQ|qVf?q`}nwJb`W_fGwD?ZmbMUEB_XP!pz!q7 zt6lsie#jV@4u%ECd3Zj2_<%fs#R%O^L)s34>o^6u}Dj2 ztJ;Asi!+2o2c5ptPrDh?$}&2lc}TA25_yTR08qF*yc$+-*dD8%8egCE=FJ;OPaW9_@}2pSB6`kq$B6eGFg_|AoQZ{k}A*Y=(M4qP0x>&aKVlMegPV?{pCuFSs&%$;SmwZh&F%q@+D(?AhGErlB4FeN^+u3y5K6VBwagQn-(PQ zvN$`@7sG^t*9X<0HjU|oEdg_`?bh6tlL(DB-1 zk1)iAMDqsdT3Fa`f=5UWtZZyuAlSspR7w(`=t}+dUtsR4i-M|Z8l2)p^H(okT!QyB zpbCBmFAW)jWL8hs--N604}bsuJuQ>4BlaEaF7x*z_Tm#`MF^p&ljIvpqR`AlKiCaE zetv#0UTonvX#vzQEZDqdONM37asARzTojpYIy&@J z0PIF6P|HQDHi_)m(V7<<&TC{(d6>_DkMZ(QE_A<30)Ee_NJa5axDCb@cE(j ztvQ#qw1S^LrMDs{3X3&GdO%_RUn?`u@#EJ5m$&`B#=-_ecsZY%1NVJEBK99)(VbD{d8WN84 zHwNuS=$6aT;XvV%wJz{kQA7>fzTDGkw?-CLIgda4SN}tJ;PXH|Y9m*-=2!ytiA~n( zDtQ?SPSHh~BG+SUNM2sM1c-~H zCU6zZJlK3%NIH5d*_XVB4;|7cSojV}}HSsFwMb#6m6IwRS zZoy+mnVkPTjV!AQ~GKLokwKy=2ENXAh-QFuEd#yE;0sWT>}zsfhRn1PD>0 z9}o?1+R;V63)0nWza3WO3ou+RZ1`e~VW7EVnE`F!AfU5=V`9x7;ZDdeT3#}GmdI3~ zMB%*+3@G3}>?+E0UZVeFwOW3kD&>9G*RNlX8;D^{#l4ZE0x=6vaOu(|U4&k*H+*4q z!aW)A)<7#ZTaib51SP&HWq1J2b6kJ4yb5;?!oE#J?8s^{bF=#;xsGtNz&YjcbP>ni?aR>l@ph%te$$`|CcrZWf z(bAK2OoC=tYtw*TOrecj+r5p%=&qG%AE%lW)|&|Y22x{=Qsqu2b*&OFC~JAu<uhOIg-DND|5UY-rWR5(6y3gMcf6j|A||CTt~jNS`~Bz6y3rfKi2I^ z1;*7%(WR{LPF{TlTpHjMl(xs=D|4K6e>^`xtQ;f}g4dl2BC`Y9qvS=;X+Cu5kSgd* zZ5o;h;@RMDq$+te06G4R2-kWa%3Xx&)PAu-&wiuF#g28*yJY%0S>s0m;3MTs_xgvJ3;ZQq&4XXPe9CmKKonh7Q-+=+=Uax;-w zMQJ8$1OJsus6qXWpFY$l{R>WD>EW*sSEi7Gh!(c&RzM51x;Z`E1_Jf?7+DtNJ+*9< z_G&-?0i;BPs+(q}=;YluEO^Op{L;r~8^4LS&>w&JY6Zk+Mb@X9E&=(I*&+rI#FpmA zfQJYhm~w4-x@5hnmJYZ5U!`p2)BmxQk*>wQdD8(^yb!YxQ$Qf87=EINWu-Y}e=Ns2 zEHsowKtKTb4_r|2go%=uO9>FyKY~y=D*qi_&O#ggV~D{3D{(%|Ft~1F8GkaW2F(`AG$BfEHGw$nz?R@E_iJpI3-e` zCEEprhUhcj*Lig2|fyo{cx@F(@*>bA;!U013C^^`1vm44Zgf6&+%yp5(3 zsu{665b?J$FsNL5l4p&oCtIz+;x5NsXH{NTG_)(k+M>=l#2R2C^O)-9C^<290U!0N3x(4Kg1}%I$C)R) z?SMIk%G5ct=SM-pUzVgARNRCOK;Zn3rR*hQ%MEg(yFG=6(o!P25p-d9i_sRPk&&kO>5iS!rLT;3ChK;`e{BxPu@c?L^Ks_6G zl)|(k$`9>zTv-t$_swVBd)mR)N?gCk->Ip2I ziQXze5n2q^T#A%S@>)k4b~*ESM;W>dA=M{jPO8yUVH^6&!lem3XstI;pF)gs7IJ0J zLs&tXexL=B2OSOJGznP@gEElrk@-&;`OecYxMnJ(4m4m-i1{EwRYRyo%R$h|N=w7k zXXc-pp61Ls40EMRrJQk*p$JVhD2@ zcm=oCSaITOIW!-v;ydEf(f((C${?I!k$}l4AjA+v;b+%=g;v>sshJHMHbABt!Ni+N zP#`zvvUc)-HcERjT1`}`q#D(5`o8 ztiuh#oNe2;XH8VALoIp{VPlYgFm6kf zRW8$KW`I!5kUTJ<@(7c*G-n4=O(7nQ1J-0hU{q2uDGU3yQ!miG`ifqPX-k&%&}x!G z4Pg+0ci$sM15gkru^||2^yc5DL@2J#^V2B$_dLKzA~6|;91?=t58k2wVa0^(jT@a9 zvmB|sB%e7yTBy`gT8`R)zSSNb6CWMdGVnh_nKtbxfJYFhf^Xld&J*3jbyYRcLPFMT zFSb=Fs0ViKHWdwS!^)@~F6%azvTWVHU2I{z0@Erp5Ike#h)dqWCI|cXq(;1>dLNf#_#ZV1ROBUoi#1@AZVBqg>WWnPtAzsdo1GVTvI6xoCMsclFv&SHQ z6o%H>*z_Q|TQ9678P-6C;)IY06u$2S;m~W^H1S>gN}Z`5`Ul0qWh$ek3F0{huMBMe zxNnj37jsH4v05>GFr^}EfzS+4@Q~;CJusME4yUXXl+G#pI7wn53F(Gf@u4?HPY#2h zuS~OquqpMI*S>^MplfD4ZO^lLcS^t{Eq^LnJ@p8_nGXG7Ln; z($Qq)?ZP`O)d~h=_aaNLJa~rH3v7u)OV;@bkO7gF>1iP zG+lsn2B?|)#^uZn@6IXvAcVQgYHGr`dUU+0HXUf>gf)#eapqNTP7?OyL$Pp7@7#9I ztm)tI5oWh;tChwYK&P!hU1D6M>%bqdsOwtlR}-_|=s2S+)JU!xOwB~+8)J;%sMIPF zuLLndfH3kfYeM^lU$}tslIXpal*knf83OAyRG$JT@h+_}!C9ek!w4#E&4hD=H9UIg z5Zo@s@H0eL!??>(waZc($_|TZKN@|}N zu|HE72?NQGPfR>U?FMxg#J-0pBf^!7v7w=@s88A{hD!8QK$MFOEsNF26rlLIS0Kwo 
zn?;LV=N1YZfEa$J!`LAK@%v4Q@@7!+mT*|HzQjxp#)wJ0mkDPB^mI>rf!D4n)NxE2 zoZ7Xnwgj4nCPt1>1UJ%|W(<7PUL5r+JF_xW>s<=)bIzTcXc{#d7QY~E!*GMvg)JUP z=7;W1i*o=Nuntyd`XkWG_wT^`-}rVd-2mHG82X*G5>h44-@wD9cnD| zLlQ@EkAYBomj&ayg@gwO%_ZvOfGLv7nlM3-Gm`X}o14LK%n%AYaju{as;|6o&l`wd zEYv6~AK{*1$tffGu1h0;PMVlYLK-1RhyfQ}m#y(+F!H4V{}sH2={-bh*nE`1LY zsXr4A3ev_JBMul924KFF8E1vJ6LAK+z9nk`L8fGSC%qa)2aVNopq`J~jo3PjkpgN^ z4kKW^zZKI*yD`F(R#=Pj7Gb~PYmT?`csRw##&RKnmM}Gr*?iK38)0BfDnTYhC?TW6 zXx}rxJf`yngN=7tzJLGjkCC5}U{SKI4!=YMvJJ)VAxZ=>R7Oa~w3g@-#L1CJ5@ueZ z`UV9pfEmE-28tI)i-%rF%YyRWMAQK)NjMn5mq>gDI8u9w!A1$u6f7Uk4ZMzxJt`)) zXo`7;cag!2IJ%HCn&7fv1bZIf7Ar|86vVg&-s!rko+Gsp;2{B0$~J`N1sNG)Mu6T3 zs{v8=7gP$jRzytzpONoS_^NB~Fie7(F94GbZddV%XjTZ%UKZ}8{3bmFHWN7uwc{+WOk{{ z+u8E#Ch1eKj~@gP-ZJ6tyM@BXfTBPMSHzi!Iq(4axUpa~_zQ-lULX`GR(5YV zd3D9-&FHB>S!OWzYC^Y)Yr2-j5{Bc%vE6UTB-h2^`$HI2i)~p=W0_&=GteS zhDsK0{yBL6-3exw%*H9cnuV9#gFAT==B!qbB=*6(6~vJQC~w^Av2k%dbG*Pr`Wvt5 z>dsB{si>+R;^1J~y!p88@YQpAR3xv<`ub6@W$f%e!8C=bk(`opAZQbbbavyT9R!+c z&DgSM&tPxwd1_EJap!r#^QJ2#h^l)^(XS>K#6Py^xjBJ>)h|YK9QS6JEYFSDGUz_ zyMF!pe&YVQd3iU|&>V0Fs@K6R-gCLN#7*KAngn%khoD(2SSmFuuuF)bdDt7Cl_VC} z($&?~?oSCr>fXJ!pAsb3{C8()95!#>jBeB?vW@(HsqctM{nqa82j%LdX*uAAyKBJj zFpuST>p1y0@IPK2k&$d1nN>A4A;)yw-c+n0vAENNrQ)MEOYPme7aZ!A{prKP!+CHL;9TS?((uwQq1$iXy*td>0i&t8 zx%ofu%FN0t;touYbeWYoHd5!$KQgGx34BZ3@4Kv7exw<;1!)Rn6gOk!O7??7%Y^O6 z57FE*Hn#iBsCi@!sq~m~f|l%Eq|P~hvI!7%50~Zc;UPm7+ot(Zv~qKE-JgQrd+p>o zLxj$J2*xWXdn!EWH`CJ(|M|6xxKz?+1t-Zov__oop=O0{_#9%@(m%xdgg*OOT$}}6 zm-P*1QnD9kV~F*cW`6tjZGC;c`%~;P8rxZ6vh^^%rX)nX1v~-{EXs9bQxjMxIRZ$P z810hQPqgg0_~Rz3Nfpdaxx1iMGQ^cXeb5_Yx-dUa&5EuMU>CpSNdE4Tbc-(9g*BD= z`DYM+>R#KtfD&92xd(+LQ$F3*sZ{Jg@rxFp&u&drI6Zf*W0oTPHjFk6VCTWOtlH8K;;!Z|uUxeXzA`_) z0Y&YcuphI;g$oz#?S)ydV@3{$iM)ni%D5abnv$27cK~%9FGrq}Cr{GRc>^7tL+J+y z*V5NNM1wWm|7sn{^-$5eM?yDHFxwSnCyX&1$m(9uZUu*4a+Fx*X}jxsdg6AYZ*Z1} zlT)tz(>w|MMe;5Z@H`P8wCU;TyjP3Wh;JfZ3e4RFT-aLFz~{xQNfLs_#>Nn@zx0|S z9lh47OvFjBJn;5rbB96v8tjM0hbUw@@?o&wc^`Or`P@1v?v8T+WuXwFp8ujkue5k_ z?cblzVhte$s?oveuRBOt4j@mg?l7Y^%j1y=(@SWdS;9vER4nO23%^&iw*eb@mhNOQhE!uQ! 
zbX30-*}CQ&aWJP z=G@u*!(&$L!%BLQS+VtT=7w%svk{@8Iql|}tZT%de*Jn1J8^!O^y3{P@D0wrS;fLX z{&W2}AF(z1`!RwJUH7!ESjf=e;2s8sso}PhKW@Tv8Ll|_;TUOJ@zCMJWuyTl3*)Z% zUUlT-Dku}F3txy`z1t0)uQ;odDyjUSB>P0bVvE<~$Gk-5T{Ag4SR z;L)Sr-d@1Ll;q^f($c8HB0o+|@aJs(6NZ(CAYPky%!%u>8k|0TuU2nyrc`>}(1{hT za~>8VSXYT~LV^SgkUnz;o74~1fiNueLv&xne-;=2hnn@tO zh_LY6h=>JzjVA+F{I`uToA7dfT3T8Pzfn@Nxxf0DA#^drfbteGfW5@R08l|~V{F0!(}(6u3qYu>WC`!pvt z&z^b9Z)zasve^IU&%vw*w(jqJ@jESHvQqX6)Q66Upsr!b8hI)F=QQOT;#(8=cqEGiunEBAk-o@Xp z@#!VO&NR2QjMbFsyQF}|AhQK~z)FU%96AwdJy;Ab()vK14<1H>rngxT;AE%#C zCJoCvIywdh2I?P$sSXQDN2e^)!REk$2si}({{D#j=1&%pVPSh&SXyyB4{1119k**L z5CSDEXt#z4g-cpkr>nx{mLdbJtHUfNn#uab9P!qw&z&U+6HSPZt?G*kX!D410)1yXaoKIU*Df{+riB4%KRPO6rc>RuS!p6<8Wy0 zc5BWGR1(&c_aV!lN9};BFE3wGR%Q+iH2(ZC)*k&QX|7ONNlh)?dO%Ihz5#Q#DBKuQ zibk(KbZC4CJGQuhN)(ZYi7Xs?@JQF&vtx%XKAI5+&Ws_)Bj=t0#6rnHDU}Zt#3^5P zsqfHgx_*7mrj01=^zVYR2z!jh9TI||u<((?hm{x;Z~kILN=uzJF^T{1!GfuR^T-ht zayE_y0Un;1`bl-nw*s>APL7V+FV5TjVq1cki$@i#V?}hw>IQpxgoH5n{^w&?$V*9W zGhwV-n(g9(Q^T&VuJ^C7KPqU42Pt&y*f6#ZP26bJ?q};aZhTWEbN~K*06c@l_Qi3n zY-c9~-5rUmIy&Yvqc_k?(u??=W-Xs&u358&h;4HzDJhJ9X+GJudGnXA;2_)h9%r)i z@+NaiZynNIP2yvNbz*G%i~DMI?3LsN4%QAdLKH_S52ztQC+wc*LnSP$u8tQ*&%B9> z!Rw4-hX!R7BESk^Pac@l3!`<1EFa&ycN!kIL4R+r+Sm`9{#t!KPn zY)qk6pj^XP2J#M|a01-@w9cthrz(>|$a6}>YB-uQI!qYvYa1GpUxl9j0QD%#ai&63Zqs)I@Rh2594W$)2r_=+~ktm;n*Veqeyzt+65$Wj{%E1AL z$uC2ri&d4C{{B5{uSoL^rLi11pyapeVQ11xSJUH=b?`fgaAlGA2O}A@wY3>>bnye% zny5E~qcArafzxx|8`iJSV%7Bpo-z0Tiz`T;#T?epgoXdsaHu%AHd#5g_~UXpLyHU0TB z%a!i*&oMC%ClqxVISRYbNMC=)&YiIJWC@!1ZowN1+>~a%Pf$=0?E-@$$bPU@Yzop8 zA--v@4FNZXYK6-_c;LXu%sg<+N3cg?D0~gg`{d-u1<(fdlkI{(OMFIN8Q({-MD`muuJ?`?#6(2+1O%L; z{!vm=QdG3nO1hSbCZO6sr6R|B+ewX*LsDnqL&J}9yL-1K)4?ww01{0zDk}|*7Ye$6 zKtPfwL!Y|6{Vb+X!OdPge-67B`A3X5#HkCcT6Gse@bkwHKGlO0#AFQ6AvO!M4!}Xg z+<1Ko;auWb0+m%&2mEWjN8KjkY9kq5C+LIrY}&Mm7aL#!i&^MqA0D!Up0JJ;ZU&G5 zaps7FP=c=gM#ft_20)8UPPWAKKfzbcddedjdBFT$m2~0Y`zWN}sMFUJ9!juK<1Opg zW7@9@V-#<(gNQ$Pq%y)>)7+;{pAN!WqTB(wVH%!jka)NPJnVz=!5RZM#E}8m`mC%h zT-o?9JLD$D%6NfqF!z63W#AHs3vqk5JI|}Cs^a!AXc-oE4OPWsK{LT2sut46C>*- z^cgq;q=5H>2h>e~MIda{RwPR*0&9||i};l^Ggh1;2DE2miv+V9-)DrdKrh0{ z-us_GGD*X_2BEzTb5ld08cOwo7&C!a*^GmjfT$1$EcbdC5TB{-XDY^HZ@ez zOgy&7)s-~q3k;@_?W}n@7|{lRFgh}#V_H;Li0L#Unc>I-RIC2dJm5~hdFM_$m;#_@ zm(d_DUmnMj;GT%nL68y=2{F1WjPi6-xAmjfX}&!!AfUItXRsw(9D}K^^Y9mWP`$&L zpikeO)b_A%=V{#AKvM=_m`^4*LK;(NfG2<}Q#b{(olC2zeL?%Axp`1Rtvc1pT%4c8 z?Wn7&e!m6J8U~DvoSZX;%Fyy5Fo?o?dD)U;dc(+Q*Y4d#&0pF_^xqw5fEEVsU8{aj zE-(!U>*emfLPB;xO5EJsMZV9UZx#z-W@2)})GOGKn!0+8w(@WL+`aC#4Gj(s4j)pf zsa9f?{>#Ed#Hqz84ph*l!sLyu!K7Fd7ruNhPToG!OzDpyVnQ#by zDQ9E1CnqOk2Cg2fa9=3!4W5iMi(Q_c{{G`ffWN=0y86;+A(yzlj&NlNbTtMBKh$IH zuI*Cafq_v;T01*oB|kvYpo1%R8{_d}HUsrmi}N$Te_xxNL{o5HnB_Zqw5Y18>dTkk zfQq}>K&8K8?IDEwXL2Jqpo=1!p+In`C1X4idW$h)%^X^Z{yy^|U z(*)=f7A+8&Qp+&;kc&k4aqI`m9zHki(?cbvBzk?E7Pzo_7>i3l6}1)H znVDVCLcr1fbakOUUlt)|>I_Puhu}a79&Ds~`ZOBzH15%6sCi;^^avN1xXYp=Kn_Ng zGL%Q>rX~l}8F`KC2?@_Czz|Fp8s_*4aqE76l=+Vzts}Va+_?if{dxrECLBg(zkU42 zxW&S$S;2^eZ+B38VYrD8+MP*L`iSjk1fi{igZ8J}5g_b%(iqG#`0Bn8x+6vK^t)Sn zdeWZtH?CjD*+d-!gE~YBl$q}CZo*RrTM!j>LbF57i=U}S{dxxq8O$HZ-a$b@p!+x*Rg?2@;SiT?q^0mr$=Kb14 z(`ngSfy+utagz^{XFmb}L3z4ZC%jKYLht7YyWp2u_WR?1aDqJ>K2ytsnK}Pw2QKO)7E9_anV z#1F0n?H0c=gT}+I5?}eoi!hduh>h4xYan{q@b4312v+SXeE#g&V>~?JnwnTAaN_ed zUw(c6Zadx$Vo4lAId$qQYAPgDVQgPC59Inzv@-Xn_#Zs9!T?XBfG7D5@+R5`++#!` zFo3>_rnLxEa1yEfYE>!>4ZL`(Yiby%sobA_`631J3p2LZuJjm`GTfm_RkY9DyQ^3` zFkptj3pl2;l80^}u(!QEuQC$ji6PMEl)MFT77kruv(#=TCNymW%tH@uJ;ckbh>I!s zr0Sy1BtR;^-}feG`$3n&iJ%P7xiWVC#9{l4Zoiu&r6cOb5yi{asyd!LAunSGspj5yeS$v(rZ 
zP3rCESM=qO9x%?cap&>ts4O6nNPi%=BIbK(#VU_6HNqW{x|VkLj^o#g9oWtP;M z8XM(8#AK<>EiC#l4PNRtmo8$~O4@R)D?i#lz5#q45g9i#k43;$irNhBt~ie^w1&}v z9Y(3RVQVYM3WaSSw4h_hjzxLLew}bTO`Sx}Ks~(!$@{p%(dEZ!aq!b@eipGO$(2di zQZHV-h|=L>*t{8M9M7}^gRW=FGwIM6=={`w?TR8(=qJ!h?zp$OiI@S#LOFu=hDDnK zrU81r6fj+sjZ1i4jdQxN(_BFIfnL%3pd#0w(k{2V)A8-=R}4PIYJ7U087KwjxCA8? zBRyU?^o;T$CgzNomz0zYi}!(%muWM!XD%faWflL38nD{L#YR-RT!7($qx+;#c-y1OJZo~Rg6?7j}vDXA<|x$ zia=;#qnt3Q2jYiG6HvMjA3wsOQ;K*?hXsy^NX3@JdF8%zRbsKlCKtjFH}*r#Cg$C{ zbK)n93ky*N-t59DHsaK`ni%1&46k~Ay}bmd6TElw+<$ld8eFgAH+Yc>N~ZePu6>E! z?$r5n+mn+Wuvx0(Fl!|<@EpTY(}eVmqevSNu+cIFfTLDuZ2*WVBJK*<9W0d;`(;G2 zj0!J3H5E{2M{q$)OUN1wSi~lyAbu$>e&XAV0cyZa8U2!Ch*XuAXqX8YPI%rU$jiEW z&o|=q9w6sSaXcPrhHWm0TNefg`uFP*;2oj$2^crLVPGHx2@=x0d8HQb4)oxcaTtYm zTfL!I>@_BgKYSR&!!*Ghi77jLJEl0fpv?eAen?9CWZ4613H-|g@!>Ou@0erGCGH4O z#Eo32_z9%7fGIDf0!&&yKHNSzJbYDc>!shq+GuEY4h|Kz@Gmd)a5Ufq1Z4Bmx}YfM z`^=XP!9`h`Zl`-Y?#s%~o}pVn3x+e>j*Jd)D_f(`#}Lc>1sp{`6qFd|!E0T8Ps`|f zHPt^gr>I}6xH_^=#zl*2(0K3IvE%I0SNPa5#w{Cu`(TP;a1aR53R?r>yRHdhMm| zxdAK;ghs{6*Py+qZfA5)p3K82rV^8`fx(Z)XFwg&r%pY1NkfDy6gSP)Z%M{k&q; zno@ten6C%f*$c-R9(#JWN40^g5RMJhZo*>C*pK)GE|4X79!7v)=N-J@pR=|0;l1GC zJr;K^E8(#NywlLjF>866W(ze{@}+~Dl%3|_XW4Js!FnB7QmWU#Bd@mD^a=}sR` zVWFokuc(-Dbgnx73#}M%2ZHb8#~YJ&$FM#e*=Ys-0dbuXCvQ|P7v|&Hs--JC@p}Na z?`caHQ443Zji4D!oxF;>d$yc0QOz%BlT#2E<+(1wN>OV@daO*2Mb#>dp zR!UX7Gty8-D_Gdrq$~GdU>=Xg+Y1BVf9PrmS*wMwLq z3L;X3OR=3SLPA&I^K~bp`r#nvW-0W-YMk6Z?-1}O)SVtu!IM0i9mgcjoOv5ue*vBB zxKS=zf9c08DGUimpc`IDaX(ZgX5a6z7y2TKzfu82l%$5=D3JHx&AfO3QT9rovD zoR=`jNh54AOG{o>{MHT7Cg>G_OfagKhTHBYBn%<1;WFX{%+fzFfCqzoz1=u_FkkPo zo!tx`0ASDv5BkYoXDh1-)PsWj{LNBmAc;VK))P8SB`UxuHw`!_zlZypJMfQ}lcOVJze`j4=c`U7(~gS(Oa`BI^46d^EA9(8ZYgyZqbX^T zH^i6f1Y(6I{V9s9O}NyE%|E}X6k!BbM>60BDt zK|$n1Q)#IglpG|js@;dAHe2xu&|W8}1r&06DNOHRqyfmec9lPaJuM+~GY$5vUgLuY zC%R>w6T>+=NKWoqKX67C=l5iyO}JnlIL-2UK@J2eWg+FetX3Q$(y~aY1Lg@$v$E0rSVa0s@g%>4+1E-dDNw^BqU*p+7?} zMbu^Y{sLs45k2Zl81XHv=agFvjf_0&za8z6B^1I0wW3OlA41e?ZM`b>AUoRyrKHLv z6jNYWhW`8)Ezxh@xK|(CEM>Jgq4kwcFvazGOmKG?v z1tSU?mfvD`JhUkk-FV~oRSzCLy4GtC8R?7WjVi>0VT@_afqnb(plKYwjt?p2hFN_r zzm&PpwlrPP;?D&I1$x#~($ab$N7n*)084N-0$LCm$pK(}*dUCXf?XqzPKbR&oTGo? zC~Yh94XpN>kbUOncAj3G=)9mu*ytAWRVcn{PBJCCz z7w50%Q;#+upryPS1!55CAHgy83zyyX|CT#3>j0D`~BbT|J~cOJ*%y_@9Vy<^E}S|*pL0#592mM>7lKt*>S)^6BCpBBOHn<%4anG z)B^ZMT-uuRM;YuQtV7&{GRjn*(opqsZvKl*kY?%Wb@YV_O?+5v-OH-bQcur2I5;>p zwUJFRojZ*K!=yu`_o{!6FYcsnKQ`L56mnf{g%_HH?yECsg}aquFlX`HaY3W}HA(hS z&O!%4w2f9(z4vXg!TLQBGvcUda1%iqn0Jt@?6Z2Ykjevzn2s*&C3|7~xw~xLRTW6E2)al#4xJ}2kv7lkgODV7*5y!0>zW?uEk!&?l9>h@9F4(ea zqjce{V|igSdnuhRj5fJ(^X7zc<6!4)kw>YhAOIP1h`cC&NzTMYmx$Wrj@~!fvXBe& z%*+zbo<%6Q;_B;8H`bnB_@U5|!_UCt!`Ig3ATjYG3-I#Fvm)UH3#}MwCz9oR0fpZS z%>DM*ty#SqO^NNX;NYwDUO^PNy1F_$f1@QgE+xg{%n**ewA|_YQ|-YzN_M@Hy$?nV zXaa5uEj7uXlmcfDA#lW=2KO5otjDg%&(54oFo*D4^4#ump9isNx)B^zqS|b}Hh$`) z$#5$2T5f|pFH*U&8<~lWNa@h(@agBYZo@0kNlMK#A~SEi>CS+0B!}KTR;lYXcI?!t zn4<@M37i;7KVkfK?P(Lh}zz2#R|Yh#^np>*0H$@vcM;Z@^e_-qgXSKfXnTMckH^_dGmmFE7H*;ob;`;Q=Oj}Y!GJs95S-R8?$$(%! 
[GIT binary patch: base85-encoded binary file contents omitted, as they are not representable as readable text.]
zjh*~WM&9z1rY3Ag{};MWk=mXF>;yLtdvjmEpot>#-SCmMS zWr{K1aYNyiOw)EnMKQ&hiHH6h)0Wp3J%bJpA-cC1I!$jzy~1Pqxg#!qFdkLSXj@3X z*NVG-pTf5MSo44FUiMaAmwk| z;s5;j%5tObx2V$FX{m;Jkv=Lrv}HAP)rSw#xcd(IBFVQ%@t+?dDwr|ZCsi_JZ>L2X z{)DSoVGk7r1=fZNUUU}P!uiK0gamWJ) zEr9%ZTLrtemo8ss$*tB7OU60$N=5WDL4UMX@RUPWRMDA8T!rw2R_M1LqecI8KKT9S z6&TLz^Wy<|C#-&PJv~Q73-$%8WJpfJIr^j_iDXoeenBSNqh$nowf@?)ZDO5IMOz`*cXv;ZApEQJ};Y53kR}g%2xeG%E{&=Z3f>ep7 zYdJZU@&WzG|HWq6MmA_r#BJNwK}%pQGcT~?A$S)w3ePFz42oK0JVEX|WQgo;_Kxqd zHb`*YqU_KVqYg6<5EM3rPbcY7iW4ISlb* zTK(sf;(I@bqlHdD91b@YhYmtQ+?f8~UsSA69GDpl8>tQRxENiDF02M6on>jN!b-fU+HMRW+bP5y<4%fhhVmU2 zJDh{s3QeWmGI*yjv=5Tt(MWs`l|J1&JZB%zext?OT8Gb`y?6C$pCLo+9*_U&Vq^tQ z%-lId1bxNy;eQ(kJD4BZF!%IcxXDR+=HDc$cBTwi7_zC+EwbNs*RJ0&LME^w=$4Hc z#9Oa@;N;;UAPZb{bNx<~Lo^%df-y%2A+I)b4-5_I#28EV?AC2Sr!D;T&C)6q<^F%G zw^gtMJ3F_1doCQDtFwk`c~SFF#8WLDuUh+kx~lMD$k3tGLd)@^HPys2DmA$JNP2oY z>tQxtwqG_!c%NkLF=HNHzAT)Ag3C-EXHwF9%KVy_v@FccXG3tO8s`S}6*K>43XL>m zBCWvV*q=<-N3zxn1fGe8V@EopU>L8ohThl8W?1V8FV8Q#Lj+hXvnnwCD!5GB!pI$o zp-q~-&@sp&EVR#2LGtHqTKy!jjE}n}T@!pA7PmAM1!FizoQ+V+-r8ai8f<8P8ab$y z>DEcTccP0G=m|ozAZ+6mYH@1ctnGh&KV^1F@Bx6U5FG4vJ3`!YMa;&EirX@KouG|E zOo(a4m3@o?h+bT_keQzF}*oTR26qIh! z&rrG^I(SgFk)b@D131&v8&AI9g0&$6pa=nt;fBr!-k)T+wNgpYHJeUiHZF8>G!j?1 zhvLg6HhYGk#=A_eObMzyN{?6f%!p zjEdjj<|kCiPSHO+9PB(5LoabWtDC7cY(L=4#1rp0gnN~@T&0FjW#Gm&0{oG!XIcQ> zqk*qz$n2KWJjawlG*0wEM;zz-Q(WhBB>8jdMZ1`{6t+582s!0Ca#hm@y62Vj$OX4%rEj7t2zvgKm5 zSgg((zS}Uv#;y5c(@WT%#WjzP7MPIlJO?U35aOeu-2u44Tj1#SeS7vS4-TfoJq*~S zH%|9^JbPk|6Ot!Oh?cJz6#0;@%r9D-H%1iaHuxl*!J?w67cwk+;{cp+R~H@jTy~@< zL_TrT?EnsW)_L>gWQXp@B~KS6s7-BK1YtQH z*d3hQA=?Aa2`h_^(9a{88jffZdClh46Lb{&NhVkHRe*E=y z{-#d_je+l}udK)oU%zf057zEIdt5v`j=#CBU^<`i4_sXX{>Y6z8+UIFMzVef>;*77; z)8>zByrKS^-50+ to construct the image for running an OpenAI compatible server with vLLM. -More information about deploying with Docker can be found [here](#deployment-docker). +More information about deploying with Docker can be found [here][deployment-docker]. Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: @@ -17,11 +17,9 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > :::{figure} /assets/contributing/dockerfile-stages-dependency.png - > :align: center - > :alt: query - > :width: 100% - > ::: + >
+ > ![](../../assets/contributing/dockerfile-stages-dependency.png){ align="center" alt="query" width="100%" } + >
> > Made using: > diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md new file mode 100644 index 000000000000..b7727f02c11b --- /dev/null +++ b/docs/contributing/model/README.md @@ -0,0 +1,23 @@ +--- +title: Adding a New Model +--- +[](){ #new-model } + +This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. + +Contents: + +- [Basic](basic.md) +- [Registration](registration.md) +- [Tests](tests.md) +- [Multimodal](multimodal.md) + +!!! note + The complexity of adding a new model depends heavily on the model's architecture. + The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. + However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. + +!!! tip + If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) + or ask on our [developer slack](https://slack.vllm.ai). + We will be happy to help you out! diff --git a/docs/source/contributing/model/basic.md b/docs/contributing/model/basic.md similarity index 87% rename from docs/source/contributing/model/basic.md rename to docs/contributing/model/basic.md index 1fa56dc4728d..0c0ba3379257 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -1,6 +1,7 @@ -(new-model-basic)= - -# Implementing a Basic Model +--- +title: Implementing a Basic Model +--- +[](){ #new-model-basic } This guide walks you through the steps to implement a basic vLLM model. @@ -10,9 +11,8 @@ First, clone the PyTorch model code from the source repository. For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. -:::{warning} -Make sure to review and adhere to the original code's copyright and licensing terms! -::: +!!! warning + Make sure to review and adhere to the original code's copyright and licensing terms! ## 2. Make your code compatible with vLLM @@ -67,7 +67,7 @@ class MyModel(nn.Module): ... ``` -- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. +- Rewrite the [forward][torch.nn.Module.forward] method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. ```python def forward( @@ -78,10 +78,9 @@ def forward( ... ``` -:::{note} -Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. -If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -::: +!!! note + Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. + If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). 
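To make the flattened calling convention above concrete, below is a minimal sketch in plain PyTorch. `ToyModel`, its layers, and the example sizes are invented purely for illustration and use no vLLM APIs; the point is only that `input_ids` and `positions` arrive as 1-D tensors covering every token of every sequence, with no batch or max-sequence-length dimension.

```python
import torch
import torch.nn as nn


class ToyModel(nn.Module):
    """Illustrative only: mirrors the flattened `forward` signature described above."""

    def __init__(self, vocab_size: int = 1000, hidden_size: int = 64):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.proj = nn.Linear(hidden_size, hidden_size)

    def forward(
        self,
        input_ids: torch.Tensor,   # shape: [num_tokens]
        positions: torch.Tensor,   # shape: [num_tokens]
    ) -> torch.Tensor:
        # All tokens of all sequences are processed as one flat batch;
        # `positions` would feed e.g. rotary embeddings in a real model.
        hidden_states = self.embed_tokens(input_ids)
        return self.proj(hidden_states)


# Two sequences of lengths 3 and 2 packed into a single flat batch.
input_ids = torch.tensor([5, 17, 42, 7, 99])
positions = torch.tensor([0, 1, 2, 0, 1])
print(ToyModel()(input_ids, positions).shape)  # torch.Size([5, 64])
```

The absence of a padding or max-sequence-length dimension is what lets sequences of different lengths be packed into a single flat batch without wasted computation.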
vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. @@ -89,7 +88,7 @@ For reference, check out our [Llama implementation](gh-file:vllm/model_executor/ If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`. +For the embedding layer, you can simply replace [torch.nn.Embedding][] with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`. When it comes to the linear layers, we provide the following options to parallelize them: - `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. @@ -107,7 +106,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a ## 5. Register your model -See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM. +See [this page][new-model-registration] for instructions on how to register your new model to be used by vLLM. ## Frequently Asked Questions diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md new file mode 100644 index 000000000000..892ab9098407 --- /dev/null +++ b/docs/contributing/model/multimodal.md @@ -0,0 +1,803 @@ +--- +title: Multi-Modal Support +--- +[](){ #supports-multimodal } + +This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs][multimodal-inputs]. + +## 1. Update the base vLLM model + +It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic]. +Further update the model as follows: + +- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + + More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it. + +- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. + + ```python + class YourModelForImage2Seq(nn.Module): + ... + + def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: + + assert self.vision_encoder is not None + image_features = self.vision_encoder(image_input) + return self.multi_modal_projector(image_features) + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + # Validate the multimodal input keyword arguments + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + # Run multimodal inputs through encoder and projector + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + ``` + +!!! 
warning + The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. + +- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. + + ```python + from .utils import merge_multimodal_embeddings + + class YourModelForImage2Seq(nn.Module): + ... + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + + # `get_input_embeddings` should already be implemented for the language + # model as one of the requirements of basic vLLM model implementation. + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=self.config.image_token_index) + + return inputs_embeds + ``` + +- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model. + + ```python + class YourModelForImage2Seq(nn.Module): + ... + + def get_language_model(self) -> torch.nn.Module: + # Change `language_model` according to your implementation. + return self.language_model + ``` + +- Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. + + ```diff + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + ``` + +!!! note + The model class does not have to be named `*ForCausalLM`. + Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. + +## 2. Specify processing information + +Next, create a subclass of [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] +to provide basic information related to HF processing. + +### Maximum number of input items + +You need to override the abstract method [get_supported_mm_limits][vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits] +to return the maximum number of input items for each modality supported by the model. + +For example, if the model supports any number of images but only one video per prompt: + +```python +def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": 1} +``` + +## 3. Specify dummy inputs + +Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for +HF processing as well as memory profiling. 
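Concretely, the builder is usually just a thin subclass parameterized by the `BaseProcessingInfo` subclass from Step 2. The skeleton below is only a sketch to show how the pieces fit together; `YourProcessingInfo` and `YourDummyInputsBuilder` are placeholder names, and the two overrides are what the following subsections fill in.

```python
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.inputs import MultiModalDataDict
from vllm.multimodal.processing import BaseProcessingInfo
from vllm.multimodal.profiling import BaseDummyInputsBuilder


class YourProcessingInfo(BaseProcessingInfo):
    """The Step 2 subclass, repeated here only so the sketch is self-contained."""

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}


class YourDummyInputsBuilder(BaseDummyInputsBuilder[YourProcessingInfo]):
    """Constructs dummy inputs for HF processing and memory profiling."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        raise NotImplementedError  # see "For memory profiling" below

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        raise NotImplementedError  # see "For memory profiling" below
```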
+ +### For memory profiling + +Override the abstract methods [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text] and [get_dummy_mm_data][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data] to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it. + +Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. + +=== "Basic example: LLaVA" + + Looking at the code of HF's `LlavaForConditionalGeneration`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] + + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + ``` + + The number of placeholder feature tokens per image is `image_features.shape[1]`. + `image_features` is calculated inside the `get_image_features` method: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + ``` + + We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower + (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). + Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. + The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention + mechanism doesn't change the sequence length of the output hidden states. 
+ + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + ``` + + To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + ``` + + We can infer that `embeddings.shape[1] == self.num_positions`, where + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + ``` + + Overall, the number of placeholder feature tokens for an image can be calculated as: + + ```python + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + hf_processor = self.get_hf_processor() + + image_size = hf_config.vision_config.image_size + patch_size = hf_config.vision_config.patch_size + + num_image_tokens = (image_size // patch_size) ** 2 + 1 + if hf_processor.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + return num_image_tokens + ``` + + Notice that the number of image tokens doesn't depend on the image width and height. + We can simply use a dummy `image_size` to calculate the multimodal profiling data: + + ```python + # NOTE: In actuality, this is usually implemented as part of the + # model's subclass of `BaseProcessingInfo`, but we show it as is + # here for simplicity. + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + width = height = hf_config.image_size + return ImageSize(width=width, height=height) + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + ``` + + For the text, we simply expand the multimodal image token from the model config to match the desired number of images. 
+ + ```python + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images + ``` + +=== "No input placeholders: Fuyu" + + Looking at the code of HF's `FuyuForCausalLM`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322 + if image_patches is not None and past_key_values is None: + patch_embeddings = [ + self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)) + .squeeze(0) + .to(inputs_embeds.device) + for patch in image_patches + ] + inputs_embeds = self.gather_continuous_embeddings( + word_embeddings=inputs_embeds, + continuous_embeddings=patch_embeddings, + image_patch_input_indices=image_patches_indices, + ) + ``` + + The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`, + which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`. + + Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information? + Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**. + + The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then + `FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`. + + In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`, + returning the dimensions after resizing (but before padding) as metadata. + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544 + image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"]) + batch_images = image_encoding["images"] + image_unpadded_heights = image_encoding["image_unpadded_heights"] + image_unpadded_widths = image_encoding["image_unpadded_widths"] + + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L + if do_resize: + batch_images = [ + [self.resize(image, size=size, input_data_format=input_data_format) for image in images] + for images in batch_images + ] + + image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] + image_unpadded_heights = [[image_size[0]] for image_size in image_sizes] + image_unpadded_widths = [[image_size[1]] for image_size in image_sizes] + + if do_pad: + batch_images = [ + [ + self.pad_image( + image, + size=size, + mode=padding_mode, + constant_values=padding_value, + input_data_format=input_data_format, + ) + for image in images + ] + for images in batch_images + ] + ``` + + In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425 + model_image_input = self.image_processor.preprocess_with_tokenizer_info( + image_input=tensor_batch_images, + image_present=image_present, + image_unpadded_h=image_unpadded_heights, + image_unpadded_w=image_unpadded_widths, + image_placeholder_id=image_placeholder_id, + image_newline_id=image_newline_id, + variable_sized=True, + ) + + # 
https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658 + image_height, image_width = image.shape[1], image.shape[2] + if variable_sized: # variable_sized=True + new_h = min( + image_height, + math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height, + ) + new_w = min( + image_width, + math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width, + ) + image = image[:, :new_h, :new_w] + image_height, image_width = new_h, new_w + + num_patches = self.get_num_patches(image_height=image_height, image_width=image_width) + tensor_of_image_ids = torch.full( + [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device + ) + patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) + assert num_patches == patches.shape[0] + ``` + + The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562 + patch_size = patch_size if patch_size is not None else self.patch_size + patch_height, patch_width = self.patch_size["height"], self.patch_size["width"] + + if image_height % patch_height != 0: + raise ValueError(f"{image_height=} must be divisible by {patch_height}") + if image_width % patch_width != 0: + raise ValueError(f"{image_width=} must be divisible by {patch_width}") + + num_patches_per_dim_h = image_height // patch_height + num_patches_per_dim_w = image_width // patch_width + num_patches = num_patches_per_dim_h * num_patches_per_dim_w + ``` + + These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized + to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. + + ```python + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + return ImageSize(width=image_processor.size["width"], + height=image_processor.size["height"]) + ``` + + Fuyu does not expect image placeholders in the inputs to HF processor, so + the dummy prompt text is empty regardless of the number of images. + + ```python + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + return "" + ``` + + For the multimodal image profiling data, the logic is very similar to LLaVA: + + ```python + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + ``` + +## 4. Specify processing details + +Afterwards, create a subclass of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] +to fill in the missing details about HF processing. + +!!! info + [Multi-Modal Data Processing][mm-processing] + +### Multi-modal fields + +Override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] to +return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. 
+ +=== "Basic example: LLaVA" + + The output of `CLIPImageProcessor` is a simple tensor with shape + `(num_images, num_channels, image_height, image_width)`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345 + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + ``` + + So, we override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: + + ```python + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + ) + ``` + + !!! note + Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports + pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. + +=== "With postprocessing: Fuyu" + + The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates + the patches from each image belonging to an item in the batch: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679 + image_input_ids.append(tensor_of_image_ids) + image_patches.append(patches) + else: + image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device)) + + batch_image_input_ids.append(image_input_ids) + batch_image_patches.append(image_patches) + ``` + + The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore + `(1, num_images, num_patches, patch_width * patch_height * num_channels)`. + + In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA, + we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]: + + ```python + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + + processed_outputs["image_patches"] = image_patches[0] + + return processed_outputs + ``` + + !!! note + Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling + for text-only inputs to prevent unnecessary warnings from HF processor. 
+ + This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows: + + ```python + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(image_patches=MultiModalFieldConfig.batched("image")) + ``` + +### Prompt updates + +Override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] to +return a list of [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instances. + +Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies an update operation +(e.g.: insertion, replacement) performed by the HF processor. + +=== "Basic example: LLaVA" + + Looking at HF's `LlavaProcessor`: + + ```python + # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 + prompt_strings = [] + for sample in text: + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) + ``` + + It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). + Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows: + + ```python + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_index + + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] + ``` + +=== "Handling additional tokens: Fuyu" + + Recall the layout of feature tokens from Step 2: + + ``` + |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| + |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| + ... 
+ |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| + ``` + + We define a helper function to return `ncols` and `nrows` directly: + + ```python + def get_image_feature_grid_size( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + image_processor = self.get_image_processor() + target_width = image_processor.size["width"] + target_height = image_processor.size["height"] + patch_width = image_processor.patch_size["width"] + patch_height = image_processor.patch_size["height"] + + if not (image_width <= target_width and image_height <= target_height): + height_scale_factor = target_height / image_height + width_scale_factor = target_width / image_width + optimal_scale_factor = min(height_scale_factor, width_scale_factor) + + image_height = int(image_height * optimal_scale_factor) + image_width = int(image_width * optimal_scale_factor) + + ncols = math.ceil(image_width / patch_width) + nrows = math.ceil(image_height / patch_height) + return ncols, nrows + ``` + + Based on this, we can initially define our replacement tokens as: + + ```python + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + + # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|` + # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|` + return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + ``` + + However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called, + a BOS token (``) is also added to the promopt: + + ```python + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435 + model_image_input = self.image_processor.preprocess_with_tokenizer_info( + image_input=tensor_batch_images, + image_present=image_present, + image_unpadded_h=image_unpadded_heights, + image_unpadded_w=image_unpadded_widths, + image_placeholder_id=image_placeholder_id, + image_newline_id=image_newline_id, + variable_sized=True, + ) + prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( + tokenizer=self.tokenizer, + prompts=prompts, + scale_factors=scale_factors, + max_tokens_to_generate=self.max_tokens_to_generate, + max_position_embeddings=self.max_position_embeddings, + add_BOS=True, + add_beginning_of_answer_token=True, + ) + ``` + + To assign the vision embeddings to only the image tokens, instead of a string + you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]: + + ```python + hf_config = self.info.get_hf_config() + bos_token_id = hf_config.bos_token_id # `` + assert isinstance(bos_token_id, int) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows + + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, + ) + ``` + + Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt, + we can search for it to conduct the replacement at the start of the string: + + ```python + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + 
hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = self.info.get_hf_config() + bos_token_id = hf_config.bos_token_id + assert isinstance(bos_token_id, int) + + tokenizer = self.info.get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self.info.get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows + + return PromptUpdateDetails.select_token_id( + image_tokens + [bos_token_id], + embed_token_id=_IMAGE_TOKEN_ID, + ) + + return [ + PromptReplacement( + modality="image", + target=[eot_token_id], + replacement=get_replacement_fuyu, + ) + ] + ``` + +## 5. Register processor-related classes + +After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2), +[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3), +and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4), +decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor ` +to register them to the multi-modal registry: + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, ++ info=YourProcessingInfo, ++ dummy_inputs=YourDummyInputsBuilder) + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +## Notes + +### Inserting feature tokens without replacement + +Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use [PromptInsertion][vllm.multimodal.processing.PromptInsertion] instead of [PromptReplacement][vllm.multimodal.processing.PromptReplacement] inside [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. + +Examples: + +- BLIP-2 (insert at start of prompt): +- Florence2 (insert at start of prompt): +- Molmo (insert after `<|endoftext|>` token): + +### Handling prompt updates unrelated to multi-modal data + +[_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override [_apply_hf_processor_tokens_only][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only] so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design][mm-processing]. + +Examples: + +- Chameleon (appends `sep_token`): +- Fuyu (appends `boa_token`): +- Molmo (applies chat template which is not defined elsewhere): + +### Custom HF processor + +Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]. 
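As a rough sketch of that idea (the class, its parameters, and the preprocessing below are hypothetical placeholders rather than any real model's logic), such a processor only needs to be a callable that accepts text and multi-modal inputs and returns a `BatchFeature`, mirroring the call signature of a regular HF processor. The examples listed next show how real models implement this.

```python
import torch
from transformers import BatchFeature


class YourCustomProcessor:
    """Minimal HF-processor-like callable (illustrative sketch only)."""

    def __init__(self, tokenizer, image_size: int = 336) -> None:
        self.tokenizer = tokenizer  # any HF tokenizer
        self.image_size = image_size

    def __call__(
        self,
        text=None,
        images=None,
        return_tensors: str = "pt",
        **kwargs,
    ) -> BatchFeature:
        data = {}
        if text is not None:
            data.update(self.tokenizer(text, return_tensors=return_tensors, **kwargs))
        if images is not None:
            # Stand-in preprocessing: a real processor would resize, normalize,
            # and possibly tile the images here. `images` is assumed to be a list.
            data["pixel_values"] = torch.stack([
                torch.zeros(3, self.image_size, self.image_size) for _ in images
            ])
        return BatchFeature(data=data, tensor_type=return_tensors)
```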
+ +Examples: + +- DeepSeek-VL2: +- InternVL: +- Qwen-VL: diff --git a/docs/source/contributing/model/registration.md b/docs/contributing/model/registration.md similarity index 52% rename from docs/source/contributing/model/registration.md rename to docs/contributing/model/registration.md index 64cd25b53807..e796e49a7501 100644 --- a/docs/source/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -1,33 +1,32 @@ -(new-model-registration)= - -# Registering a Model to vLLM +--- +title: Registering a Model to vLLM +--- +[](){ #new-model-registration } vLLM relies on a model registry to determine how to run each model. -A list of pre-registered architectures can be found [here](#supported-models). +A list of pre-registered architectures can be found [here][supported-models]. If your model is not on this list, you must register it to vLLM. This page provides detailed instructions on how to do so. ## Built-in models -To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source). +To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source]. This gives you the ability to modify the codebase and test your model. -After you have implemented your model (see [tutorial](#new-model-basic)), put it into the directory. +After you have implemented your model (see [tutorial][new-model-basic]), put it into the directory. Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. -Finally, update our [list of supported models](#supported-models) to promote your model! +Finally, update our [list of supported models][supported-models] to promote your model! -:::{important} -The list of models in each section should be maintained in alphabetical order. -::: +!!! warning + The list of models in each section should be maintained in alphabetical order. ## Out-of-tree models You can load an external model using a plugin without modifying the vLLM codebase. -:::{seealso} -[vLLM's Plugin System](#plugin-system) -::: +!!! info + [vLLM's Plugin System][plugin-system] To register the model, use the following code: @@ -45,11 +44,9 @@ from vllm import ModelRegistry ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") ``` -:::{important} -If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that [here](#supports-multimodal). -::: +!!! warning + If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. + Read more about that [here][supports-multimodal]. -:::{note} -Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. -::: +!!! note + Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. 
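For instance, a minimal plugin package might look like the sketch below; the package, module, and function names are placeholders. The `register()` function is then exposed to vLLM through the entry-point mechanism described in [vLLM's Plugin System][plugin-system] (the `vllm.general_plugins` entry-point group, at the time of writing) so that it runs before model architectures are resolved.

```python
# your_plugin/__init__.py  (placeholder package layout)

def register() -> None:
    """Hook invoked by vLLM's plugin system at startup."""
    from vllm import ModelRegistry

    # Registering with a "module:Class" string keeps the import lazy, so your
    # model code (and CUDA initialization) is only loaded when actually needed.
    if "YourModelForCausalLM" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model(
            "YourModelForCausalLM",
            "your_plugin.modeling:YourModelForCausalLM",
        )
```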
diff --git a/docs/source/contributing/model/tests.md b/docs/contributing/model/tests.md similarity index 75% rename from docs/source/contributing/model/tests.md rename to docs/contributing/model/tests.md index 68d51d89f7cf..26880986181d 100644 --- a/docs/source/contributing/model/tests.md +++ b/docs/contributing/model/tests.md @@ -1,6 +1,7 @@ -(new-model-tests)= - -# Writing Unit Tests +--- +title: Writing Unit Tests +--- +[](){ #new-model-tests } This page explains how to write unit tests to verify the implementation of your model. @@ -14,14 +15,12 @@ Without them, the CI for your PR will fail. Include an example HuggingFace repository for your model in . This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. -:::{important} -The list of models in each section should be maintained in alphabetical order. -::: +!!! warning + The list of models in each section should be maintained in alphabetical order. -:::{tip} -If your model requires a development version of HF Transformers, you can set -`min_transformers_version` to skip the test in CI until the model is released. -::: +!!! tip + If your model requires a development version of HF Transformers, you can set + `min_transformers_version` to skip the test in CI until the model is released. ## Optional Tests @@ -34,16 +33,16 @@ These tests compare the model outputs of vLLM against [HF Transformers](https:// #### Generative models -For [generative models](#generative-models), there are two levels of correctness tests, as defined in : +For [generative models][generative-models], there are two levels of correctness tests, as defined in : - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. #### Pooling models -For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in . +For [pooling models][pooling-models], we simply check the cosine similarity, as defined in . -(mm-processing-tests)= +[](){ #mm-processing-tests } ### Multi-modal processing diff --git a/docs/source/contributing/overview.md b/docs/contributing/overview.md similarity index 87% rename from docs/source/contributing/overview.md rename to docs/contributing/overview.md index 89b31f0311e2..7dbf8bfdcf24 100644 --- a/docs/source/contributing/overview.md +++ b/docs/contributing/overview.md @@ -27,7 +27,21 @@ See . ## Developing Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. -Check out the [building from source](#build-from-source) documentation for details. +Check out the [building from source][build-from-source] documentation for details. + +### Building the docs + +Install the dependencies: + +```bash +pip install -r requirements/docs.txt +``` + +Start the autoreloading MkDocs server: + +```bash +mkdocs serve +``` ## Testing @@ -48,29 +62,25 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files pytest tests/ ``` -:::{tip} -Since the ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. +!!! tip + Since the ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. -Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. 
-::: + Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. -:::{note} -Currently, the repository is not fully checked by `mypy`. -::: +!!! note + Currently, the repository is not fully checked by `mypy`. -:::{note} -Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU -platform to run unit tests locally, rely on the continuous integration system to run the tests for -now. -::: +!!! note + Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU + platform to run unit tests locally, rely on the continuous integration system to run the tests for + now. ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -:::{important} -If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). -::: +!!! warning + If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). ## Pull Requests & Code Reviews @@ -106,9 +116,8 @@ appropriately to indicate the type of change. Please use one of the following: - `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -:::{note} -If the PR spans more than one category, please include all relevant prefixes. -::: +!!! note + If the PR spans more than one category, please include all relevant prefixes. ### Code Quality diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/contributing/profiling.md similarity index 90% rename from docs/source/contributing/profiling/profiling_index.md rename to docs/contributing/profiling.md index ce25daa39c5c..be01b9b65f65 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/contributing/profiling.md @@ -1,8 +1,7 @@ # Profiling vLLM -:::{warning} -Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference. -::: +!!! warning + Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference. ## Profile with PyTorch Profiler @@ -14,15 +13,13 @@ When using `benchmarks/benchmark_serving.py`, you can enable profiling by passin Traces can be visualized using . -:::{tip} -Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. -::: +!!! tip + Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. -:::{tip} -To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. -Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. 
-`export VLLM_RPC_TIMEOUT=1800000` -::: +!!! tip + To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. + Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. + `export VLLM_RPC_TIMEOUT=1800000` ### Example commands and usage diff --git a/docs/source/contributing/vulnerability_management.md b/docs/contributing/vulnerability_management.md similarity index 100% rename from docs/source/contributing/vulnerability_management.md rename to docs/contributing/vulnerability_management.md diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md new file mode 100644 index 000000000000..293536e52c4b --- /dev/null +++ b/docs/deployment/docker.md @@ -0,0 +1,126 @@ +--- +title: Using Docker +--- +[](){ #deployment-docker } + +[](){ #deployment-docker-pre-built-image } + +## Use vLLM's Official Docker Image + +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). + +```console +$ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + +This image can also be used with other container engines such as [Podman](https://podman.io/). + +```console +$ podman run --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + +You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`). + +!!! note + You can either use the `ipc=host` flag or `--shm-size` flag to allow the + container to access the host's shared memory. vLLM uses PyTorch, which uses shared + memory to share data between processes under the hood, particularly for tensor parallel inference. + +!!! note + Optional dependencies are not included in order to avoid licensing issues (e.g. ). + + If you need to use those dependencies (having accepted the license terms), + create a custom Dockerfile on top of the base image with an extra layer that installs them: + + ```Dockerfile + FROM vllm/vllm-openai:v0.8.3 + + # e.g. install the `audio` optional dependencies + # NOTE: Make sure the version of vLLM matches the base image! + RUN uv pip install --system vllm[audio]==0.8.3 + ``` + +!!! tip + Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers). + + To use the development version of `transformers`, create a custom Dockerfile on top of the base image + with an extra layer that installs their code from source: + + ```Dockerfile + FROM vllm/vllm-openai:latest + + RUN uv pip install --system git+https://github.com/huggingface/transformers.git + ``` + +[](){ #deployment-docker-build-image-from-source } + +## Building vLLM's Docker Image from Source + +You can build and run vLLM from source via the provided . To build vLLM: + +```console +# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile +``` + +!!! 
note + By default vLLM will build for all GPU types for widest distribution. If you are just building for the + current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` + for vLLM to find the current GPU type and build for that. + + If you are using Podman instead of Docker, you might need to disable SELinux labeling by + adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). + +## Building for Arm64/aarch64 + +A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use +of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. + +!!! note + Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` + flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. + Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). + +```console +# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) +$ python3 use_existing_torch.py +$ DOCKER_BUILDKIT=1 docker build . \ + --file docker/Dockerfile \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t vllm/vllm-gh200-openai:latest \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ + --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg vllm_fa_cmake_gpu_arches="90-real" +``` + +## Use the custom-built vLLM Docker image + +To run vLLM with the custom-built Docker image: + +```console +$ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -p 8000:8000 \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + vllm/vllm-openai +``` + +The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). + +!!! note + **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . diff --git a/docs/source/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md similarity index 78% rename from docs/source/deployment/frameworks/anything-llm.md rename to docs/deployment/frameworks/anything-llm.md index d430c170ef54..a89e633c086e 100644 --- a/docs/source/deployment/frameworks/anything-llm.md +++ b/docs/deployment/frameworks/anything-llm.md @@ -1,6 +1,7 @@ -(deployment-anything-llm)= - -# Anything LLM +--- +title: Anything LLM +--- +[](){ #deployment-anything-llm } [Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting. 
@@ -25,23 +26,19 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096 - Base URL: http://{vllm server host}:{vllm server port}/v1 - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ` -:::{image} /assets/deployment/anything-llm-provider.png -::: +![](../../assets/deployment/anything-llm-provider.png) - Back to home page, New Workspace --> create `vllm` workspace, and start to chat: -:::{image} /assets/deployment/anything-llm-chat-without-doc.png -::: +![](../../assets/deployment/anything-llm-chat-without-doc.png) - Click the upload button: - upload the doc - select the doc and move to the workspace - save and embed -:::{image} /assets/deployment/anything-llm-upload-doc.png -::: +![](../../assets/deployment/anything-llm-upload-doc.png) - Chat again: -:::{image} /assets/deployment/anything-llm-chat-with-doc.png -::: +![](../../assets/deployment/anything-llm-chat-with-doc.png) diff --git a/docs/source/deployment/frameworks/bentoml.md b/docs/deployment/frameworks/bentoml.md similarity index 89% rename from docs/source/deployment/frameworks/bentoml.md rename to docs/deployment/frameworks/bentoml.md index 2bf435bda838..7e64b6eb6fb0 100644 --- a/docs/source/deployment/frameworks/bentoml.md +++ b/docs/deployment/frameworks/bentoml.md @@ -1,6 +1,7 @@ -(deployment-bentoml)= - -# BentoML +--- +title: BentoML +--- +[](){ #deployment-bentoml } [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md similarity index 98% rename from docs/source/deployment/frameworks/cerebrium.md rename to docs/deployment/frameworks/cerebrium.md index b20c95137b6e..84cb2304fac2 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/deployment/frameworks/cerebrium.md @@ -1,12 +1,11 @@ -(deployment-cerebrium)= +--- +title: Cerebrium +--- +[](){ #deployment-cerebrium } -# Cerebrium - -:::{raw} html

-    <img src="…" alt="vLLM_plus_cerebrium"/>

-::: vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. diff --git a/docs/source/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md similarity index 84% rename from docs/source/deployment/frameworks/chatbox.md rename to docs/deployment/frameworks/chatbox.md index e62f4647150f..10da2fc71002 100644 --- a/docs/source/deployment/frameworks/chatbox.md +++ b/docs/deployment/frameworks/chatbox.md @@ -1,6 +1,7 @@ -(deployment-chatbox)= - -# Chatbox +--- +title: Chatbox +--- +[](){ #deployment-chatbox } [Chatbox](https://github.com/chatboxai/chatbox) is a desktop client for LLMs, available on Windows, Mac, Linux. @@ -27,10 +28,8 @@ vllm serve qwen/Qwen1.5-0.5B-Chat - API Path: `/chat/completions` - Model: `qwen/Qwen1.5-0.5B-Chat` -:::{image} /assets/deployment/chatbox-settings.png -::: +![](../../assets/deployment/chatbox-settings.png) - Go to `Just chat`, and start to chat: -:::{image} /assets/deployment/chatbox-chat.png -::: +![](../../assets/deployment/chatbox-chat.png) diff --git a/docs/source/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md similarity index 90% rename from docs/source/deployment/frameworks/dify.md rename to docs/deployment/frameworks/dify.md index 5cdf6a387637..886484b54347 100644 --- a/docs/source/deployment/frameworks/dify.md +++ b/docs/deployment/frameworks/dify.md @@ -1,6 +1,7 @@ -(deployment-dify)= - -# Dify +--- +title: Dify +--- +[](){ #deployment-dify } [Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production. @@ -42,15 +43,12 @@ docker compose up -d - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat` - **Completion Mode**: `Completion` -:::{image} /assets/deployment/dify-settings.png -::: +![](../../assets/deployment/dify-settings.png) - To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type: -:::{image} /assets/deployment/dify-create-chatbot.png -::: +![](../../assets/deployment/dify-create-chatbot.png) - Click the chatbot you just created to open the chat interface and start interacting with the model: -:::{image} /assets/deployment/dify-chat.png -::: +![](../../assets/deployment/dify-chat.png) diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md similarity index 83% rename from docs/source/deployment/frameworks/dstack.md rename to docs/deployment/frameworks/dstack.md index a16e28f2d898..7de92855745b 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/deployment/frameworks/dstack.md @@ -1,12 +1,11 @@ -(deployment-dstack)= +--- +title: dstack +--- +[](){ #deployment-dstack } -# dstack - -:::{raw} html

-    <img src="…" alt="vLLM_plus_dstack"/>

-::: vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. @@ -97,6 +96,5 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -:::{note} -dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) -::: +!!! note + dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) diff --git a/docs/deployment/frameworks/helm.md b/docs/deployment/frameworks/helm.md new file mode 100644 index 000000000000..192b90438acf --- /dev/null +++ b/docs/deployment/frameworks/helm.md @@ -0,0 +1,95 @@ +--- +title: Helm +--- +[](){ #deployment-helm } + +A Helm chart to deploy vLLM for Kubernetes + +Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. + +This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. + +## Prerequisites + +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) +- Available GPU resources in your cluster +- S3 with the model which will be deployed + +## Installing the chart + +To install the chart with the release name `test-vllm`: + +```console +helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +``` + +## Uninstalling the Chart + +To uninstall the `test-vllm` deployment: + +```console +helm uninstall test-vllm --namespace=ns-vllm +``` + +The command removes all the Kubernetes components associated with the +chart **including persistent volumes** and deletes the release. 
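Once the chart is installed (and before uninstalling it), a quick way to sanity-check the deployment is to send a request to the OpenAI-compatible endpoint exposed by the service. The host, port, and model name below are placeholders that depend on your `values.yaml` (the default `image.command` serves the model as `opt-125m`); you may need `kubectl port-forward` or the service's in-cluster address to reach it.

```python
from openai import OpenAI

# Adjust the base URL to however you expose the Service
# (e.g. `kubectl port-forward svc/<serviceName> 8000:80`).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="opt-125m",  # must match --served-model-name in image.command
    prompt="Hello, my name is",
    max_tokens=16,
)
print(completion.choices[0].text)
```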
+ +## Architecture + +![](../../assets/deployment/architecture_helm_deployment.png) + +## Values + +| Key | Type | Default | Description | +|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------| +| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration | +| autoscaling.enabled | bool | false | Enable autoscaling | +| autoscaling.maxReplicas | int | 100 | Maximum replicas | +| autoscaling.minReplicas | int | 1 | Minimum replicas | +| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling | +| configs | object | {} | Configmap | +| containerPort | int | 8000 | Container port | +| customObjects | list | [] | Custom Objects configuration | +| deploymentStrategy | object | {} | Deployment strategy configuration | +| externalConfigs | list | [] | External configuration | +| extraContainers | list | [] | Additional containers configuration | +| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container | +| extraInit.pvcStorage | string | "50Gi" | Storage size of the s3 | +| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files | +| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service | +| extraPorts | list | [] | Additional ports configuration | +| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used | +| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration | +| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command | +| image.repository | string | "vllm/vllm-openai" | Image repository | +| image.tag | string | "latest" | Image tag | +| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration | +| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive | +| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server | +| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server | +| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening | +| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated | +| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe | +| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration | +| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe 
configuration | +| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready | +| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server | +| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server | +| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening | +| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated | +| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe | +| replicaCount | int | 1 | Number of replicas | +| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration | +| resources.limits."nvidia.com/gpu" | int | 1 | Number of gpus used | +| resources.limits.cpu | int | 4 | Number of CPUs | +| resources.limits.memory | string | "16Gi" | CPU memory configuration | +| resources.requests."nvidia.com/gpu" | int | 1 | Number of gpus used | +| resources.requests.cpu | int | 4 | Number of CPUs | +| resources.requests.memory | string | "16Gi" | CPU memory configuration | +| secrets | object | {} | Secrets configuration | +| serviceName | string | Service name | | +| servicePort | int | 80 | Service port | +| labels.environment | string | test | Environment name | diff --git a/docs/source/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md similarity index 97% rename from docs/source/deployment/frameworks/litellm.md rename to docs/deployment/frameworks/litellm.md index 6dd3607ca5e3..3011cde83018 100644 --- a/docs/source/deployment/frameworks/litellm.md +++ b/docs/deployment/frameworks/litellm.md @@ -1,6 +1,7 @@ -(deployment-litellm)= - -# LiteLLM +--- +title: LiteLLM +--- +[](){ #deployment-litellm } [LiteLLM](https://github.com/BerriAI/litellm) call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.] diff --git a/docs/source/deployment/frameworks/lobe-chat.md b/docs/deployment/frameworks/lobe-chat.md similarity index 89% rename from docs/source/deployment/frameworks/lobe-chat.md rename to docs/deployment/frameworks/lobe-chat.md index 6d86b7fa9cce..cd95c028155e 100644 --- a/docs/source/deployment/frameworks/lobe-chat.md +++ b/docs/deployment/frameworks/lobe-chat.md @@ -1,6 +1,7 @@ -(deployment-lobe-chat)= - -# Lobe Chat +--- +title: Lobe Chat +--- +[](){ #deployment-lobe-chat } [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework. diff --git a/docs/source/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md similarity index 99% rename from docs/source/deployment/frameworks/lws.md rename to docs/deployment/frameworks/lws.md index 4e9a03b5c4c1..18282a89ddff 100644 --- a/docs/source/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -1,6 +1,7 @@ -(deployment-lws)= - -# LWS +--- +title: LWS +--- +[](){ #deployment-lws } LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference. 
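
Once the `test-vllm` release from the Helm chart above is healthy, the pod serves the standard vLLM OpenAI-compatible API on `containerPort` 8000, exposed through the Kubernetes service on `servicePort` 80. Below is a minimal sketch of querying it from inside the cluster with the `openai` Python client; the in-cluster base URL is hypothetical (substitute your actual service name and namespace), and the model name follows the default `image.command` from the values table (`--served-model-name opt-125m`).

```python
# Minimal sketch: query the Helm-deployed vLLM OpenAI-compatible endpoint.
# The base_url below is an assumption; substitute the real service name,
# namespace, and servicePort of your release.
from openai import OpenAI

client = OpenAI(
    base_url="http://test-vllm.ns-vllm.svc.cluster.local:80/v1",  # hypothetical in-cluster URL
    api_key="EMPTY",  # vLLM ignores the key unless the server was started with --api-key
)

completion = client.completions.create(
    model="opt-125m",  # matches --served-model-name in the default image.command
    prompt="Hello, my name is",
    max_tokens=16,
)
print(completion.choices[0].text)
```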
diff --git a/docs/source/deployment/frameworks/modal.md b/docs/deployment/frameworks/modal.md similarity index 85% rename from docs/source/deployment/frameworks/modal.md rename to docs/deployment/frameworks/modal.md index e7c42088e36a..dbdb739a1000 100644 --- a/docs/source/deployment/frameworks/modal.md +++ b/docs/deployment/frameworks/modal.md @@ -1,6 +1,7 @@ -(deployment-modal)= - -# Modal +--- +title: Modal +--- +[](){ #deployment-modal } vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling. diff --git a/docs/source/deployment/frameworks/open-webui.md b/docs/deployment/frameworks/open-webui.md similarity index 87% rename from docs/source/deployment/frameworks/open-webui.md rename to docs/deployment/frameworks/open-webui.md index 83e5303a00ef..1ab1931068fa 100644 --- a/docs/source/deployment/frameworks/open-webui.md +++ b/docs/deployment/frameworks/open-webui.md @@ -1,6 +1,7 @@ -(deployment-open-webui)= - -# Open WebUI +--- +title: Open WebUI +--- +[](){ #deployment-open-webui } 1. Install the [Docker](https://docs.docker.com/engine/install/) @@ -25,5 +26,4 @@ ghcr.io/open-webui/open-webui:main On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`. -:::{image} /assets/deployment/open_webui.png -::: +![](../../assets/deployment/open_webui.png) diff --git a/docs/source/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md similarity index 96% rename from docs/source/deployment/frameworks/retrieval_augmented_generation.md rename to docs/deployment/frameworks/retrieval_augmented_generation.md index f84451fafe91..cb26c8378dee 100644 --- a/docs/source/deployment/frameworks/retrieval_augmented_generation.md +++ b/docs/deployment/frameworks/retrieval_augmented_generation.md @@ -1,6 +1,7 @@ -(deployment-retrieval-augmented-generation)= - -# Retrieval-Augmented Generation +--- +title: Retrieval-Augmented Generation +--- +[](){ #deployment-retrieval-augmented-generation } [Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources. diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md similarity index 97% rename from docs/source/deployment/frameworks/skypilot.md rename to docs/deployment/frameworks/skypilot.md index 5e101b900103..1844a50c5604 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -1,12 +1,11 @@ -(deployment-skypilot)= +--- +title: SkyPilot +--- +[](){ #deployment-skypilot } -# SkyPilot - -:::{raw} html

-::: vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). @@ -104,10 +103,8 @@ service: max_completion_tokens: 1 ``` -:::{raw} html
Click to see the full recipe YAML -::: ```yaml service: @@ -153,9 +150,7 @@ run: | 2>&1 | tee api_server.log ``` -:::{raw} html
-::: Start serving the Llama-3 8B model on multiple replicas: @@ -169,10 +164,8 @@ Wait until the service is ready: watch -n10 sky serve status vllm ``` -:::{raw} html
Example outputs: -::: ```console Services @@ -185,9 +178,7 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 ``` -:::{raw} html
-::: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: @@ -223,10 +214,8 @@ service: This will scale the service up when the QPS exceeds 2 for each replica. -:::{raw} html
Click to see the full recipe YAML -::: ```yaml service: @@ -275,9 +264,7 @@ run: | 2>&1 | tee api_server.log ``` -:::{raw} html
-::: To update the service with the new config: @@ -295,10 +282,8 @@ sky serve down vllm It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests sent to the GUI will be load-balanced across replicas. -:::{raw} html
Click to see the full GUI YAML -::: ```yaml envs: @@ -328,9 +313,7 @@ run: | --stop-token-ids 128009,128001 | tee ~/gradio.log ``` -:::{raw} html
-::: 1. Start the chat web UI: diff --git a/docs/source/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md similarity index 91% rename from docs/source/deployment/frameworks/streamlit.md rename to docs/deployment/frameworks/streamlit.md index 084550ec991e..8956d1ddc7d8 100644 --- a/docs/source/deployment/frameworks/streamlit.md +++ b/docs/deployment/frameworks/streamlit.md @@ -1,6 +1,7 @@ -(deployment-streamlit)= - -# Streamlit +--- +title: Streamlit +--- +[](){ #deployment-streamlit } [Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps. @@ -38,5 +39,4 @@ VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run stream streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug ``` -:::{image} /assets/deployment/streamlit-chat.png -::: +![](../../assets/deployment/streamlit-chat.png) diff --git a/docs/source/deployment/frameworks/triton.md b/docs/deployment/frameworks/triton.md similarity index 87% rename from docs/source/deployment/frameworks/triton.md rename to docs/deployment/frameworks/triton.md index 94d87120159c..082bc24d85aa 100644 --- a/docs/source/deployment/frameworks/triton.md +++ b/docs/deployment/frameworks/triton.md @@ -1,5 +1,6 @@ -(deployment-triton)= - -# NVIDIA Triton +--- +title: NVIDIA Triton +--- +[](){ #deployment-triton } The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. diff --git a/docs/source/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md similarity index 85% rename from docs/source/deployment/integrations/kserve.md rename to docs/deployment/integrations/kserve.md index c780fd74e8f5..754b983dee92 100644 --- a/docs/source/deployment/integrations/kserve.md +++ b/docs/deployment/integrations/kserve.md @@ -1,6 +1,7 @@ -(deployment-kserve)= - -# KServe +--- +title: KServe +--- +[](){ #deployment-kserve } vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. diff --git a/docs/source/deployment/integrations/kubeai.md b/docs/deployment/integrations/kubeai.md similarity index 93% rename from docs/source/deployment/integrations/kubeai.md rename to docs/deployment/integrations/kubeai.md index 2f5772e075d8..ba0a3c52cca7 100644 --- a/docs/source/deployment/integrations/kubeai.md +++ b/docs/deployment/integrations/kubeai.md @@ -1,6 +1,7 @@ -(deployment-kubeai)= - -# KubeAI +--- +title: KubeAI +--- +[](){ #deployment-kubeai } [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. 
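
The Streamlit recipe above reduces to a short script that reads `VLLM_API_BASE` and relays chat turns to the vLLM OpenAI-compatible server. Below is a minimal sketch of that pattern; it is an illustration under the assumption that the server is reachable at the configured base URL, not a copy of the shipped `streamlit_openai_chatbot_webserver.py`.

```python
# Minimal Streamlit chatbot sketch against a vLLM OpenAI-compatible server.
# Illustrative only; vLLM ships a fuller example as
# streamlit_openai_chatbot_webserver.py.
import os

import streamlit as st
from openai import OpenAI

client = OpenAI(
    base_url=os.getenv("VLLM_API_BASE", "http://localhost:8000/v1"),
    api_key="EMPTY",
)

st.title("vLLM chat demo")

if "history" not in st.session_state:
    st.session_state.history = []

# Replay earlier turns so the conversation stays visible across reruns.
for msg in st.session_state.history:
    st.chat_message(msg["role"]).write(msg["content"])

if prompt := st.chat_input("Ask something"):
    st.session_state.history.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)

    # Ask the server which model it is serving instead of hard-coding a name.
    model = client.models.list().data[0].id
    reply = client.chat.completions.create(
        model=model,
        messages=st.session_state.history,
    ).choices[0].message.content

    st.session_state.history.append({"role": "assistant", "content": reply})
    st.chat_message("assistant").write(reply)
```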
diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md similarity index 94% rename from docs/source/deployment/integrations/llamastack.md rename to docs/deployment/integrations/llamastack.md index a6c3569637ab..2ae600a423ff 100644 --- a/docs/source/deployment/integrations/llamastack.md +++ b/docs/deployment/integrations/llamastack.md @@ -1,6 +1,7 @@ -(deployment-llamastack)= - -# Llama Stack +--- +title: Llama Stack +--- +[](){ #deployment-llamastack } vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . diff --git a/docs/source/deployment/integrations/llmaz.md b/docs/deployment/integrations/llmaz.md similarity index 87% rename from docs/source/deployment/integrations/llmaz.md rename to docs/deployment/integrations/llmaz.md index cd4a76353d26..03d284c34769 100644 --- a/docs/source/deployment/integrations/llmaz.md +++ b/docs/deployment/integrations/llmaz.md @@ -1,6 +1,7 @@ -(deployment-llmaz)= - -# llmaz +--- +title: llmaz +--- +[](){ #deployment-llmaz } [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend. diff --git a/docs/source/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md similarity index 98% rename from docs/source/deployment/integrations/production-stack.md rename to docs/deployment/integrations/production-stack.md index 05f1568306cc..8288a4b6e6be 100644 --- a/docs/source/deployment/integrations/production-stack.md +++ b/docs/deployment/integrations/production-stack.md @@ -1,6 +1,7 @@ -(deployment-production-stack)= - -# Production stack +--- +title: Production stack +--- +[](){ #deployment-production-stack } Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using the [vLLM production stack](https://github.com/vllm-project/production-stack). Born out of a Berkeley-UChicago collaboration, [vLLM production stack](https://github.com/vllm-project/production-stack) is an officially released, production-optimized codebase under the [vLLM project](https://github.com/vllm-project), designed for LLM deployment with: @@ -114,7 +115,7 @@ To remove the deployment, run: sudo helm uninstall vllm ``` ------- +--- ### (Advanced) Configuring vLLM production stack diff --git a/docs/source/deployment/k8s.md b/docs/deployment/k8s.md similarity index 98% rename from docs/source/deployment/k8s.md rename to docs/deployment/k8s.md index 9079cfa8e1b6..bd2bd44cd522 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -1,6 +1,7 @@ -(deployment-k8s)= - -# Using Kubernetes +--- +title: Using Kubernetes +--- +[](){ #deployment-k8s } Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. @@ -19,9 +20,8 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: ## Deployment with CPUs -:::{note} -The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs. -::: +!!! note + The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs. 
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model: diff --git a/docs/source/deployment/nginx.md b/docs/deployment/nginx.md similarity index 77% rename from docs/source/deployment/nginx.md rename to docs/deployment/nginx.md index bf404f1098c3..9d1f74475781 100644 --- a/docs/source/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -1,20 +1,21 @@ -(nginxloadbalancer)= - -# Using Nginx +--- +title: Using Nginx +--- +[](){ #nginxloadbalancer } This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. Table of contents: -1. [Build Nginx Container](#nginxloadbalancer-nginx-build) -2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf) -3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container) -4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network) -5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container) -6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx) -7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx) +1. [Build Nginx Container][nginxloadbalancer-nginx-build] +2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf] +3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container] +4. [Create Docker Network][nginxloadbalancer-nginx-docker-network] +5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container] +6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx] +7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx] -(nginxloadbalancer-nginx-build)= +[](){ #nginxloadbalancer-nginx-build } ## Build Nginx Container @@ -39,7 +40,7 @@ Build the container: docker build . -f Dockerfile.nginx --tag nginx-lb ``` -(nginxloadbalancer-nginx-conf)= +[](){ #nginxloadbalancer-nginx-conf } ## Create Simple Nginx Config file @@ -63,7 +64,7 @@ server { } ``` -(nginxloadbalancer-nginx-vllm-container)= +[](){ #nginxloadbalancer-nginx-vllm-container } ## Build vLLM Container @@ -79,7 +80,7 @@ cd $vllm_root docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy ``` -(nginxloadbalancer-nginx-docker-network)= +[](){ #nginxloadbalancer-nginx-docker-network } ## Create Docker Network @@ -87,7 +88,7 @@ docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_prox docker network create vllm_nginx ``` -(nginxloadbalancer-nginx-launch-container)= +[](){ #nginxloadbalancer-nginx-launch-container } ## Launch vLLM Containers @@ -105,11 +106,10 @@ docker run -itd --ipc host --network vllm_nginx --gpus device=0 --shm-size=10.24 docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf ``` -:::{note} -If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. -::: +!!! note + If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. 
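
Before wiring up Nginx, it is worth confirming that both backends started above are actually serving. Below is a minimal sketch using only the Python standard library, assuming the host port mappings `8081` and `8082` from the `docker run` commands; the `/v1/models` endpoint only responds once the model has finished loading.

```python
# Minimal readiness check for the two vLLM containers started above.
# Assumes the host port mappings 8081 and 8082 from the docker run commands.
import json
import time
import urllib.request

backends = [
    "http://localhost:8081/v1/models",
    "http://localhost:8082/v1/models",
]

for url in backends:
    for _ in range(60):  # model loading can take several minutes
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                served = [m["id"] for m in json.load(resp)["data"]]
                print(f"{url} -> ready, serving {served}")
                break
        except OSError:
            time.sleep(10)
    else:
        print(f"{url} -> not ready; check `docker logs vllm0` / `docker logs vllm1`")
```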
-(nginxloadbalancer-nginx-launch-nginx)= +[](){ #nginxloadbalancer-nginx-launch-nginx } ## Launch Nginx @@ -117,7 +117,7 @@ If you are behind proxy, you can pass the proxy settings to the docker run comma docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest ``` -(nginxloadbalancer-nginx-verify-nginx)= +[](){ #nginxloadbalancer-nginx-verify-nginx } ## Verify That vLLM Servers Are Ready diff --git a/docs/source/deployment/security.md b/docs/deployment/security.md similarity index 100% rename from docs/source/deployment/security.md rename to docs/deployment/security.md diff --git a/docs/source/design/arch_overview.md b/docs/design/arch_overview.md similarity index 81% rename from docs/source/design/arch_overview.md rename to docs/design/arch_overview.md index 94bda8b5c58d..75d3e1b7ccc7 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/design/arch_overview.md @@ -1,22 +1,18 @@ -(arch-overview)= - -# Architecture Overview +--- +title: Architecture Overview +--- +[](){ #arch-overview } This document provides an overview of the vLLM architecture. -:::{contents} Table of Contents -:depth: 2 -:local: true -::: +[TOC] ## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png -:alt: Entrypoints Diagram -::: +![Entrypoints Diagram](../assets/design/arch_overview/entrypoints.excalidraw.png) ### LLM Class @@ -77,16 +73,14 @@ python -m vllm.entrypoints.openai.api_server --model That code can be found in . -More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document. +More details on the API server can be found in the [OpenAI-Compatible Server][openai-compatible-server] document. ## LLM Engine The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png -:alt: LLMEngine Diagram -::: +![LLMEngine Diagram](../assets/design/arch_overview/llm_engine.excalidraw.png) ### LLMEngine @@ -137,18 +131,16 @@ input tensors and capturing cudagraphs. ## Model Every model runner object has one model object, which is the actual -`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various +`torch.nn.Module` instance. See [huggingface_integration][huggingface-integration] for how various configurations affect the class we ultimately get. ## Class Hierarchy The following figure shows the class hierarchy of vLLM: -> :::{figure} /assets/design/hierarchy.png -> :align: center -> :alt: query -> :width: 100% -> ::: +>
+> ![](../assets/design/hierarchy.png){ align="center" alt="query" width="100%" } +>
There are several important design choices behind this class hierarchy: @@ -178,44 +170,43 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -:::{note} -To support this change, all vLLM models' signatures have been updated to: - -```python -def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): -``` - -To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: +!!! note + To support this change, all vLLM models' signatures have been updated to: -```python -class MyOldModel(nn.Module): - def __init__( - self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - ... - -from vllm.config import VllmConfig -class MyNewModel(MyOldModel): + ```python def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - super().__init__(config, cache_config, quant_config, lora_config, prefix) - -if __version__ >= "0.6.4": - MyModel = MyNewModel -else: - MyModel = MyOldModel -``` - -This way, the model can work with both old and new versions of vLLM. -::: + ``` + + To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + + ```python + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... + + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + ``` + + This way, the model can work with both old and new versions of vLLM. 3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. 
For example, tensor parallelism needs to shard the diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/design/automatic_prefix_caching.md similarity index 98% rename from docs/source/design/automatic_prefix_caching.md rename to docs/design/automatic_prefix_caching.md index 3928e0c16568..80883bb1d90d 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/design/automatic_prefix_caching.md @@ -1,6 +1,7 @@ -(design-automatic-prefix-caching)= - -# Automatic Prefix Caching +--- +title: Automatic Prefix Caching +--- +[](){ #design-automatic-prefix-caching } The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. diff --git a/docs/source/design/huggingface_integration.md b/docs/design/huggingface_integration.md similarity index 98% rename from docs/source/design/huggingface_integration.md rename to docs/design/huggingface_integration.md index 7d271b1cfb3a..68cc27ea768c 100644 --- a/docs/source/design/huggingface_integration.md +++ b/docs/design/huggingface_integration.md @@ -1,6 +1,7 @@ -(huggingface-integration)= - -# Integration with HuggingFace +--- +title: Integration with HuggingFace +--- +[](){ #huggingface-integration } This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. diff --git a/docs/source/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md similarity index 94% rename from docs/source/design/kernel/paged_attention.md rename to docs/design/kernel/paged_attention.md index e1770c822643..ad8b5c9264d2 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -1,6 +1,7 @@ -(design-paged-attention)= - -# vLLM Paged Attention +--- +title: vLLM Paged Attention +--- +[](){ #design-paged-attention } - Currently, vLLM utilizes its own implementation of a multi-head query attention kernel (`csrc/attention/attention_kernels.cu`). @@ -139,26 +140,22 @@ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` - :::{figure} ../../assets/kernel/query.png - :align: center - :alt: query - :width: 70% - - Query data of one token at one head - ::: +
+ ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" } +
+  <figcaption>Query data of one token at one head</figcaption>
- Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. - :::{figure} ../../assets/kernel/q_vecs.png - :align: center - :alt: q_vecs - :width: 70% - - `q_vecs` for one thread group - ::: +
+ ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" } +
+  <figcaption>`q_vecs` for one thread group</figcaption>
```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -195,13 +192,11 @@ points to key token data based on `k_cache` at assigned block, assigned head and assigned token. - :::{figure} ../../assets/kernel/key.png - :align: center - :alt: key - :width: 70% - - Key data of all context tokens at one head - ::: +
+ ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" } +
+  <figcaption>Key data of all context tokens at one head</figcaption>
- The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -214,13 +209,11 @@ elements for one token) that will be processed by 2 threads (one thread group) separately. - :::{figure} ../../assets/kernel/k_vecs.png - :align: center - :alt: k_vecs - :width: 70% - - `k_vecs` for one thread - ::: +
+ ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" } +
+  <figcaption>`k_vecs` for one thread</figcaption>
```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -289,14 +282,12 @@ should be performed across the entire thread block, encompassing results between the query token and all context key tokens. - :::{math} - :nowrap: true - + $$ \begin{gather*} m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} \end{gather*} - ::: + $$ ### `qk_max` and `logits` @@ -379,29 +370,23 @@ ## Value -:::{figure} ../../assets/kernel/value.png -:align: center -:alt: value -:width: 70% - -Value data of all context tokens at one head -::: - -:::{figure} ../../assets/kernel/logits_vec.png -:align: center -:alt: logits_vec -:width: 50% - -`logits_vec` for one thread -::: - -:::{figure} ../../assets/kernel/v_vec.png -:align: center -:alt: v_vec -:width: 70% - -List of `v_vec` for one thread -::: +
+ ![](../../assets/kernel/value.png){ align="center" alt="value" width="70%" } +
+  <figcaption>Value data of all context tokens at one head</figcaption>
+ ![](../../assets/kernel/logits_vec.png){ align="center" alt="logits_vec" width="50%" } +
+  <figcaption>`logits_vec` for one thread</figcaption>
+ ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" } +
+  <figcaption>List of `v_vec` for one thread</figcaption>
- Now we need to retrieve the value data and perform dot multiplication with `logits`. Unlike query and key, there is no thread group diff --git a/docs/source/design/mm_processing.md b/docs/design/mm_processing.md similarity index 61% rename from docs/source/design/mm_processing.md rename to docs/design/mm_processing.md index dc92a3c2c511..f3685ce76a4b 100644 --- a/docs/source/design/mm_processing.md +++ b/docs/design/mm_processing.md @@ -1,10 +1,11 @@ -(mm-processing)= +--- +title: Multi-Modal Data Processing +--- +[](){ #mm-processing } -# Multi-Modal Data Processing +To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching][automatic-prefix-caching], we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. ``) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor. -To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. ``) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor. - -Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`: +Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]: ## Prompt Update Detection @@ -15,7 +16,7 @@ One of the main responsibilities of HF processor is to update the prompt with pl The information about which tokens have been updated is key to finding the correspondence between placeholder feature tokens and multi-modal inputs. -In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptUpdate` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens. +In vLLM, this information is specified using [PromptUpdate][vllm.multimodal.processing.PromptUpdate] in [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens. ## Tokenized Prompt Inputs @@ -43,22 +44,22 @@ While HF processors support text + multi-modal inputs natively, this is not so f Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other. -(mm-dummy-text)= +[](){ #mm-dummy-text } ### Dummy text -We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. +We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. 
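
To make the dummy-text idea concrete, here is a rough sketch of what a per-model override could look like for a hypothetical image-only model. The method name comes from the paragraph above; the exact signature and class layout are assumptions for illustration, not a copy of the real `BaseDummyInputsBuilder` interface.

```python
# Hypothetical sketch: produce dummy text proportional to the number of
# multi-modal items so the HF processor sees one placeholder per image.
# Signature and base class are assumed for illustration only.
from collections.abc import Mapping


class MyModelDummyInputsBuilder:  # stands in for a BaseDummyInputsBuilder subclass
    IMAGE_TOKEN = "<image>"  # placeholder token of the hypothetical model

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        # One placeholder per expected image; audio or video models would do
        # the same with their own placeholder tokens and modality keys.
        num_images = mm_counts.get("image", 0)
        return self.IMAGE_TOKEN * num_images
```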
-(mm-automatic-prompt-updating)= +[](){ #mm-automatic-prompt-updating } ### Automatic prompt updating We address the second issue by implementing model-agnostic code in -{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates` to automatically update the prompt with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. +[_apply_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates] to automatically update the prompt with feature placeholder tokens based on the specification outputted by [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. ### Summary -With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`. +With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in [_apply_hf_processor_main][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main]. ## Processor Output Caching @@ -66,4 +67,4 @@ Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238) When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache. -Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#mm-automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other. +Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other. diff --git a/docs/source/design/multiprocessing.md b/docs/design/multiprocessing.md similarity index 97% rename from docs/source/design/multiprocessing.md rename to docs/design/multiprocessing.md index 43fe5fe2e5e9..649edfcce69b 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/design/multiprocessing.md @@ -2,14 +2,13 @@ ## Debugging -Please see the [Troubleshooting](#troubleshooting-python-multiprocessing) +Please see the [Troubleshooting][troubleshooting-python-multiprocessing] page for information on known issues and how to solve them. ## Introduction -:::{important} -The source code references are to the state of the code at the time of writing in December, 2024. -::: +!!! 
warning + The source code references are to the state of the code at the time of writing in December, 2024. The use of Python multiprocessing in vLLM is complicated by: diff --git a/docs/source/design/plugin_system.md b/docs/design/plugin_system.md similarity index 86% rename from docs/source/design/plugin_system.md rename to docs/design/plugin_system.md index 225030885f62..5027a35c23e8 100644 --- a/docs/source/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -1,12 +1,13 @@ -(plugin-system)= - -# vLLM's Plugin System +--- +title: vLLM's Plugin System +--- +[](){ #plugin-system } The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. ## How Plugins Work in vLLM -Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview][arch-overview]), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. ## How vLLM Discovers Plugins diff --git a/docs/source/design/v1/metrics.md b/docs/design/v1/metrics.md similarity index 98% rename from docs/source/design/v1/metrics.md rename to docs/design/v1/metrics.md index de8022655372..2631f28e46e4 100644 --- a/docs/source/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -57,7 +57,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` - `vllm:spec_decode_num_draft_tokens_total` (Counter) - `vllm:spec_decode_num_emitted_tokens_total` (Counter) -These are documented under [Inferencing and Serving -> Production Metrics](project:../../serving/metrics.md). +These are documented under [Inferencing and Serving -> Production Metrics](../../serving/metrics.md). ### Grafana Dashboard @@ -222,9 +222,7 @@ And the calculated intervals are: Put another way: -:::{image} /assets/design/v1/metrics/intervals-1.png -:alt: Interval calculations - common case -::: +![Interval calculations - common case](../../assets/design/v1/metrics/intervals-1.png) We explored the possibility of having the frontend calculate these intervals using the timing of events visible by the frontend. However, @@ -239,17 +237,13 @@ When a preemption occurs during decode, since any already generated tokens are reused, we consider the preemption as affecting the inter-token, decode, and inference intervals. 
-:::{image} /assets/design/v1/metrics/intervals-2.png -:alt: Interval calculations - preempted decode -::: +![Interval calculations - preempted decode](../../assets/design/v1/metrics/intervals-2.png) When a preemption occurs during prefill (assuming such an event is possible), we consider the preemption as affecting the time-to-first-token and prefill intervals. -:::{image} /assets/design/v1/metrics/intervals-3.png -:alt: Interval calculations - preempted prefill -::: +![Interval calculations - preempted prefill](../../assets/design/v1/metrics/intervals-3.png) ### Frontend Stats Collection @@ -467,7 +461,7 @@ In general: hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics) for some time before deleting them. -See the [deprecation policy](project:../../contributing/deprecation_policy.md) for +See the [deprecation policy](../../contributing/deprecation_policy.md) for the project-wide deprecation policy. ### Unimplemented - `vllm:tokens_total` diff --git a/docs/source/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md similarity index 94% rename from docs/source/design/v1/prefix_caching.md rename to docs/design/v1/prefix_caching.md index 0f7475777797..ad041b0059f5 100644 --- a/docs/source/design/v1/prefix_caching.md +++ b/docs/design/v1/prefix_caching.md @@ -122,9 +122,7 @@ There are two design points to highlight: As a result, we will have the following components when the KV cache manager is initialized: -:::{image} /assets/design/v1/prefix_caching/overview.png -:alt: Component Overview -::: +![Component Overview](../../assets/design/v1/prefix_caching/overview.png) * Block Pool: A list of KVCacheBlock. * Free Block Queue: Only store the pointers of head and tail blocks for manipulations. @@ -194,9 +192,7 @@ As can be seen, block 3 is a new full block and is cached. However, it is redund When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first. -:::{image} /assets/design/v1/prefix_caching/free.png -:alt: Free Queue after Free a Request -::: +![Free queue after a request us freed](../../assets/design/v1/prefix_caching/free.png) ### Eviction (LRU) @@ -212,36 +208,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens), **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens. -:::{image} /assets/design/v1/prefix_caching/example-time-1.png -:alt: Example Time 1 -::: +![Example Time 1](../../assets/design/v1/prefix_caching/example-time-1.png) **Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4. -:::{image} /assets/design/v1/prefix_caching/example-time-3.png -:alt: Example Time 3 -::: +![Example Time 3](../../assets/design/v1/prefix_caching/example-time-3.png) **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens. 
-:::{image} /assets/design/v1/prefix_caching/example-time-4.png -:alt: Example Time 4 -::: +![Example Time 4](../../assets/design/v1/prefix_caching/example-time-4.png) **Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1. -:::{image} /assets/design/v1/prefix_caching/example-time-5.png -:alt: Example Time 5 -::: +![Example Time 5](../../assets/design/v1/prefix_caching/example-time-5.png) **Time 6: Request 1 is finished and free.** -:::{image} /assets/design/v1/prefix_caching/example-time-6.png -:alt: Example Time 6 -::: +![Example Time 6](../../assets/design/v1/prefix_caching/example-time-6.png) **Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted). -:::{image} /assets/design/v1/prefix_caching/example-time-7.png -:alt: Example Time 7 -::: +![Example Time 7](../../assets/design/v1/prefix_caching/example-time-7.png) diff --git a/docs/source/design/v1/torch_compile.md b/docs/design/v1/torch_compile.md similarity index 100% rename from docs/source/design/v1/torch_compile.md rename to docs/design/v1/torch_compile.md diff --git a/docs/source/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md similarity index 91% rename from docs/source/features/automatic_prefix_caching.md rename to docs/features/automatic_prefix_caching.md index 5c5b37c2a071..5e92796ddda7 100644 --- a/docs/source/features/automatic_prefix_caching.md +++ b/docs/features/automatic_prefix_caching.md @@ -1,14 +1,14 @@ -(automatic-prefix-caching)= - -# Automatic Prefix Caching +--- +title: Automatic Prefix Caching +--- +[](){ #automatic-prefix-caching } ## Introduction Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. -:::{note} -Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). -::: +!!! note + Technical details on how vLLM implements APC can be found [here][design-automatic-prefix-caching]. ## Enabling APC in vLLM diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md new file mode 100644 index 000000000000..77ceea49f173 --- /dev/null +++ b/docs/features/compatibility_matrix.md @@ -0,0 +1,77 @@ +--- +title: Compatibility Matrix +--- +[](){ #compatibility-matrix } + +The tables below show mutually exclusive features and the support on some hardware. + +The symbols used have the following meanings: + +- ✅ = Full compatibility +- 🟠 = Partial compatibility +- ❌ = No compatibility + +!!! note + Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination. 
+ +## Feature x Feature + + + +| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | prmpt adptr | [SD][spec-decode] | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | +|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | +| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | | +| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| prmpt adptr | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | | +| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | +| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | +| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | +| async output | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | +| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | +| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | +| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | +| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | + +[](){ #feature-x-hardware } + +## Feature x Hardware + +| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | +|-----------------------------------------------------------|--------------------|----------|----------|-------|----------|--------------------|-------| +| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [APC][automatic-prefix-caching] | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt adptr | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8475) | ✅ | +| [SD][spec-decode] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | +| pooling | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | +| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| mm | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | +| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/docs/source/features/disagg_prefill.md b/docs/features/disagg_prefill.md similarity index 87% rename from docs/source/features/disagg_prefill.md rename to docs/features/disagg_prefill.md index 2fa20140c086..54be05647d94 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/features/disagg_prefill.md @@ -1,12 +1,12 @@ -(disagg-prefill)= - -# Disaggregated Prefilling (experimental) +--- +title: Disaggregated Prefilling (experimental) +--- +[](){ #disagg-prefill } This page introduces you the disaggregated prefilling feature in vLLM. 
-:::{note} -This feature is experimental and subject to change. -::: +!!! note + This feature is experimental and subject to change. ## Why disaggregated prefilling? @@ -15,9 +15,8 @@ Two main reasons: - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. -:::{note} -Disaggregated prefill DOES NOT improve throughput. -::: +!!! note + Disaggregated prefill DOES NOT improve throughput. ## Usage example @@ -39,21 +38,16 @@ Key abstractions for disaggregated prefilling: - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer. - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. -:::{note} -`insert` is non-blocking operation but `drop_select` is blocking operation. -::: +!!! note + `insert` is non-blocking operation but `drop_select` is blocking operation. Here is a figure illustrating how the above 3 abstractions are organized: -:::{image} /assets/features/disagg_prefill/abstraction.jpg -:alt: Disaggregated prefilling abstractions -::: +![Disaggregated prefilling abstractions](../assets/features/disagg_prefill/abstraction.jpg) The workflow of disaggregated prefilling is as follows: -:::{image} /assets/features/disagg_prefill/overview.jpg -:alt: Disaggregated prefilling workflow -::: +![Disaggregated prefilling workflow](../assets/features/disagg_prefill/overview.jpg) The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. diff --git a/docs/source/features/lora.md b/docs/features/lora.md similarity index 96% rename from docs/source/features/lora.md rename to docs/features/lora.md index 5a3ce0c01f3f..642462f7c455 100644 --- a/docs/source/features/lora.md +++ b/docs/features/lora.md @@ -1,10 +1,11 @@ -(lora-adapter)= - -# LoRA Adapters +--- +title: LoRA Adapters +--- +[](){ #lora-adapter } This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model. -LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`. +LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA]. Adapters can be efficiently served on a per request basis with minimal overhead. 
First we download the adapter(s) and save them locally with @@ -60,9 +61,8 @@ vllm serve meta-llama/Llama-2-7b-hf \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ ``` -:::{note} -The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. -::: +!!! note + The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along diff --git a/docs/source/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md similarity index 84% rename from docs/source/features/multimodal_inputs.md rename to docs/features/multimodal_inputs.md index bb2997f008ed..19b668172902 100644 --- a/docs/source/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -1,20 +1,20 @@ -(multimodal-inputs)= +--- +title: Multimodal Inputs +--- +[](){ #multimodal-inputs } -# Multimodal Inputs +This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM. -This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. - -:::{note} -We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, -and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. -::: +!!! note + We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, + and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. ## Offline Inference -To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: +To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: - `prompt`: The prompt should follow the format that is documented on HuggingFace. -- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`. +- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][]. ### Image Inputs @@ -211,16 +211,15 @@ for o in outputs: Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). -:::{important} -A chat template is **required** to use Chat Completions API. -For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`. +!!! warning + A chat template is **required** to use Chat Completions API. + For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`. -If no default chat template is available, we will first look for a built-in fallback in . -If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument. + If no default chat template is available, we will first look for a built-in fallback in . 
+ If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument. -For certain models, we provide alternative chat templates inside . -For example, VLM2Vec uses which is different from the default one for Phi-3-Vision. -::: + For certain models, we provide alternative chat templates inside . + For example, VLM2Vec uses which is different from the default one for Phi-3-Vision. ### Image Inputs @@ -284,25 +283,21 @@ print("Chat completion output:", chat_response.choices[0].message.content) Full example: -:::{tip} -Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, -and pass the file path as `url` in the API request. -::: - -:::{tip} -There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. -In fact, you can place image placeholders in the middle of the text by interleaving text and image content. -::: +!!! tip + Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, + and pass the file path as `url` in the API request. -:::{note} -By default, the timeout for fetching images through HTTP URL is `5` seconds. -You can override this by setting the environment variable: +!!! tip + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. -```console -export VLLM_IMAGE_FETCH_TIMEOUT= -``` +!!! note + By default, the timeout for fetching images through HTTP URL is `5` seconds. + You can override this by setting the environment variable: -::: + ```console + export VLLM_IMAGE_FETCH_TIMEOUT= + ``` ### Video Inputs @@ -357,15 +352,13 @@ print("Chat completion output from image url:", result) Full example: -:::{note} -By default, the timeout for fetching videos through HTTP URL is `30` seconds. -You can override this by setting the environment variable: +!!! note + By default, the timeout for fetching videos through HTTP URL is `30` seconds. + You can override this by setting the environment variable: -```console -export VLLM_VIDEO_FETCH_TIMEOUT= -``` - -::: + ```console + export VLLM_VIDEO_FETCH_TIMEOUT= + ``` ### Audio Inputs @@ -461,15 +454,13 @@ print("Chat completion output from audio url:", result) Full example: -:::{note} -By default, the timeout for fetching audios through HTTP URL is `10` seconds. -You can override this by setting the environment variable: - -```console -export VLLM_AUDIO_FETCH_TIMEOUT= -``` +!!! note + By default, the timeout for fetching audios through HTTP URL is `10` seconds. + You can override this by setting the environment variable: -::: + ```console + export VLLM_AUDIO_FETCH_TIMEOUT= + ``` ### Embedding Inputs @@ -535,7 +526,6 @@ chat_completion = client.chat.completions.create( ) ``` -:::{note} -Only one message can contain `{"type": "image_embeds"}`. -If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc. -::: +!!! note + Only one message can contain `{"type": "image_embeds"}`. + If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. 
`image_grid_thw`, `image_sizes`, etc. diff --git a/docs/source/features/prompt_embeds.md b/docs/features/prompt_embeds.md similarity index 92% rename from docs/source/features/prompt_embeds.md rename to docs/features/prompt_embeds.md index 9d7b242bbe51..6f5616e05d8c 100644 --- a/docs/source/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -6,13 +6,12 @@ This page teaches you how to pass prompt embedding inputs to vLLM. The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary. -:::{note} -Prompt embeddings are currently only supported in the v0 engine. -::: +!!! note + Prompt embeddings are currently only supported in the v0 engine. ## Offline Inference -To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPrompt`: +To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: - `prompt_embeds`: A torch tensor representing a sequence of prompt/token embeddings. This has the shape (sequence_length, hidden_size), where sequence length is the number of tokens embeddings and hidden_size is the hidden size (embedding size) of the model. diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md new file mode 100644 index 000000000000..71f62065f63d --- /dev/null +++ b/docs/features/quantization/README.md @@ -0,0 +1,22 @@ +--- +title: Quantization +--- +[](){ #quantization-index } + +Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. + +Contents: + +- [Supported_Hardware](supported_hardware.md) +- [Auto_Awq](auto_awq.md) +- [Bnb](bnb.md) +- [Bitblas](bitblas.md) +- [Gguf](gguf.md) +- [Gptqmodel](gptqmodel.md) +- [Int4](int4.md) +- [Int8](int8.md) +- [Fp8](fp8.md) +- [Modelopt](modelopt.md) +- [Quark](quark.md) +- [Quantized_Kvcache](quantized_kvcache.md) +- [Torchao](torchao.md) diff --git a/docs/source/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md similarity index 98% rename from docs/source/features/quantization/auto_awq.md rename to docs/features/quantization/auto_awq.md index b4ac597f5a79..5879b3126fa6 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -1,6 +1,7 @@ -(auto-awq)= - -# AutoAWQ +--- +title: AutoAWQ +--- +[](){ #auto-awq } To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint. diff --git a/docs/source/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md similarity index 76% rename from docs/source/features/quantization/bitblas.md rename to docs/features/quantization/bitblas.md index d0b2bf858c9b..8e9cf67a7a69 100644 --- a/docs/source/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -1,14 +1,14 @@ -(bitblas)= - -# BitBLAS +--- +title: BitBLAS +--- +[](){ #bitblas } vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. 
Compared to other quantization frameworks, BitBLAS provides more precision combinations. -:::{note} -Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). -Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. -For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). -::: +!!! note + Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). + Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. + For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). Below are the steps to utilize BitBLAS with vLLM. diff --git a/docs/source/features/quantization/bnb.md b/docs/features/quantization/bnb.md similarity index 97% rename from docs/source/features/quantization/bnb.md rename to docs/features/quantization/bnb.md index 1843a33a3dfd..990ac34eb2fd 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -1,6 +1,7 @@ -(bits-and-bytes)= - -# BitsAndBytes +--- +title: BitsAndBytes +--- +[](){ #bits-and-bytes } vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference. BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. diff --git a/docs/source/features/quantization/fp8.md b/docs/features/quantization/fp8.md similarity index 88% rename from docs/source/features/quantization/fp8.md rename to docs/features/quantization/fp8.md index cb304d54726c..01d5d9da046d 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -1,6 +1,7 @@ -(fp8)= - -# FP8 W8A8 +--- +title: FP8 W8A8 +--- +[](){ #fp8 } vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. @@ -14,10 +15,9 @@ The FP8 types typically supported in hardware have two distinct representations, - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. -:::{note} -FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). -FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. -::: +!!! note + FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). + FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. ## Installation @@ -94,9 +94,8 @@ print(result[0].outputs[0].text) Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): -:::{note} -Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. -::: +!!! note + Quantized models can be sensitive to the presence of the `bos` token. 
`lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. ```console $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic @@ -133,6 +132,5 @@ result = model.generate("Hello, my name is") print(result[0].outputs[0].text) ``` -:::{warning} -Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. -::: +!!! warning + Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. diff --git a/docs/source/features/quantization/gguf.md b/docs/features/quantization/gguf.md similarity index 76% rename from docs/source/features/quantization/gguf.md rename to docs/features/quantization/gguf.md index e93e4dcd3b57..04ab5945e8f6 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -1,14 +1,13 @@ -(gguf)= +--- +title: GGUF +--- +[](){ #gguf } -# GGUF +!!! warning + Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. -:::{warning} -Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. -::: - -:::{warning} -Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. -::: +!!! warning + Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: @@ -25,9 +24,8 @@ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inferen vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` -:::{warning} -We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. -::: +!!! warning + We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. GGUF assumes that huggingface can convert the metadata to a config file. 
In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md similarity index 98% rename from docs/source/features/quantization/gptqmodel.md rename to docs/features/quantization/gptqmodel.md index 9771d5a4fe9e..10660a408fd2 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -1,6 +1,7 @@ -(gptqmodel)= - -# GPTQModel +--- +title: GPTQModel +--- +[](){ #gptqmodel } To create a new 4-bit or 8-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI. diff --git a/docs/source/features/quantization/int4.md b/docs/features/quantization/int4.md similarity index 94% rename from docs/source/features/quantization/int4.md rename to docs/features/quantization/int4.md index 7a0ab4ad229e..b7d09206365f 100644 --- a/docs/source/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -1,14 +1,14 @@ -(int4)= - -# INT4 W4A16 +--- +title: INT4 W4A16 +--- +[](){ #int4 } vLLM supports quantizing weights to INT4 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size and maintaining low latency in workloads with low queries per second (QPS). Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int4-llms-for-vllm-668ec34bf3c9fa45f857df2c). -:::{note} -INT4 computation is supported on NVIDIA GPUs with compute capability > 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell). -::: +!!! note + INT4 computation is supported on NVIDIA GPUs with compute capability > 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell). ## Prerequisites @@ -121,9 +121,8 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -:::{note} -Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. -::: +!!! note + Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. ## Best Practices diff --git a/docs/source/features/quantization/int8.md b/docs/features/quantization/int8.md similarity index 92% rename from docs/source/features/quantization/int8.md rename to docs/features/quantization/int8.md index 1e4b01d35575..1d9fba9dc87f 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -1,15 +1,15 @@ -(int8)= - -# INT8 W8A8 +--- +title: INT8 W8A8 +--- +[](){ #int8 } vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size while maintaining good performance. Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). -:::{note} -INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell). -::: +!!! note + INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell). 
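+
+As a quick sanity check before the full walkthrough below, an already-quantized INT8 checkpoint can be loaded directly. This is a minimal sketch; the checkpoint name is only an illustrative stand-in for any model from the collection linked above.
+
+```python
+from vllm import LLM, SamplingParams
+
+# Illustrative pre-quantized INT8 (W8A8) checkpoint; replace it with any model
+# from the linked collection or one produced by the steps below.
+llm = LLM(model="neuralmagic/Meta-Llama-3-8B-Instruct-quantized.w8a8")
+
+outputs = llm.generate(["Hello, my name is"],
+                       SamplingParams(temperature=0.0, max_tokens=32))
+print(outputs[0].outputs[0].text)
+```
+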
## Prerequisites @@ -125,9 +125,8 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -:::{note} -Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. -::: +!!! note + Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. ## Best Practices diff --git a/docs/source/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md similarity index 100% rename from docs/source/features/quantization/modelopt.md rename to docs/features/quantization/modelopt.md diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md similarity index 98% rename from docs/source/features/quantization/quantized_kvcache.md rename to docs/features/quantization/quantized_kvcache.md index 86e6354ec82e..e3ebd024bab3 100644 --- a/docs/source/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -1,6 +1,7 @@ -(quantized-kvcache)= - -# Quantized KV Cache +--- +title: Quantized KV Cache +--- +[](){ #quantized-kvcache } ## FP8 KV Cache diff --git a/docs/source/features/quantization/quark.md b/docs/features/quantization/quark.md similarity index 94% rename from docs/source/features/quantization/quark.md rename to docs/features/quantization/quark.md index 955890dbc75b..51da98cc09d3 100644 --- a/docs/source/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -1,6 +1,7 @@ -(quark)= - -# AMD QUARK +--- +title: AMD QUARK +--- +[](){ #quark } Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve throughput while with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/), @@ -86,13 +87,12 @@ We need to set the quantization configuration, you can check for further details. Here we use FP8 per-tensor quantization on weight, activation, kv-cache and the quantization algorithm is AutoSmoothQuant. -:::{note} -Note the quantization algorithm needs a JSON config file and the config file is located in -[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html), -under the directory `examples/torch/language_modeling/llm_ptq/models`. For example, -AutoSmoothQuant config file for Llama is -`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. -::: +!!! note + Note the quantization algorithm needs a JSON config file and the config file is located in + [Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html), + under the directory `examples/torch/language_modeling/llm_ptq/models`. For example, + AutoSmoothQuant config file for Llama is + `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. 
```python from quark.torch.quantization import (Config, QuantizationConfig, diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md new file mode 100644 index 000000000000..2967bf9c7504 --- /dev/null +++ b/docs/features/quantization/supported_hardware.md @@ -0,0 +1,28 @@ +--- +title: Supported Hardware +--- +[](){ #quantization-supported-hardware } + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Inferentia | Google TPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- ✅︎ indicates that the quantization method is supported on the specified hardware. +- ❌ indicates that the quantization method is not supported on the specified hardware. + +!!! note + This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + + For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. diff --git a/docs/source/features/quantization/torchao.md b/docs/features/quantization/torchao.md similarity index 100% rename from docs/source/features/quantization/torchao.md rename to docs/features/quantization/torchao.md diff --git a/docs/source/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md similarity index 97% rename from docs/source/features/reasoning_outputs.md rename to docs/features/reasoning_outputs.md index bf4f8901a11a..85464269efac 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -1,6 +1,7 @@ -(reasoning-outputs)= - -# Reasoning Outputs +--- +title: Reasoning Outputs +--- +[](){ #reasoning-outputs } vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. @@ -17,10 +18,9 @@ vLLM currently supports the following reasoning models: | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | -:::{note} -IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. -The reasoning feature for the Qwen3 series is enabled by default. 
To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. -::: +!!! note + IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. + The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. ## Quickstart @@ -167,12 +167,10 @@ client = OpenAI( models = client.models.list() model = models.data[0].id - class People(BaseModel): name: str age: int - json_schema = People.model_json_schema() prompt = ("Generate a JSON with the name and age of one random person.") diff --git a/docs/source/features/spec_decode.md b/docs/features/spec_decode.md similarity index 93% rename from docs/source/features/spec_decode.md rename to docs/features/spec_decode.md index f16e0d96522d..dce87c27896c 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -1,16 +1,15 @@ -(spec-decode)= +--- +title: Speculative Decoding +--- +[](){ #spec-decode } -# Speculative Decoding +!!! warning + Please note that speculative decoding in vLLM is not yet optimized and does + not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. + The work to optimize it is ongoing and can be followed here: -:::{warning} -Please note that speculative decoding in vLLM is not yet optimized and does -not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. -The work to optimize it is ongoing and can be followed here: -::: - -:::{warning} -Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. -::: +!!! warning + Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. @@ -51,9 +50,8 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}' ``` -:::{warning} -Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. -::: +!!! warning + Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now. Then use a client: @@ -255,7 +253,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). + titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs][faq]. 
While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -264,7 +262,7 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs][faq]. ## Resources for vLLM contributors diff --git a/docs/source/features/structured_outputs.md b/docs/features/structured_outputs.md similarity index 96% rename from docs/source/features/structured_outputs.md rename to docs/features/structured_outputs.md index 03119ec7441c..f96b598cff98 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -1,6 +1,7 @@ -(structured-outputs)= - -# Structured Outputs +--- +title: Structured Outputs +--- +[](){ #structured-outputs } vLLM supports the generation of structured outputs using [xgrammar](https://github.com/mlc-ai/xgrammar) or @@ -20,7 +21,7 @@ The following parameters are supported, which must be added as extra parameters: - `guided_grammar`: the output will follow the context free grammar. - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. -You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page. +You can see the complete list of supported parameters on the [OpenAI-Compatible Server][openai-compatible-server] page. Structured outputs are supported by default in the OpenAI-Compatible Server. You may choose to specify the backend to use by setting the @@ -83,13 +84,11 @@ class CarType(str, Enum): truck = "Truck" coupe = "Coupe" - class CarDescription(BaseModel): brand: str model: str car_type: CarType - json_schema = CarDescription.model_json_schema() completion = client.chat.completions.create( @@ -105,11 +104,10 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -:::{tip} -While not strictly necessary, normally it´s better to indicate in the prompt the -JSON schema and how the fields should be populated. This can improve the -results notably in most cases. -::: +!!! tip + While not strictly necessary, normally it´s better to indicate in the prompt the + JSON schema and how the fields should be populated. This can improve the + results notably in most cases. Finally we have the `guided_grammar` option, which is probably the most difficult to use, but it´s really powerful. 
It allows us to define complete @@ -160,12 +158,10 @@ Here is a simple example demonstrating how to get structured output using Pydant from pydantic import BaseModel from openai import OpenAI - class Info(BaseModel): name: str age: int - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") completion = client.beta.chat.completions.parse( model="meta-llama/Llama-3.1-8B-Instruct", @@ -199,17 +195,14 @@ from typing import List from pydantic import BaseModel from openai import OpenAI - class Step(BaseModel): explanation: str output: str - class MathResponse(BaseModel): steps: list[Step] final_answer: str - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") completion = client.beta.chat.completions.parse( model="meta-llama/Llama-3.1-8B-Instruct", diff --git a/docs/source/features/tool_calling.md b/docs/features/tool_calling.md similarity index 99% rename from docs/source/features/tool_calling.md rename to docs/features/tool_calling.md index f76128406bfd..75cd00e24d7b 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -322,7 +322,6 @@ class ExampleToolParser(ToolParser): tool_calls=[], content=text) - ``` Then you can use this plugin in the command line like this. diff --git a/docs/source/getting_started/faq.md b/docs/getting_started/faq.md similarity index 91% rename from docs/source/getting_started/faq.md rename to docs/getting_started/faq.md index c1bb28937c14..51977d4434f5 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/getting_started/faq.md @@ -1,23 +1,24 @@ -(faq)= - -# Frequently Asked Questions +--- +title: Frequently Asked Questions +--- +[](){ #faq } > Q: How can I serve multiple models on a single port using the OpenAI API? A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly. -______________________________________________________________________ +--- > Q: Which model to use for offline inference embedding? A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5); -more are listed [here](#supported-models). +more are listed [here][supported-models]. By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, but they are expected to be inferior to models that are specifically trained on embedding tasks. -______________________________________________________________________ +--- > Q: Can the output of a prompt vary across runs in vLLM? 
diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml new file mode 100644 index 000000000000..7acfc015ff50 --- /dev/null +++ b/docs/getting_started/installation/.nav.yml @@ -0,0 +1,5 @@ +nav: + - README.md + - gpu.md + - cpu.md + - ai_accelerator.md \ No newline at end of file diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md new file mode 100644 index 000000000000..36bb16cc0224 --- /dev/null +++ b/docs/getting_started/installation/README.md @@ -0,0 +1,20 @@ +--- +title: Installation +--- +[](){ #installation-index } + +vLLM supports the following hardware platforms: + +- [GPU](gpu.md) + - [NVIDIA CUDA](gpu.md#nvidia-cuda) + - [AMD ROCm](gpu.md#amd-rocm) + - [Intel XPU](gpu.md#intel-xpu) +- [CPU](cpu.md) + - [Intel/AMD x86](cpu.md#intelamd-x86) + - [ARM AArch64](cpu.md#arm-aarch64) + - [Apple silicon](cpu.md#apple-silicon) + - [IBM Z (S390X)](cpu.md#ibm-z-s390x) +- [Other AI accelerators](ai_accelerator.md) + - [Google TPU](ai_accelerator.md#google-tpu) + - [Intel Gaudi](ai_accelerator.md#intel-gaudi) + - [AWS Neuron](ai_accelerator.md#aws-neuron) diff --git a/docs/getting_started/installation/ai_accelerator.md b/docs/getting_started/installation/ai_accelerator.md new file mode 100644 index 000000000000..a4f136a172fe --- /dev/null +++ b/docs/getting_started/installation/ai_accelerator.md @@ -0,0 +1,117 @@ +# Other AI accelerators + +vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions: + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:installation" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:installation" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:installation" + +## Requirements + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:requirements" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:requirements" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:requirements" + +## Configure a new environment + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:configure-a-new-environment" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:configure-a-new-environment" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:configure-a-new-environment" + +## Set up using Python + +### Pre-built wheels + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-wheels" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-wheels" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-wheels" + +### Build wheel from source + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-wheel-from-source" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-wheel-from-source" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-wheel-from-source" + +## Set up using Docker + +### Pre-built images + +=== "Google TPU" + + --8<-- 
"docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-images" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-images" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-images" + +### Build image from source + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-image-from-source" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-image-from-source" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-image-from-source" + +## Extra information + +=== "Google TPU" + + --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:extra-information" + +=== "Intel Gaudi" + + --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:extra-information" + +=== "AWS Neuron" + + --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:extra-information" diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md similarity index 84% rename from docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md rename to docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 78938de317c4..1ca8a9216a4e 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -1,12 +1,12 @@ -# Installation +# --8<-- [start:installation] This tab provides instructions on running vLLM with Intel Gaudi devices. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - OS: Ubuntu 22.04 LTS - Python: 3.10 @@ -48,13 +48,16 @@ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-i docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built Intel Gaudi wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] To build and install vLLM from source, run: @@ -75,29 +78,32 @@ pip install -r requirements/hpu.txt python setup.py develop ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] Currently, there are no pre-built Intel Gaudi images. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] ```console docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . 
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` -:::{tip} -If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -::: +!!! tip + If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] ## Supported features -- [Offline inference](#offline-inference) -- Online serving via [OpenAI-Compatible Server](#openai-compatible-server) +- [Offline inference][offline-inference] +- Online serving via [OpenAI-Compatible Server][openai-compatible-server] - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, @@ -157,41 +163,25 @@ Gaudi2 devices. Configurations that are not listed may or may not work. Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. -:::{list-table} vLLM execution modes -:widths: 25 25 50 -:header-rows: 1 - -- * `PT_HPU_LAZY_MODE` - * `enforce_eager` - * execution mode -- * 0 - * 0 - * torch.compile -- * 0 - * 1 - * PyTorch eager mode -- * 1 - * 0 - * HPU Graphs -- * 1 - * 1 - * PyTorch lazy mode -::: - -:::{warning} -In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. -::: - -(gaudi-bucketing-mechanism)= +| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | +|----------------------|-------------------|--------------------| +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +
+| 1 | 1 | PyTorch lazy mode |
+
+*vLLM execution modes*
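+
+For instance, to pick one of these modes for offline inference, a rough sketch (the model name is just a placeholder) is to set the bridge backend before vLLM is imported and choose `enforce_eager` accordingly:
+
+```python
+import os
+
+# 1 = HPU Graphs / PyTorch lazy mode, 0 = torch.compile / eager mode.
+# Typically set before vLLM (and the Habana PyTorch bridge) is imported.
+os.environ["PT_HPU_LAZY_MODE"] = "1"
+
+from vllm import LLM
+
+# With PT_HPU_LAZY_MODE=1: enforce_eager=False -> HPU Graphs,
+# enforce_eager=True -> PyTorch lazy mode (see the table above).
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=False)
+```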
+ +!!! warning + In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. + +[](){ #gaudi-bucketing-mechanism } ### Bucketing mechanism Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. -:::{note} -Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. -::: +!!! note + Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: @@ -224,15 +214,13 @@ min = 128, step = 128, max = 512 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. -:::{warning} -If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. -::: +!!! warning + If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). 
After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. -:::{note} -Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. -::: +!!! note + Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. ### Warmup @@ -252,11 +240,10 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. +This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. -:::{tip} -Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. -::: +!!! tip + Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. ### HPU Graph capture @@ -271,9 +258,8 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. -:::{note} -`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. -::: +!!! note + `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. 
If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: @@ -282,9 +268,8 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. -:::{note} -`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. -::: +!!! note + `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): @@ -401,3 +386,4 @@ the below: higher batches. You can do that by adding `--enforce-eager` flag to server (for online serving), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). 
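+
+Putting the knobs above together for offline inference, a rough sketch (values and model name are placeholders to tune for your workload) might look like:
+
+```python
+import os
+
+# Documented defaults from the sections above; adjust per your workload.
+os.environ["VLLM_GRAPH_RESERVED_MEM"] = "0.1"   # share of usable memory kept for HPU Graphs
+os.environ["VLLM_GRAPH_PROMPT_RATIO"] = "0.3"   # split of graph memory between prefill and decode
+
+from vllm import LLM
+
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
+    enforce_eager=False,  # set True to skip HPU Graph capture if it OOMs
+)
+```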
+# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/ai_accelerator/neuron.inc.md similarity index 79% rename from docs/source/getting_started/installation/ai_accelerator/neuron.inc.md rename to docs/getting_started/installation/ai_accelerator/neuron.inc.md index b4bfb696faa2..671afa8d8900 100644 --- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/getting_started/installation/ai_accelerator/neuron.inc.md @@ -1,14 +1,14 @@ -# Installation +# --8<-- [start:installation] vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. Paged Attention and Chunked Prefill are currently in development and will be available soon. Data types currently supported in Neuron SDK are FP16 and BF16. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - OS: Linux - Python: 3.9 -- 3.11 @@ -63,17 +63,19 @@ sudo apt-get install aws-neuronx-tools=2.* -y export PATH=/opt/aws/neuron/bin:$PATH ``` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built Neuron wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] -:::{note} -The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -::: +!!! note + The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. Following instructions are applicable to Neuron SDK 2.16 and beyond. @@ -122,18 +124,23 @@ VLLM_TARGET_DEVICE="neuron" pip install . If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] Currently, there are no pre-built Neuron images. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] -See for instructions on building the Docker image. +See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. Make sure to use in place of the default Dockerfile. -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] There is no extra information for this device. 
+# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/getting_started/installation/ai_accelerator/tpu.inc.md similarity index 55% rename from docs/source/getting_started/installation/ai_accelerator/tpu.inc.md rename to docs/getting_started/installation/ai_accelerator/tpu.inc.md index 4459cc61e1cd..d0b168120137 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/getting_started/installation/ai_accelerator/tpu.inc.md @@ -1,4 +1,4 @@ -# Installation +# --8<-- [start:installation] Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs @@ -30,11 +30,11 @@ For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tp You may need additional persistent storage for your TPU VMs. For more information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options). -:::{attention} -There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -::: +!!! warning + There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - Google Cloud TPU VM - TPU versions: v6e, v5e, v5p, v4 @@ -51,10 +51,9 @@ When you request queued resources, the request is added to a queue maintained by the Cloud TPU service. When the requested resource becomes available, it's assigned to your Google Cloud project for your immediate exclusive use. -:::{note} -In all of the following commands, replace the ALL CAPS parameter names with -appropriate values. See the parameter descriptions table for more information. -::: +!!! note + In all of the following commands, replace the ALL CAPS parameter names with + appropriate values. See the parameter descriptions table for more information. ### Provision Cloud TPUs with GKE @@ -79,33 +78,15 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` -:::{list-table} Parameter descriptions -:header-rows: 1 - -- * Parameter name - * Description -- * QUEUED_RESOURCE_ID - * The user-assigned ID of the queued resource request. -- * TPU_NAME - * The user-assigned name of the TPU which is created when the queued - resource request is allocated. -- * PROJECT_ID - * Your Google Cloud project -- * ZONE - * The GCP zone where you want to create your Cloud TPU. The value you use - depends on the version of TPUs you are using. For more information, see - `TPU regions and zones `_ -- * ACCELERATOR_TYPE - * The TPU version you want to use. Specify the TPU version, for example - `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, - see [TPU versions](https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions). -- * RUNTIME_VERSION - * The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes). -- * SERVICE_ACCOUNT - * The email address for your service account. You can find it in the IAM - Cloud Console under *Service Accounts*. 
For example: - `tpu-service-account@.iam.gserviceaccount.com` -:::
+| Parameter name | Description |
+|--------------------|-------------|
+| QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. |
+| TPU_NAME | The user-assigned name of the TPU which is created when the queued resource request is allocated. |
+| PROJECT_ID | Your Google Cloud project |
+| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see TPU regions and zones in the Cloud TPU documentation. |
+| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions](https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions). |
+| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes). |
+| SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@.iam.gserviceaccount.com` |
+
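To make the placeholders concrete, a request might look like the sketch below; every value is hypothetical and must be replaced with your own project, zone, and service account.

```console
# Hypothetical values only; substitute your own identifiers.
gcloud alpha compute tpus queued-resources create my-queued-resource \
  --node-id my-v6e-tpu \
  --project my-gcp-project \
  --zone us-east5-b \
  --accelerator-type v6e-1 \
  --runtime-version v2-alpha-tpuv6e \
  --service-account tpu-service-account@my-gcp-project.iam.gserviceaccount.com
```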
Connect to your TPU using SSH: @@ -113,13 +94,16 @@ Connect to your TPU using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE ``` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built TPU wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] Install Miniconda: @@ -161,13 +145,16 @@ Run the setup script: VLLM_TARGET_DEVICE="tpu" python -m pip install -e . ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -See for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. +See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] You can use to build a Docker image with TPU support. @@ -182,31 +169,30 @@ Run the Docker image with the following command: docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` -:::{note} -Since TPU relies on XLA which requires static shapes, vLLM bucketizes the -possible input shapes and compiles an XLA graph for each shape. The -compilation time may take 20~30 minutes in the first run. However, the -compilation time reduces to ~5 minutes afterwards because the XLA graphs are -cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). -::: +!!! note + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the + possible input shapes and compiles an XLA graph for each shape. The + compilation time may take 20~30 minutes in the first run. However, the + compilation time reduces to ~5 minutes afterwards because the XLA graphs are + cached in the disk (in `VLLM_XLA_CACHE_PATH` or `~/.cache/vllm/xla_cache` by default). -:::{tip} -If you encounter the following error: +!!! tip + If you encounter the following error: -```console -from torch._C import * # noqa: F403 -ImportError: libopenblas.so.0: cannot open shared object file: No such -file or directory -``` - -Install OpenBLAS with the following command: + ```console + from torch._C import * # noqa: F403 + ImportError: libopenblas.so.0: cannot open shared object file: No such + file or directory + ``` -```console -sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev -``` + Install OpenBLAS with the following command: -::: + ```console + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + ``` -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] There is no extra information for this device. +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md similarity index 74% rename from docs/source/getting_started/installation/cpu.md rename to docs/getting_started/installation/cpu.md index 2c0ec60d7100..18c96b264ad8 100644 --- a/docs/source/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -2,107 +2,47 @@ vLLM is a Python library that supports the following CPU variants. 
Select your CPU type to see vendor specific instructions: -:::::{tab-set} -:sync-group: device +=== "Intel/AMD x86" -::::{tab-item} Intel/AMD x86 -:selected: -:sync: x86 + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation" -:::{include} cpu/x86.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: +=== "ARM AArch64" -:::: + --8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation" -::::{tab-item} ARM AArch64 -:sync: arm +=== "Apple silicon" -:::{include} cpu/arm.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: + --8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation" -:::: +=== "IBM Z (S390X)" -::::{tab-item} Apple silicon -:sync: apple - -:::{include} cpu/apple.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} IBM Z (S390X) -:sync: s390x - -:::{include} cpu/s390x.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::: + --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation" ## Requirements - Python: 3.9 -- 3.12 -:::::{tab-set} -:sync-group: device - -::::{tab-item} Intel/AMD x86 -:sync: x86 - -:::{include} cpu/x86.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} ARM AArch64 -:sync: arm - -:::{include} cpu/arm.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: +=== "Intel/AMD x86" -:::: + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements" -::::{tab-item} Apple silicon -:sync: apple +=== "ARM AArch64" -:::{include} cpu/apple.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: + --8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements" -:::: +=== "Apple silicon" -::::{tab-item} IBM Z (S390X) -:sync: s390x + --8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements" -:::{include} cpu/s390x.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: +=== "IBM Z (S390X)" -:::: - -::::: + --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements" ## Set up using Python ### Create a new Python environment -:::{include} python_env_setup.inc.md -::: +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" ### Pre-built wheels @@ -110,69 +50,29 @@ Currently, there are no pre-built CPU wheels. 
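Before the source build described in the next section, the environment-setup snippet included above generally amounts to creating and activating a fresh virtual environment, for example the following sketch (assuming `uv`, which the quickstart also recommends; `conda` or `venv` work just as well).

```console
# A minimal sketch of a fresh build environment.
uv venv --python 3.12 --seed
source .venv/bin/activate
```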
### Build wheel from source -:::::{tab-set} -:sync-group: device - -::::{tab-item} Intel/AMD x86 -:sync: x86 - -:::{include} cpu/x86.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} ARM AArch64 -:sync: arm +=== "Intel/AMD x86" -:::{include} cpu/arm.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source" -:::: +=== "ARM AArch64" -::::{tab-item} Apple silicon -:sync: apple + --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source" -:::{include} cpu/apple.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: +=== "Apple silicon" -:::: + --8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source" -::::{tab-item} IBM Z (s390x) -:sync: s390x +=== "IBM Z (s390x)" -:::{include} cpu/s390x.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::: + --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source" ## Set up using Docker ### Pre-built images -:::::{tab-set} -:sync-group: device - -::::{tab-item} Intel/AMD x86 -:sync: x86 - -:::{include} cpu/x86.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: +=== "Intel/AMD x86" -::::: + --8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images" ### Build image from source @@ -192,13 +92,11 @@ $ docker run --rm \ other vLLM OpenAI server arguments ``` -::::{tip} -For ARM or Apple silicon, use `docker/Dockerfile.arm` -:::: +!!! tip + For ARM or Apple silicon, use `docker/Dockerfile.arm` -::::{tip} -For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float` -:::: +!!! tip + For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float` ## Supported features diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md similarity index 58% rename from docs/source/getting_started/installation/cpu/apple.inc.md rename to docs/getting_started/installation/cpu/apple.inc.md index 7bc9e85ecd96..7a91e3ce5e5b 100644 --- a/docs/source/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -1,24 +1,27 @@ -# Installation +# --8<-- [start:installation] vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. 
-## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - OS: `macOS Sonoma` or later - SDK: `XCode 15.4` or later with Command Line Tools - Compiler: `Apple Clang >= 15.0.0` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. @@ -29,9 +32,8 @@ pip install -r requirements/cpu.txt pip install -e . ``` -:::{note} -On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. -::: +!!! note + On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. #### Troubleshooting @@ -51,10 +53,15 @@ If the build has error like the following snippet where standard C++ headers can 1 error generated. ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md new file mode 100644 index 000000000000..59b71dcaf911 --- /dev/null +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -0,0 +1,41 @@ +# --8<-- [start:installation] + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. + +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. + +# --8<-- [end:installation] +# --8<-- [start:requirements] + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): NEON support is required + +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] + +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] + +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] + +--8<-- "docs/getting_started/installation/cpu/cpu/build.inc.md" + +Testing has been conducted on AWS Graviton3 instances for compatibility. 
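As a worked sketch of the shared CPU-backend build on an ARM64 host (for example a Graviton3 instance, as mentioned above): the authoritative steps live in the `build.inc.md` snippet referenced earlier, and the package names below are assumptions for a recent Ubuntu.

```console
# Assumed prerequisites on Ubuntu; adjust for your distribution.
sudo apt-get update && sudo apt-get install -y gcc-12 g++-12 libnuma-dev

# Shared CPU-backend build, as in the included build instructions.
pip install -r requirements/cpu.txt
VLLM_TARGET_DEVICE=cpu python setup.py develop
```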
+ +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] + +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] + +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] + +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md similarity index 96% rename from docs/source/getting_started/installation/cpu/build.inc.md rename to docs/getting_started/installation/cpu/build.inc.md index f385f3d5b198..7d6472afa7ea 100644 --- a/docs/source/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -32,3 +32,5 @@ If you want to develop vllm, install it in editable mode instead. ```console VLLM_TARGET_DEVICE=cpu python setup.py develop ``` + +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md similarity index 64% rename from docs/source/getting_started/installation/cpu/s390x.inc.md rename to docs/getting_started/installation/cpu/s390x.inc.md index 9b41173b44ce..670485feefb6 100644 --- a/docs/source/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -1,25 +1,28 @@ -# Installation +# --8<-- [start:installation] vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform. Currently the CPU implementation for s390x architecture supports FP32 datatype only. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - OS: `Linux` - SDK: `gcc/g++ >= 12.3.0` or later with Command Line Tools - Instruction Set Architecture (ISA): VXE support is required. Works with Z14 and above. - Build install python packages: `pyarrow`, `torch` and `torchvision` -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] Install the following packages from the package manager before building the vLLM. For example on RHEL 9.4: @@ -39,9 +42,8 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \ Execute the following commands to build and install vLLM from the source. -::::{tip} -Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. -:::: +!!! tip + Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. 
```console sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds @@ -53,10 +55,15 @@ Please build the following dependencies, `torchvision`, `pyarrow` from the sourc pip install dist/*.whl ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] -## Extra information +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md new file mode 100644 index 000000000000..9434eeea8b4a --- /dev/null +++ b/docs/getting_started/installation/cpu/x86.inc.md @@ -0,0 +1,46 @@ +# --8<-- [start:installation] + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. + +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. + +# --8<-- [end:installation] +# --8<-- [start:requirements] + +- OS: Linux +- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) +- Instruction Set Architecture (ISA): AVX512 (optional, recommended) + +!!! tip + [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. + +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] + +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] + +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] + +--8<-- "docs/getting_started/installation/cpu/cpu/build.inc.md" + +!!! note + - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. + - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. + +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] + +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] + +See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) + +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] + +# --8<-- [end:build-image-from-source] +# --8<-- [start:extra-information] +# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/device.template.md b/docs/getting_started/installation/device.template.md similarity index 100% rename from docs/source/getting_started/installation/device.template.md rename to docs/getting_started/installation/device.template.md diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md new file mode 100644 index 000000000000..3c983f600673 --- /dev/null +++ b/docs/getting_started/installation/gpu.md @@ -0,0 +1,124 @@ +# GPU + +vLLM is a Python library that supports the following GPU variants. 
Select your GPU type to see vendor specific instructions: + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation" + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements" + +## Set up using Python + +### Create a new Python environment + +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:create-a-new-python-environment" + +=== "AMD ROCm" + + There is no extra information on creating a new Python environment for this device. + +=== "Intel XPU" + + There is no extra information on creating a new Python environment for this device. + +### Pre-built wheels + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels" + +[](){ #build-from-source } + +### Build wheel from source + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source" + +## Set up using Docker + +### Pre-built images + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images" + +### Build image from source + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source" + +## Supported features + +=== "NVIDIA CUDA" + + --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features" + +=== "AMD ROCm" + + --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features" + +=== "Intel XPU" + + --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features" diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md similarity index 74% rename from docs/source/getting_started/installation/gpu/cuda.inc.md rename to docs/getting_started/installation/gpu/cuda.inc.md index d3d4b4ef6c80..8653f980501f 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -1,24 +1,26 @@ -# Installation +# --8<-- [start:installation] vLLM contains pre-compiled C++ and CUDA (12.8) binaries. 
-## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] ### Create a new Python environment -:::{note} -PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. -::: +!!! note + PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. -Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details. +Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details. -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] You can install vLLM using either `pip` or `uv pip`: @@ -32,9 +34,8 @@ uv pip install vllm --torch-backend=auto We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. -:::{note} -NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. -::: +!!! note + NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: @@ -45,7 +46,7 @@ export PYTHON_VERSION=312 uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` -(install-the-latest-code)= +[](){ #install-the-latest-code } #### Install the latest code @@ -87,7 +88,8 @@ uv pip install vllm --torch-backend=auto --extra-index-url https://wheels.vllm.a The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. 
A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] #### Set up using Python-only build (without compilation) @@ -105,10 +107,9 @@ This command will do the following: 3. Download the pre-built wheel of the base commit. 4. Use its compiled libraries in the installation. -:::{note} -1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol. -2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date. -::: +!!! note + 1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol. + 2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date. In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. @@ -118,12 +119,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll pip install --editable . ``` -You can find more information about vLLM's wheels in . +You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code]. -:::{note} -There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. -It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. -::: +!!! note + There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel. #### Full build (with compilation) @@ -135,17 +135,16 @@ cd vllm pip install -e . ``` -:::{tip} -Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. +!!! tip + Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. -For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . 
-As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . + As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. -When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built. + When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built. -[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. -The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. -::: + [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. + The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. ##### Use an existing PyTorch installation @@ -220,11 +219,13 @@ export VLLM_TARGET_DEVICE=empty pip install -e . ``` -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] -See for instructions on using the official Docker image. +See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image. Another way to access the latest code is to use the docker images: @@ -237,10 +238,12 @@ These docker images are used for CI and testing only, and they are not intended The latest code can contain bugs and may not be stable. Please use it with caution. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] -See for instructions on building the Docker image. +See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. ## Supported features -See compatibility matrix for feature support information. +See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information. 
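As a sketch of the remote-cache setup described in the `sccache` tip above, the environment variables can simply be exported before kicking off a full build; the values are the ones quoted in the tip, and a private S3 bucket would use your own names.

```console
# Configure the sccache remote as described above, then build.
export SCCACHE_BUCKET=vllm-build-sccache
export SCCACHE_REGION=us-west-2
export SCCACHE_S3_NO_CREDENTIALS=1
export SCCACHE_IDLE_TIMEOUT=0
pip install -e .
```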
+# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md similarity index 72% rename from docs/source/getting_started/installation/gpu/rocm.inc.md rename to docs/getting_started/installation/gpu/rocm.inc.md index dc74368fe2c9..85d539b75669 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -1,28 +1,31 @@ -# Installation +# --8<-- [start:installation] vLLM supports AMD GPUs with ROCm 6.3. -:::{attention} -There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -::: +!!! warning + There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201) - ROCm 6.3 -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built ROCm wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): -- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) -- [PyTorch](https://pytorch.org/) + - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) + - [PyTorch](https://pytorch.org/) For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3. @@ -49,9 +52,8 @@ Currently, there are no pre-built ROCm wheels. cd ../.. ``` - :::{note} - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - ::: + !!! note + If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention) @@ -69,9 +71,8 @@ Currently, there are no pre-built ROCm wheels. cd .. ``` - :::{note} - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - ::: + !!! note + You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) 3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps: @@ -84,9 +85,8 @@ Currently, there are no pre-built ROCm wheels. python3 setup.py develop ``` - :::{note} - You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose. - ::: + !!! note + You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose. 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: @@ -108,31 +108,30 @@ Currently, there are no pre-built ROCm wheels. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - :::{tip} - - Triton flash attention is used by default. 
For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - ::: + !!! tip + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. + - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. -:::{tip} -- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -::: +!!! tip + - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). ## Set up using Docker (Recommended) -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. -:::{tip} -Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) -for instructions on how to use this prebuilt docker image. -::: +!!! tip + Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) + for instructions on how to use this prebuilt docker image. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] Building the Docker image from source is the recommended way to use vLLM with ROCm. @@ -213,4 +212,5 @@ Where the `` is the location where the model is stored, for examp ## Supported features -See compatibility matrix for feature support information. +See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information. 
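Relating to the attention-backend tip earlier in this section, switching away from the default Triton flash attention is a one-line environment change; a minimal sketch follows, with an illustrative model name.

```console
# Fall back to CK flash-attention or naive attention instead of Triton flash attention.
export VLLM_USE_TRITON_FLASH_ATTN=0
vllm serve Qwen/Qwen2.5-1.5B-Instruct
```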
+# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md similarity index 67% rename from docs/source/getting_started/installation/gpu/xpu.inc.md rename to docs/getting_started/installation/gpu/xpu.inc.md index 74937a184227..bee9a7ebb717 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/getting_started/installation/gpu/xpu.inc.md @@ -1,23 +1,26 @@ -# Installation +# --8<-- [start:installation] vLLM initially supports basic model inference and serving on Intel GPU platform. -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: +!!! warning + There are no pre-built wheels or images for this device, so you must build vLLM from source. -## Requirements +# --8<-- [end:installation] +# --8<-- [start:requirements] - Supported Hardware: Intel Data Center GPU, Intel ARC GPU - OneAPI requirements: oneAPI 2025.0 -## Set up using Python +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] -### Pre-built wheels +# --8<-- [end:set-up-using-python] +# --8<-- [start:pre-built-wheels] Currently, there are no pre-built XPU wheels. -### Build wheel from source +# --8<-- [end:pre-built-wheels] +# --8<-- [start:build-wheel-from-source] - First, install required driver and Intel OneAPI 2025.0 or later. - Second, install Python packages for vLLM XPU backend building: @@ -35,18 +38,20 @@ pip install -v -r requirements/xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -:::{note} -- FP16 is the default data type in the current XPU backend. The BF16 data - type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. -::: +!!! note + - FP16 is the default data type in the current XPU backend. The BF16 data + type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. -## Set up using Docker +# --8<-- [end:build-wheel-from-source] +# --8<-- [start:set-up-using-docker] -### Pre-built images +# --8<-- [end:set-up-using-docker] +# --8<-- [start:pre-built-images] Currently, there are no pre-built XPU images. -### Build image from source +# --8<-- [end:pre-built-images] +# --8<-- [start:build-image-from-source] ```console $ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . @@ -73,3 +78,4 @@ python -m vllm.entrypoints.openai.api_server \ ``` By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. 
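A minimal sketch of manually bringing up a Ray cluster before launching the server, as recommended above; the port and address are illustrative, and the helper script mentioned in the docs automates these steps.

```console
# On the head node:
ray start --head --port=6379

# On each worker node, pointing at the head node's address:
ray start --address=<head-node-ip>:6379
```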
+# --8<-- [end:extra-information] diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md similarity index 100% rename from docs/source/getting_started/installation/python_env_setup.inc.md rename to docs/getting_started/installation/python_env_setup.inc.md diff --git a/docs/source/getting_started/quickstart.md b/docs/getting_started/quickstart.md similarity index 75% rename from docs/source/getting_started/quickstart.md rename to docs/getting_started/quickstart.md index ecca296b0b0c..d24e75e8141d 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -1,11 +1,12 @@ -(quickstart)= - -# Quickstart +--- +title: Quickstart +--- +[](){ #quickstart } This guide will help you quickly get started with vLLM to perform: -- [Offline batched inference](#quickstart-offline) -- [Online serving using OpenAI-compatible server](#quickstart-online) +- [Offline batched inference][quickstart-offline] +- [Online serving using OpenAI-compatible server][quickstart-online] ## Prerequisites @@ -41,31 +42,29 @@ pip install --upgrade uv uv pip install vllm --torch-backend=auto ``` -:::{note} -For more detail and non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. -::: +!!! note + For more detail and non-CUDA platforms, please refer [here][installation-index] for specific instructions on how to install vLLM. -(quickstart-offline)= +[](){ #quickstart-offline } ## Offline Batched Inference With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: -The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: +The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]: -- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine. -- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process. +- [LLM][vllm.LLM] is the main class for running offline inference with vLLM engine. +- [SamplingParams][vllm.SamplingParams] specifies the parameters for the sampling process. ```python from vllm import LLM, SamplingParams ``` -The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params). -:::{important} -By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified. +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params]. +!!! warning + By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. 
In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. -However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance. -::: + However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. ```python prompts = [ @@ -77,20 +76,18 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ``` -The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models). +The [LLM][vllm.LLM] class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here][supported-models]. ```python llm = LLM(model="facebook/opt-125m") ``` -:::{note} -By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. - -```shell -export VLLM_USE_MODELSCOPE=True -``` +!!! note + By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. -::: + ```shell + export VLLM_USE_MODELSCOPE=True + ``` Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. @@ -103,7 +100,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -(quickstart-online)= +[](){ #quickstart-online } ## OpenAI-Compatible Server @@ -116,15 +113,13 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` -:::{note} -By default, the server uses a predefined chat template stored in the tokenizer. -You can learn about overriding it [here](#chat-template). -::: -:::{important} -By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. +!!! note + By default, the server uses a predefined chat template stored in the tokenizer. + You can learn about overriding it [here][chat-template]. +!!! warning + By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. -To disable this behavior, please pass `--generation-config vllm` when launching the server. -::: + To disable this behavior, please pass `--generation-config vllm` when launching the server. This server can be queried in the same format as OpenAI API. 
For example, to list the models: @@ -215,6 +210,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. -```{attention} -There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see for instructions on how to install it. -``` +!!! warning + There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see for instructions on how to install it. diff --git a/docs/source/getting_started/troubleshooting.md b/docs/getting_started/troubleshooting.md similarity index 86% rename from docs/source/getting_started/troubleshooting.md rename to docs/getting_started/troubleshooting.md index a4744827f226..07e30f9684ae 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/getting_started/troubleshooting.md @@ -1,12 +1,12 @@ -(troubleshooting)= - -# Troubleshooting +--- +title: Troubleshooting +--- +[](){ #troubleshooting } This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -:::{note} -Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -::: +!!! note + Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. ## Hangs downloading a model @@ -18,13 +18,12 @@ It's recommended to download the model first using the [huggingface-cli](https:/ If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -:::{note} -To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. -::: +!!! note + To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. ## Out of memory -If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. 
Consider adopting [these options](#reducing-memory-usage) to reduce the memory consumption. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options][reducing-memory-usage] to reduce the memory consumption. ## Generation quality changed @@ -53,9 +52,9 @@ You might also need to set `export NCCL_SOCKET_IFNAME=` ## Error near `self.graph.replay()` If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. -To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. +To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the [LLM][vllm.LLM] class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. -(troubleshooting-incorrect-hardware-driver)= +[](){ #troubleshooting-incorrect-hardware-driver } ## Incorrect hardware/driver @@ -140,16 +139,15 @@ If the script runs successfully, you should see the message `sanity check is suc If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. -:::{note} -A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: +!!! note + A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: -- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. -- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. + - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. + - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. -Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. -::: + Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. 
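Tying back to the "Out of memory" section above, the usual first-line mitigations are CLI knobs on the server (or the matching `LLM` constructor arguments); the values below are illustrative starting points, not recommendations.

```console
# Hedged sketch: common memory-reduction flags with illustrative values.
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
  --gpu-memory-utilization 0.80 \
  --max-model-len 4096 \
  --max-num-seqs 64
```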
-(troubleshooting-python-multiprocessing)= +[](){ #troubleshooting-python-multiprocessing } ## Python multiprocessing @@ -260,7 +258,7 @@ or: ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] ``` -But you are sure that the model is in the [list of supported models](#supported-models), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](#model-resolution) to explicitly specify the vLLM implementation for the model. +But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps][model-resolution] to explicitly specify the vLLM implementation for the model. ## Failed to infer device type diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/getting_started/v1_user_guide.md similarity index 100% rename from docs/source/getting_started/v1_user_guide.md rename to docs/getting_started/v1_user_guide.md diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 747ffb7b3033..000000000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py new file mode 100644 index 000000000000..9144f6824b09 --- /dev/null +++ b/docs/mkdocs/hooks/generate_examples.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 + +import itertools +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +ROOT_DIR = Path(__file__).parent.parent.parent.parent +ROOT_DIR_RELATIVE = '../../../../..' +EXAMPLE_DIR = ROOT_DIR / "examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/getting_started/examples" +print(ROOT_DIR.resolve()) +print(EXAMPLE_DIR.resolve()) +print(EXAMPLE_DOC_DIR.resolve()) + + +def fix_case(text: str) -> str: + subs = { + "api": "API", + "cli": "CLI", + "cpu": "CPU", + "llm": "LLM", + "mae": "MAE", + "tpu": "TPU", + "aqlm": "AQLM", + "gguf": "GGUF", + "lora": "LoRA", + "rlhf": "RLHF", + "vllm": "vLLM", + "openai": "OpenAI", + "lmcache": "LMCache", + "multilora": "MultiLoRA", + "mlpspeculator": "MLPSpeculator", + r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 + r"int\d+": lambda x: x.group(0).upper(), # e.g. int8, int16 + } + for pattern, repl in subs.items(): + text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) + return text + + +@dataclass +class Example: + """ + Example class for generating documentation content from a given path. + + Attributes: + path (Path): The path to the main directory or file. + category (str): The category of the document. + main_file (Path): The main file in the directory. 
+ other_files (list[Path]): list of other files in the directory. + title (str): The title of the document. + + Methods: + __post_init__(): Initializes the main_file, other_files, and title attributes. + determine_main_file() -> Path: Determines the main file in the given path. + determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. + determine_title() -> str: Determines the title of the document. + generate() -> str: Generates the documentation content. + """ # noqa: E501 + path: Path + category: str = None + main_file: Path = field(init=False) + other_files: list[Path] = field(init=False) + title: str = field(init=False) + + def __post_init__(self): + self.main_file = self.determine_main_file() + self.other_files = self.determine_other_files() + self.title = self.determine_title() + + def determine_main_file(self) -> Path: + """ + Determines the main file in the given path. + If the path is a file, it returns the path itself. Otherwise, it searches + for Markdown files (*.md) in the directory and returns the first one found. + Returns: + Path: The main file path, either the original path if it's a file or the first + Markdown file found in the directory. + Raises: + IndexError: If no Markdown files are found in the directory. + """ # noqa: E501 + return self.path if self.path.is_file() else list( + self.path.glob("*.md")).pop() + + def determine_other_files(self) -> list[Path]: + """ + Determine other files in the directory excluding the main file. + + This method checks if the given path is a file. If it is, it returns an empty list. + Otherwise, it recursively searches through the directory and returns a list of all + files that are not the main file. + + Returns: + list[Path]: A list of Path objects representing the other files in the directory. + """ # noqa: E501 + if self.path.is_file(): + return [] + is_other_file = lambda file: file.is_file() and file != self.main_file + return [file for file in self.path.rglob("*") if is_other_file(file)] + + def determine_title(self) -> str: + return fix_case(self.path.stem.replace("_", " ").title()) + + def generate(self) -> str: + content = f"---\ntitle: {self.title}\n---\n\n" + content += f"Source .\n\n" + + is_code = self.main_file.suffix != ".md" + if is_code: + content += f"```{self.main_file.suffix[1:]}\n" + content += f'--8<-- "{self.main_file}"\n' + if is_code: + content += "```\n" + content += "\n" + + if not self.other_files: + return content + + content += "## Example materials\n\n" + for file in sorted(self.other_files): + content += f'??? 
abstract "{file.relative_to(self.path)}"\n' + if file.suffix != ".md": + content += f" ```{file.suffix[1:]}\n" + content += f' --8<-- "{file}"\n' + if file.suffix != ".md": + content += " ```\n" + + return content + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + # Create the EXAMPLE_DOC_DIR if it doesn't exist + if not EXAMPLE_DOC_DIR.exists(): + EXAMPLE_DOC_DIR.mkdir(parents=True) + + categories = sorted(p for p in EXAMPLE_DIR.iterdir() if p.is_dir()) + + examples = [] + glob_patterns = ["*.py", "*.md", "*.sh"] + # Find categorised examples + for category in categories: + globs = [category.glob(pattern) for pattern in glob_patterns] + for path in itertools.chain(*globs): + examples.append(Example(path, category.stem)) + # Find examples in subdirectories + for path in category.glob("*/*.md"): + examples.append(Example(path.parent, category.stem)) + + # Generate the example documentation + for example in sorted(examples, key=lambda e: e.path.stem): + example_name = f"{example.path.stem}.md" + doc_path = EXAMPLE_DOC_DIR / example.category / example_name + print(doc_path) + if not doc_path.parent.exists(): + doc_path.parent.mkdir(parents=True) + with open(doc_path, "w+") as f: + f.write(example.generate()) diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py new file mode 100644 index 000000000000..e5f8549d8383 --- /dev/null +++ b/docs/mkdocs/hooks/remove_announcement.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: Apache-2.0 +import os +from typing import Literal + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa + if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag": + # remove the warning banner if the version is a tagged release + docs_dir = os.path.dirname(__file__) + announcement_path = os.path.join(docs_dir, + "mkdocs/overrides/main.html") + # The file might be removed already if the build is triggered multiple + # times (readthedocs build both HTML and PDF versions separately) + if os.path.exists(announcement_path): + os.remove(announcement_path) diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py new file mode 100644 index 000000000000..03e7ffbb2733 --- /dev/null +++ b/docs/mkdocs/hooks/url_schemes.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +import re + +from mkdocs.config.defaults import MkDocsConfig +from mkdocs.structure.files import Files +from mkdocs.structure.pages import Page + + +def on_page_markdown(markdown: str, *, page: Page, config: MkDocsConfig, + files: Files): + gh_icon = ":octicons-mark-github-16:" + gh_url = "https://github.com" + repo_url = f"{gh_url}/vllm-project/vllm" + org_url = f"{gh_url}/orgs/vllm-project" + urls = { + "issue": f"{repo_url}/issues", + "pr": f"{repo_url}/pull", + "project": f"{org_url}/projects", + "dir": f"{repo_url}/tree/main", + "file": f"{repo_url}/blob/main", + } + titles = { + "issue": "Issue #", + "pr": "Pull Request #", + "project": "Project #", + "dir": "", + "file": "", + } + + scheme = r"gh-(?P.+?):(?P.+?)(#(?P.+?))?" 
+ inline_link = re.compile(r"\[(?P[^\[]+?)\]\(" + scheme + r"\)") + auto_link = re.compile(f"<{scheme}>") + + def replace_inline_link(match: re.Match) -> str: + url = f'{urls[match.group("type")]}/{match.group("path")}' + if fragment := match.group("fragment"): + url += f"#{fragment}" + + return f'[{gh_icon} {match.group("title")}]({url})' + + def replace_auto_link(match: re.Match) -> str: + type = match.group("type") + path = match.group("path") + title = f"{titles[type]}{path}" + url = f"{urls[type]}/{path}" + if fragment := match.group("fragment"): + url += f"#{fragment}" + + return f"[{gh_icon} {title}]({url})" + + markdown = inline_link.sub(replace_inline_link, markdown) + markdown = auto_link.sub(replace_auto_link, markdown) + + return markdown diff --git a/docs/source/_static/custom.js b/docs/mkdocs/javascript/run_llm_widget.js similarity index 54% rename from docs/source/_static/custom.js rename to docs/mkdocs/javascript/run_llm_widget.js index 58bc2ebb9614..d0e5560e92b4 100644 --- a/docs/source/_static/custom.js +++ b/docs/mkdocs/javascript/run_llm_widget.js @@ -17,22 +17,3 @@ document.addEventListener("DOMContentLoaded", function () { script.async = true; document.head.appendChild(script); }); - -// Update URL search params when tab is clicked - document.addEventListener("DOMContentLoaded", function () { - const tabs = document.querySelectorAll(".sd-tab-label"); - - function updateURL(tab) { - const syncGroup = tab.getAttribute("data-sync-group"); - const syncId = tab.getAttribute("data-sync-id"); - if (syncGroup && syncId) { - const url = new URL(window.location); - url.searchParams.set(syncGroup, syncId); - window.history.replaceState(null, "", url); - } - } - - tabs.forEach(tab => { - tab.addEventListener("click", () => updateURL(tab)); - }); -}); diff --git a/docs/mkdocs/overrides/main.html b/docs/mkdocs/overrides/main.html new file mode 100644 index 000000000000..bdd62ebc158d --- /dev/null +++ b/docs/mkdocs/overrides/main.html @@ -0,0 +1,5 @@ +{% extends "base.html" %} + +{% block announce %} + <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p> +{% endblock %} diff --git a/docs/source/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md similarity index 100% rename from docs/source/models/extensions/fastsafetensor.md rename to docs/models/extensions/fastsafetensor.md diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md similarity index 86% rename from docs/source/models/extensions/runai_model_streamer.md rename to docs/models/extensions/runai_model_streamer.md index e0daa6f86dde..c80120fa98f2 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -1,6 +1,7 @@ -(runai-model-streamer)= - -# Loading models with Run:ai Model Streamer +--- +title: Loading models with Run:ai Model Streamer +--- +[](){ #runai-model-streamer } Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md). 
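As a rough offline-inference counterpart to the `vllm serve` commands shown below (the model path and tunable values here are placeholders), the same loader can be selected through the `LLM` constructor:

```python
# Sketch: load a safetensors checkpoint through the Run:ai Model Streamer loader.
from vllm import LLM

llm = LLM(
    model="/path/to/your/model",  # placeholder: local path (or object-store URL) to a safetensors checkpoint
    load_format="runai_streamer",
    # Loader tunables are passed through `model_loader_extra_config`,
    # e.g. reader concurrency and the CPU buffer memory limit in bytes.
    model_loader_extra_config={"concurrency": 16, "memory_limit": 5368709120},
)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```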
@@ -48,9 +49,8 @@ You can read further about CPU buffer memory limiting [here](https://github.com/ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` -:::{note} -For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). -::: +!!! note + For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). ## Sharded Model Loading @@ -74,6 +74,5 @@ The sharded loader supports all the same tunable parameters as the regular Run:a vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' ``` -:::{note} -The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. -::: +!!! note + The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. diff --git a/docs/source/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md similarity index 79% rename from docs/source/models/extensions/tensorizer.md rename to docs/models/extensions/tensorizer.md index cd94c81e620a..36b49626d47d 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -1,6 +1,7 @@ -(tensorizer)= - -# Loading models with CoreWeave's Tensorizer +--- +title: Loading models with CoreWeave's Tensorizer +--- +[](){ #tensorizer } vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized @@ -11,6 +12,5 @@ For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html). -:::{note} -Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. -::: +!!! note + Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/models/generative_models.md b/docs/models/generative_models.md similarity index 63% rename from docs/source/models/generative_models.md rename to docs/models/generative_models.md index dd765e4a9765..566b1c29fca9 100644 --- a/docs/source/models/generative_models.md +++ b/docs/models/generative_models.md @@ -1,24 +1,25 @@ -(generative-models)= - -# Generative Models +--- +title: Generative Models +--- +[](){ #generative-models } vLLM provides first-class support for generative models, which covers most of LLMs. -In vLLM, generative models implement the {class}`~vllm.model_executor.models.VllmModelForTextGeneration` interface. 
+In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface. Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, -which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. +which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text. For generative models, the only supported `--task` option is `"generate"`. Usually, this is automatically inferred so you don't have to specify it. ## Offline Inference -The {class}`~vllm.LLM` class provides various methods for offline inference. -See <project:#configuration> for a list of options when initializing the model. +The [LLM][vllm.LLM] class provides various methods for offline inference. +See [configuration][configuration] for a list of options when initializing the model. ### `LLM.generate` -The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. +The [generate][vllm.LLM.generate] method is available to all generative models in vLLM. It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate), except that tokenization and detokenization are also performed automatically. @@ -34,7 +35,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. +You can optionally control the language generation by passing [SamplingParams][vllm.SamplingParams]. For example, you can use greedy sampling by setting `temperature=0`: ```python @@ -50,16 +51,15 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -:::{important} -By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified. +!!! warning + By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified. -However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance. -::: + However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance. A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py> ### `LLM.beam_search` -The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of {class}`~vllm.LLM.generate`. +The [beam_search][vllm.LLM.beam_search] method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of [generate][vllm.LLM.generate]. For example, to search using 5 beams and output at most 50 tokens: ```python @@ -77,14 +77,13 @@ for output in outputs: ### `LLM.chat` -The {class}`~vllm.LLM.chat` method implements chat functionality on top of {class}`~vllm.LLM.generate`. 
+The [chat][vllm.LLM.chat] method implements chat functionality on top of [generate][vllm.LLM.generate]. In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. -:::{important} -In general, only instruction-tuned models have a chat template. -Base models may perform poorly as they are not trained to respond to the chat conversation. -::: +!!! warning + In general, only instruction-tuned models have a chat template. + Base models may perform poorly as they are not trained to respond to the chat conversation. ```python from vllm import LLM @@ -133,7 +132,7 @@ outputs = llm.chat(conversation, chat_template=custom_template) ## Online Serving -Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs: -- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. -- [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. +- [Completions API][completions-api] is similar to `LLM.generate` but only accepts text. +- [Chat API][chat-api] is similar to `LLM.chat`, accepting both text and [multi-modal inputs][multimodal-inputs] for models with a chat template. diff --git a/docs/source/models/pooling_models.md b/docs/models/pooling_models.md similarity index 62% rename from docs/source/models/pooling_models.md rename to docs/models/pooling_models.md index 3fd35e2e8bd1..89a128915a76 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -1,70 +1,48 @@ -(pooling-models)= - -# Pooling Models +--- +title: Pooling Models +--- +[](){ #pooling-models } vLLM also supports pooling models, including embedding, reranking and reward models. -In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmModelForPooling` interface. -These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input +In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface. +These models use a [Pooler][vllm.model_executor.layers.Pooler] to extract the final hidden states of the input before returning them. -:::{note} -We currently support pooling models primarily as a matter of convenience. -As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to -pooling models as they only work on the generation or decode stage, so performance may not improve as much. -::: +!!! note + We currently support pooling models primarily as a matter of convenience. + As shown in the [Compatibility Matrix][compatibility-matrix], most vLLM features are not applicable to + pooling models as they only work on the generation or decode stage, so performance may not improve as much. For pooling models, we support the following `--task` options. 
The selected option sets the default pooler used to extract the final hidden states: -:::{list-table} -:widths: 50 25 25 25 -:header-rows: 1 - -- * Task - * Pooling Type - * Normalization - * Softmax -- * Embedding (`embed`) - * `LAST` - * ✅︎ - * ❌ -- * Classification (`classify`) - * `LAST` - * ❌ - * ✅︎ -- * Sentence Pair Scoring (`score`) - * \* - * \* - * \* -- * Reward Modeling (`reward`) - * `ALL` - * ❌ - * ❌ -::: +| Task | Pooling Type | Normalization | Softmax | +|---------------------------------|----------------|-----------------|-----------| +| Embedding (`embed`) | `LAST` | ✅︎ | ❌ | +| Classification (`classify`) | `LAST` | ❌ | ✅︎ | +| Sentence Pair Scoring (`score`) | \* | \* | \* | \*The default pooler is always defined by the model. -:::{note} -If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. -::: +!!! note + If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). -:::{tip} -You can customize the model's pooling method via the `--override-pooler-config` option, -which takes priority over both the model's and Sentence Transformers's defaults. -::: +!!! tip + You can customize the model's pooling method via the `--override-pooler-config` option, + which takes priority over both the model's and Sentence Transformers's defaults. ## Offline Inference -The {class}`~vllm.LLM` class provides various methods for offline inference. -See <project:#configuration> for a list of options when initializing the model. +The [LLM][vllm.LLM] class provides various methods for offline inference. +See [configuration][configuration] for a list of options when initializing the model. ### `LLM.encode` -The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. +The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. It returns the extracted hidden states directly, which is useful for reward models. ```python @@ -79,7 +57,7 @@ print(f"Data: {data!r}") ### `LLM.embed` -The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt. +The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt. It is primarily designed for embedding models. ```python @@ -96,7 +74,7 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/embe ### `LLM.classify` -The {class}`~vllm.LLM.classify` method outputs a probability vector for each prompt. +The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt. It is primarily designed for classification models. ```python @@ -113,13 +91,12 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/clas ### `LLM.score` -The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs. +The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs. It is designed for embedding models and cross encoder models. Embedding models use cosine similarity, and [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems. -:::{note} -vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. 
-To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). -::: +!!! note + vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. + To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). ```python from vllm import LLM @@ -136,27 +113,25 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/scor ## Online Serving -Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs: -- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. -- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models. -- [Classification API](#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models. -- [Score API](#score-api) is similar to `LLM.score` for cross-encoder models. +- [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models. +- [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs][multimodal-inputs] for embedding models. +- [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models. +- [Score API][score-api] is similar to `LLM.score` for cross-encoder models. ## Matryoshka Embeddings [Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost. -:::{warning} -Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. - -For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. +!!! warning + Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings. -```json -{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} -``` + For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error. 
-::: + ```json + {"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400} + ``` ### Manually enable Matryoshka Embeddings @@ -172,7 +147,7 @@ vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_ ### Offline Inference -You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in {class}`~vllm.PoolingParams`. +You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams]. ```python from vllm import LLM, PoolingParams diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md new file mode 100644 index 000000000000..416fe42fcb79 --- /dev/null +++ b/docs/models/supported_models.md @@ -0,0 +1,690 @@ +--- +title: Supported Models +--- +[](){ #supported-models } + +vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks. +If a model supports more than one task, you can set the task via the `--task` argument. + +For each task, we list the model architectures that have been implemented in vLLM. +Alongside each architecture, we include some popular models that use it. + +## Model Implementation + +### vLLM + +If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>. + +These models are what we list in [supported-text-models][supported-text-models] and [supported-mm-models][supported-mm-models]. + +[](){ #transformers-backend } + +### Transformers + +vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! + +To check if the modeling backend is Transformers, you can simply do this: + +```python +from vllm import LLM +llm = LLM(model=..., task="generate") # Name or path of your model +llm.apply_model(lambda model: print(type(model))) +``` + +If it is `TransformersForCausalLM` then it means it's based on Transformers! + +!!! tip + You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][openai-compatible-server]. + +!!! note + vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. + +#### Custom models + +If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! + +For a model to be compatible with the Transformers backend for vLLM it must: + +- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): + * The model directory must have the correct structure (e.g. `config.json` is present). + * `config.json` must contain `auto_map.AutoModel`. +- be a Transformers backend for vLLM compatible model (see [writing-custom-models][writing-custom-models]): + * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). 
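Combining the tip and the snippet above, a minimal sketch (the model name is only a placeholder) that forces the Transformers backend and prints the implementation class that was resolved:

```python
# Sketch: force the Transformers backend and confirm which implementation was picked.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder: any decoder-only LM
    task="generate",
    model_impl="transformers",  # skip vLLM's native implementation on purpose
)
llm.apply_model(lambda model: print(type(model)))  # expect TransformersForCausalLM
```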
+ +If the compatible model is: + +- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference][offline-inference] or `--trust-remote-code` for the [openai-compatible-server][openai-compatible-server]. +- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference][offline-inference] or `vllm serve <MODEL_DIR>` for the [openai-compatible-server][openai-compatible-server]. + +This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! + +[](){ #writing-custom-models } + +#### Writing custom models + +This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). + +To make your model compatible with the Transformers backend, it needs: + +1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. +2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. +3. `MyModel` must contain `_supports_attention_backend = True`. + +```python title="modeling_my_model.py" + +from transformers import PreTrainedModel +from torch import nn + +class MyAttention(nn.Module): + + def forward(self, hidden_states, **kwargs): + ... + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + **kwargs, + ) + ... + +class MyModel(PreTrainedModel): + _supports_attention_backend = True +``` + +Here is what happens in the background when this model is loaded: + +1. The config is loaded. +2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. +3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. + +That's it! + +For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: + +```python title="configuration_my_model.py" + +from transformers import PretrainedConfig + +class MyConfig(PretrainedConfig): + base_model_tp_plan = { + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } +``` + +- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). 
+- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: + * You only need to do this for layers which are not present on all pipeline stages + * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages + * The `list` in the first element of the `tuple` contains the names of the input arguments + * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code + +## Loading a Model + +### Hugging Face Hub + +By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). + +To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. +If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. + +Models do not _need_ to be natively supported to be used in vLLM. +The [Transformers backend][transformers-backend] enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). + +!!! tip + The easiest way to check if your model is really supported at runtime is to run the program below: + + ```python + from vllm import LLM + + # For generative models (task=generate) only + llm = LLM(model=..., task="generate") # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + + # For pooling models (task={embed,classify,reward,score}) only + llm = LLM(model=..., task="embed") # Name or path of your model + output = llm.encode("Hello, my name is") + print(output) + ``` + + If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. + +Otherwise, please refer to [Adding a New Model][new-model] for instructions on how to implement your model in vLLM. +Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. 
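If the default cache location is too small, the `HF_HOME` variable mentioned above can also be set from Python before anything from Hugging Face is imported; a small sketch, assuming a hypothetical scratch disk mounted at `/data`:

```python
# Sketch: redirect the Hugging Face cache to a larger disk before loading a model.
import os

os.environ["HF_HOME"] = "/data/huggingface"  # hypothetical scratch-disk path

# Import after setting HF_HOME so the new cache location is picked up.
from vllm import LLM

llm = LLM(model="HuggingFaceH4/zephyr-7b-beta")
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```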
+ +#### Download a model + +If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository: + +```console +# Download a model +huggingface-cli download HuggingFaceH4/zephyr-7b-beta + +# Specify a custom cache directory +huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache + +# Download a specific file from a model repo +huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json +``` + +#### List the downloaded models + +Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache: + +```console +# List cached models +huggingface-cli scan-cache + +# Show detailed (verbose) output +huggingface-cli scan-cache -v + +# Specify a custom cache directory +huggingface-cli scan-cache --dir ~/.cache/huggingface/hub +``` + +#### Delete a cached model + +Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache: + +```console +# The `delete-cache` command requires extra dependencies to work with the TUI. +# Please run `pip install huggingface_hub[cli]` to install them. + +# Launch the interactive TUI to select models to delete +$ huggingface-cli delete-cache +? Select revisions to delete: 1 revisions selected counting for 438.9M. + ○ None of the following (if selected, nothing will be deleted). +Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago) +❯ ◉ a5beb1e3: main # modified 1 week ago + +Model BAAI/bge-large-en-v1.5 (1.3G, used 1 week ago) + ○ d4aa6901: main # modified 1 week ago + +Model BAAI/bge-reranker-base (1.1G, used 4 weeks ago) + ○ 2cfc18c9: main # modified 4 weeks ago + +Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification. + +# Need to confirm after selected +? Select revisions to delete: 1 revision(s) selected. +? 1 revisions selected counting for 438.9M. Confirm deletion ? Yes +Start deletion. +Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M. +``` + +#### Using a proxy + +Here are some tips for loading/downloading models from Hugging Face using a proxy: + +- Set the proxy globally for your session (or set it in the profile file): + +```shell +export http_proxy=http://your.proxy.server:port +export https_proxy=http://your.proxy.server:port +``` + +- Set the proxy for just the current command: + +```shell +https_proxy=http://your.proxy.server:port huggingface-cli download <model_name> + +# or use vllm cmd directly +https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests +``` + +- Set the proxy in Python interpreter: + +```python +import os + +os.environ['http_proxy'] = 'http://your.proxy.server:port' +os.environ['https_proxy'] = 'http://your.proxy.server:port' +``` + +### ModelScope + +To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: + +```shell +export VLLM_USE_MODELSCOPE=True +``` + +And use with `trust_remote_code=True`. 
+ +```python +from vllm import LLM + +llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) + +# For generative models (task=generate) only +output = llm.generate("Hello, my name is") +print(output) + +# For pooling models (task={embed,classify,reward,score}) only +output = llm.encode("Hello, my name is") +print(output) +``` + +[](){ #feature-status-legend } + +## Feature Status Legend + +- ✅︎ indicates that the feature is supported for the model. + +- 🚧 indicates that the feature is planned but not yet supported for the model. + +- ⚠️ indicates that the feature is available but may have known issues or limitations. + +[](){ #supported-text-models } + +## List of Text-only Language Models + +### Generative Models + +See [this page][generative-models] for more information on how to use generative models. + +#### Text Generation + +Specified using `--task generate`. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------| +| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | +| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | ✅︎ | | +| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | +| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | | | +| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | ✅︎ | | +| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | +| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | +| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | +| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | ✅︎ | | +| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | ✅︎ | | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | ✅︎ | | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | ✅︎ | | +| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | +| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | ✅︎ | | +| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | ✅︎ | ✅︎ | +| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | +| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | +| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | +| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. 
| ✅︎ | ✅︎ | +| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | +| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | +| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | ✅︎ | | +| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | +| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | ✅︎ | | +| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | ✅︎ | | +| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | +| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | +| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | +| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | +| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | +| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | +| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | ✅︎ | | +| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | +| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | ✅︎ | | +| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | +| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | +| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | +| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | +| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | ✅︎ | | +| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. 
| ✅︎ | | +| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | ✅︎ | ✅︎ | +| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | | +| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | ✅︎ | | +| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | +| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | +| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | ✅︎ | | +| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | +| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | ✅︎ | | +| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | +| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | +| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | | +| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | +| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | | +| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | ✅︎ | | +| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | ✅︎ | | +| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | +| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | +| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | +| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | +| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | ✅︎ | | +| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | + +!!! note + Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. + +### Pooling Models + +See [this page](pooling-models) for more information on how to use pooling models. + +!!! warning + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +#### Text Embedding + +Specified using `--task embed`. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------| +| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | +| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | +| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. 
| ✅︎ | ✅︎ | +| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | ︎ | | +| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | ︎ | ︎ | +| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | ︎ | ︎ | +| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | ︎ | ︎ | +| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | +| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | + +!!! note + `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. + You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. + +!!! note + The HF implementation of `Alibaba-NLP/gte-Qwen2-1.5B-instruct` is hardcoded to use causal attention despite what is shown in `config.json`. To compare vLLM vs HF results, + you should set `--hf-overrides '{"is_causal": true}'` in vLLM so that the two implementations are consistent with each other. + + For both the 1.5B and 7B variants, you also need to enable `--trust-remote-code` for the correct tokenizer to be loaded. + See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). + +!!! note + `jinaai/jina-embeddings-v3` supports multiple tasks through lora, while vllm temporarily only supports text-matching tasks by merging lora weights. + +!!! note + The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture. + +If your model is not in the above list, we will try to automatically convert the model using +[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings +of the whole prompt are extracted from the normalized hidden state corresponding to the last token. + +#### Reward Modeling + +Specified using `--task reward`. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------| +| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | +| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | +| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | + +If your model is not in the above list, we will try to automatically convert the model using +[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. + +!!! warning + For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, + e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. + +#### Classification + +Specified using `--task classify`. 
+ +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------| +| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | + +If your model is not in the above list, we will try to automatically convert the model using +[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. + +#### Sentence Pair Scoring + +Specified using `--task score`. + +| Architecture | Models | Example HF Models | +|---------------------------------------|-------------------|----------------------------------------------| +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | + +[](){ #supported-mm-models } + +## List of Multimodal Language Models + +The following modalities are supported depending on the model: + +- **T**ext +- **I**mage +- **V**ideo +- **A**udio + +Any combination of modalities joined by `+` are supported. + +- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by `/` are mutually exclusive. + +- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + +See [this page][multimodal-inputs] on how to pass multi-modal inputs to the model. + +!!! warning + **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) + or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: + + Offline inference: + + ```python + from vllm import LLM + + llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, + ) + ``` + + Online serving: + + ```bash + vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' + ``` + + **This is no longer required if you are using vLLM V1.** + +!!! note + vLLM currently only supports adding LoRA to the language backbone of multimodal models. + +### Generative Models + +See [this page][generative-models] for more information on how to use generative models. + +#### Text Generation + +Specified using `--task generate`. + +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | +|----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| +| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | ✅︎ | ✅︎ | | +| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. 
| ✅︎ | ✅︎ | | +| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ | | +| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | ✅︎ | ✅︎ | | +| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | ✅︎ | ✅︎ | | +| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | | +| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | ✅︎ | ✅︎ | | +| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | +| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ | +| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | +| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | ✅︎ | ✅︎\* | | +| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | ✅︎ | | +| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | | +| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | ✅︎ | | | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | | +| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | ✅︎ | ✅︎ | | +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | ✅︎ | ✅︎ | | +| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | ✅︎ | ✅︎ | | +| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | ✅︎ | ✅︎ | | +| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | ✅︎ | ✅︎ | | +| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | +| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. 
| ✅︎ | ✅︎ | ✅︎ | +| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | ✅︎ | ✅︎ | | +| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | ✅︎ | | | +| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ⚠️ | | +| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | ✅︎ | ✅︎ | | +| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | | +| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | ✅︎ | ✅︎ | | +| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | ✅︎ | ✅︎ | | +| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎\* | | +| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | ✅︎ | ✅︎ | | +| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | ✅︎ | | + +<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM. +    • For example, to use DeepSeek-VL2 series models: +      `--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` +<sup>E</sup> Pre-computed embeddings can be inputted for this modality. +<sup>+</sup> Multiple items can be inputted per text prompt for this modality. + +!!! warning + Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs. + However, there are differences in how they handle text + image inputs: + + V0 correctly implements the model's attention pattern: + - Uses bidirectional attention between the image tokens corresponding to the same image + - Uses causal attention for other tokens + - Implemented via (naive) PyTorch SDPA with masking tensors + - Note: May use significant memory for long prompts with image + + V1 currently uses a simplified attention pattern: + - Uses causal attention for all tokens, including image tokens + - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}` + - Will be updated in the future to support the correct behavior + + This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. + +!!! note + `h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80. + +!!! 
note + To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. + +!!! warning + The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates. + + For the best results, we recommend using the following dependency versions (tested on A10 and L40): + + ```text + # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) + torch==2.5.1 + torchvision==0.20.1 + transformers==4.48.1 + tokenizers==0.21.0 + tiktoken==0.7.0 + vllm==0.7.0 + + # Optional but recommended for improved performance and stability + triton==3.1.0 + xformers==0.0.28.post3 + uvloop==0.21.0 + protobuf==5.29.3 + openai==1.60.2 + opencv-python-headless==4.11.0.86 + pillow==10.4.0 + + # Installed FlashAttention (for float16 only) + flash-attn>=2.5.6 # Not used in float32, but should be documented + ``` + + **Note:** Make sure you understand the security implications of using outdated packages. + +!!! note + The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. + For more details, please see: <gh-pr:4087#issuecomment-2250397630> + +!!! warning + Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. + +!!! note + To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via + `pip install git+https://github.com/huggingface/transformers.git`. + + Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1. + `--mm-processor-kwargs '{"use_audio_in_video": true}'`. + +### Pooling Models + +See [this page](pooling-models) for more information on how to use pooling models. + +!!! warning + Since some model architectures support both generative and pooling tasks, + you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. + +#### Text Embedding + +Specified using `--task embed`. + +Any text generation model can be converted into an embedding model by passing `--task embed`. + +!!! note + To get the best results, you should use pooling models that are specifically trained as such. + +The following table lists those that are tested in vLLM. + +| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------| +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | ✅︎ | | +| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | + +#### Transcription + +Specified using `--task transcription`. + +Speech2Text models trained specifically for Automatic Speech Recognition. + +| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | +|----------------|----------|---------------------|------------------------|-----------------------------| + +--- + +## Model Support Policy + +At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: + +1. 
**Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! + +2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. + + !!! tip + When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + +3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. + +4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. + +5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. + +Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. + +Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard. + +We have the following levels of testing for models: + +1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. +2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. 
This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test. +4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/performance/benchmarks.md b/docs/performance/benchmarks.md similarity index 86% rename from docs/source/performance/benchmarks.md rename to docs/performance/benchmarks.md index 39dc470a1c70..00505fc6f2a9 100644 --- a/docs/source/performance/benchmarks.md +++ b/docs/performance/benchmarks.md @@ -1,13 +1,14 @@ -(benchmarks)= - -# Benchmark Suites +--- +title: Benchmark Suites +--- +[](){ #benchmarks } vLLM contains two sets of benchmarks: -- [Performance benchmarks](#performance-benchmarks) -- [Nightly benchmarks](#nightly-benchmarks) +- [Performance benchmarks][performance-benchmarks] +- [Nightly benchmarks][nightly-benchmarks] -(performance-benchmarks)= +[](){ #performance-benchmarks } ## Performance Benchmarks @@ -17,7 +18,7 @@ The latest performance results are hosted on the public [vLLM Performance Dashbo More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). -(nightly-benchmarks)= +[](){ #nightly-benchmarks } ## Nightly Benchmarks diff --git a/docs/source/performance/optimization.md b/docs/performance/optimization.md similarity index 98% rename from docs/source/performance/optimization.md rename to docs/performance/optimization.md index 4160f0784962..57e01a384b52 100644 --- a/docs/source/performance/optimization.md +++ b/docs/performance/optimization.md @@ -1,6 +1,7 @@ -(optimization-and-tuning)= - -# Optimization and Tuning +--- +title: Optimization and Tuning +--- +[](){ #optimization-and-tuning } This guide covers optimization strategies and performance tuning for vLLM V1. @@ -26,7 +27,7 @@ You can monitor the number of preemption requests through Prometheus metrics exp In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture. -(chunked-prefill)= +[](){ #chunked-prefill } ## Chunked Prefill diff --git a/docs/source/serving/distributed_serving.md b/docs/serving/distributed_serving.md similarity index 73% rename from docs/source/serving/distributed_serving.md rename to docs/serving/distributed_serving.md index c285ef3e8e1c..259af5cabcb8 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/serving/distributed_serving.md @@ -1,6 +1,7 @@ -(distributed-serving)= - -# Distributed Inference and Serving +--- +title: Distributed Inference and Serving +--- +[](){ #distributed-serving } ## How to decide the distributed inference strategy? @@ -14,9 +15,8 @@ In short, you should increase the number of GPUs and the number of nodes until y After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. 
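+As a quick sanity check, the arithmetic described above can be scripted (a minimal sketch; the block count is whatever your own startup log reports):
+
+```python
+# Rough estimate of KV-cache capacity from the vLLM startup log.
+gpu_blocks = 790   # taken from a log line like "# GPU blocks: 790"
+block_size = 16    # vLLM's default KV-cache block size
+
+max_cached_tokens = gpu_blocks * block_size
+print(f"Roughly {max_cached_tokens} tokens can be cached across all requests")  # 12640
+```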
-:::{note} -There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. -::: +!!! note + There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. ## Running vLLM on a single node @@ -77,13 +77,11 @@ bash run_cluster.sh \ Then you get a ray cluster of **containers**. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses. -:::{warning} -It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties. -::: +!!! warning + It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties. -:::{warning} -Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`. -::: +!!! warning + Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`. Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. @@ -104,16 +102,13 @@ vllm serve /path/to/the/model/in/the/container \ To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. 
One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. -:::{warning} -After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information. -::: +!!! warning + After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script][troubleshooting-incorrect-hardware-driver] for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information. -:::{warning} -Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. +!!! warning + Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. -When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. -::: + When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. -:::{warning} -If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` and `ray list nodes` to see the IP address used by Ray. See <gh-issue:7815> for more information. -::: +!!! 
warning + If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` and `ray list nodes` to see the IP address used by Ray. See <gh-issue:7815> for more information. diff --git a/docs/serving/engine_args.md b/docs/serving/engine_args.md new file mode 100644 index 000000000000..fb2689a56391 --- /dev/null +++ b/docs/serving/engine_args.md @@ -0,0 +1,18 @@ +--- +title: Engine Arguments +--- +[](){ #engine-args } + +Engine arguments control the behavior of the vLLM engine. + +- For [offline inference][offline-inference], they are part of the arguments to [LLM][vllm.LLM] class. +- For [online serving][openai-compatible-server], they are part of the arguments to `vllm serve`. + +You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments. + +However, these classes are a combination of the configuration classes defined in [vllm.config][]. Therefore, we would recommend you read about them there where they are best documented. + +For offline inference you will have access to these configuration classes and for online serving you can cross-reference the configs with `vllm serve --help`, which has its arguments grouped by config. + +!!! note + Additional arguments are available to the [AsyncLLMEngine][vllm.engine.async_llm_engine.AsyncLLMEngine] which is used for online serving. These can be found by running `vllm serve --help` diff --git a/docs/serving/env_vars.md b/docs/serving/env_vars.md new file mode 100644 index 000000000000..f6d548a19d91 --- /dev/null +++ b/docs/serving/env_vars.md @@ -0,0 +1,12 @@ +# Environment Variables + +vLLM uses the following environment variables to configure the system: + +!!! warning + Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. + + All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). + +```python +--8<-- "vllm/envs.py:env-vars-definition" +``` diff --git a/docs/source/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md similarity index 93% rename from docs/source/serving/integrations/langchain.md rename to docs/serving/integrations/langchain.md index 03142d23b145..14ea6a044341 100644 --- a/docs/source/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -1,6 +1,7 @@ -(serving-langchain)= - -# LangChain +--- +title: LangChain +--- +[](){ #serving-langchain } vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) . 
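+For instance, here is a minimal sketch using the `VLLM` wrapper from `langchain_community` (the model choice and sampling values are only illustrative):
+
+```python
+from langchain_community.llms import VLLM
+
+# Runs vLLM in-process behind LangChain's standard LLM interface.
+llm = VLLM(
+    model="facebook/opt-125m",  # any generative model supported by vLLM
+    max_new_tokens=64,
+    temperature=0.8,
+)
+
+print(llm.invoke("What is the capital of France?"))
+```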
diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md similarity index 91% rename from docs/source/serving/integrations/llamaindex.md rename to docs/serving/integrations/llamaindex.md index 8c72605202cf..251b7155c556 100644 --- a/docs/source/serving/integrations/llamaindex.md +++ b/docs/serving/integrations/llamaindex.md @@ -1,6 +1,7 @@ -(serving-llamaindex)= - -# LlamaIndex +--- +title: LlamaIndex +--- +[](){ #serving-llamaindex } vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) . diff --git a/docs/source/serving/metrics.md b/docs/serving/metrics.md similarity index 90% rename from docs/source/serving/metrics.md rename to docs/serving/metrics.md index 647ece3f85f0..9ad7253184d9 100644 --- a/docs/source/serving/metrics.md +++ b/docs/serving/metrics.md @@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the system. These metrics are exposed via the `/metrics` endpoint on the vLLM OpenAI compatible API server. -You can start the server using Python, or using [Docker](#deployment-docker): +You can start the server using Python, or using [Docker][deployment-docker]: ```console vllm serve unsloth/Llama-3.2-1B-Instruct @@ -31,11 +31,9 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I The following metrics are exposed: -:::{literalinclude} ../../../vllm/engine/metrics.py -:end-before: end-metrics-definitions -:language: python -:start-after: begin-metrics-definitions -::: +```python +--8<-- "vllm/engine/metrics.py:metrics-definitions" +``` The following metrics are deprecated and due to be removed in a future version: diff --git a/docs/source/serving/offline_inference.md b/docs/serving/offline_inference.md similarity index 76% rename from docs/source/serving/offline_inference.md rename to docs/serving/offline_inference.md index 433d2e894dd8..584d7cd143bc 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/serving/offline_inference.md @@ -1,10 +1,11 @@ -(offline-inference)= - -# Offline Inference +--- +title: Offline Inference +--- +[](){ #offline-inference } You can run vLLM in your own code on a list of prompts. -The offline API is based on the {class}`~vllm.LLM` class. +The offline API is based on the [LLM][vllm.LLM] class. To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run. For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace @@ -19,23 +20,22 @@ llm = LLM(model="facebook/opt-125m") After initializing the `LLM` instance, you can perform model inference using various APIs. The available APIs depend on the type of model that is being run: -- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text. -- [Pooling models](#pooling-models) output their hidden states directly. +- [Generative models][generative-models] output logprobs which are sampled from to obtain the final output text. +- [Pooling models][pooling-models] output their hidden states directly. Please refer to the above pages for more details about each API. -:::{seealso} -[API Reference](#offline-inference-api) -::: +!!! info + [API Reference][offline-inference-api] -(configuration-options)= +[](){ #configuration-options } ## Configuration Options This section lists the most common options for running the vLLM engine. -For a full list, refer to the <project:#configuration> page. 
+For a full list, refer to the [configuration][configuration] page. -(model-resolution)= +[](){ #model-resolution } ### Model resolution @@ -59,9 +59,9 @@ model = LLM( ) ``` -Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM. +Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM. -(reducing-memory-usage)= +[](){ #reducing-memory-usage } ### Reducing memory usage @@ -80,18 +80,16 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` -:::{important} -To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`) -before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. +!!! warning + To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) + before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. -To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. -::: + To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. -:::{note} -With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). +!!! note + With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). -You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. -::: + You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. #### Quantization @@ -100,7 +98,7 @@ Quantized models take less memory at the cost of lower precision. Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI)) and used directly without extra configuration. -Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details. +Dynamic quantization is also supported via the `quantization` option -- see [here][quantization-index] for more details. #### Context length and batch size @@ -119,9 +117,8 @@ llm = LLM(model="adept/fuyu-8b", By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. -:::{important} -CUDA graph capture takes up more memory in V1 than in V0. -::: +!!! warning + CUDA graph capture takes up more memory in V1 than in V0. 
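+If that extra memory is a concern, one simple (if slower) workaround is to skip CUDA graph capture entirely by falling back to eager execution, as in this minimal sketch:
+
+```python
+from vllm import LLM
+
+# enforce_eager=True disables CUDA graph capture, freeing the memory it
+# would otherwise reserve, at the cost of some inference speed.
+llm = LLM(model="adept/fuyu-8b", enforce_eager=True)
+```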
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: @@ -214,4 +211,4 @@ llm = LLM(model="OpenGVLab/InternVL2-2B", ### Performance optimization and tuning You can potentially improve the performance of vLLM by finetuning various options. -Please refer to [this guide](#optimization-and-tuning) for more details. +Please refer to [this guide][optimization-and-tuning] for more details. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md similarity index 61% rename from docs/source/serving/openai_compatible_server.md rename to docs/serving/openai_compatible_server.md index 61f7e98bf108..27cb9310c516 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -1,10 +1,11 @@ -(openai-compatible-server)= - -# OpenAI-Compatible Server +--- +title: OpenAI-Compatible Server +--- +[](){ #openai-compatible-server } vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client. -In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`](#serve-args) command. (You can also use our [Docker](#deployment-docker) image.) +In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.) ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 @@ -20,58 +21,56 @@ client = OpenAI( ) completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Hello!"} - ] + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Hello!"} + ] ) print(completion.choices[0].message) ``` -:::{tip} -vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. -You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. -::: +!!! tip + vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. + You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`. -:::{important} -By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. +!!! warning + By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator. -To disable this behavior, please pass `--generation-config vllm` when launching the server. -::: + To disable this behavior, please pass `--generation-config vllm` when launching the server. ## Supported APIs We currently support the following OpenAI APIs: -- [Completions API](#completions-api) (`/v1/completions`) - - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). 
- - *Note: `suffix` parameter is not supported.* -- [Chat Completions API](#chat-api) (`/v1/chat/completions`) - - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template). - - *Note: `parallel_tool_calls` and `user` parameters are ignored.* -- [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). -- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`) - - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`). +- [Completions API][completions-api] (`/v1/completions`) + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API][chat-api] (`/v1/chat/completions`) + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template][chat-template]. + - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API][embeddings-api] (`/v1/embeddings`) + - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). +- [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`) + - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`). In addition, we have the following custom APIs: -- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - - Applicable to any model with a tokenizer. -- [Pooling API](#pooling-api) (`/pooling`) - - Applicable to all [pooling models](../models/pooling_models.md). -- [Classification API](#classification-api) (`/classify`) - - Only applicable to [classification models](../models/pooling_models.md) (`--task classify`). -- [Score API](#score-api) (`/score`) - - Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`). -- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`) - - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) - - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) - - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. - - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). - -(chat-template)= +- [Tokenizer API][tokenizer-api] (`/tokenize`, `/detokenize`) + - Applicable to any model with a tokenizer. +- [Pooling API][pooling-api] (`/pooling`) + - Applicable to all [pooling models](../models/pooling_models.md). +- [Classification API][classification-api] (`/classify`) + - Only applicable to [classification models](../models/pooling_models.md) (`--task classify`). +- [Score API][score-api] (`/score`) + - Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`). +- [Re-rank API][rerank-api] (`/rerank`, `/v1/rerank`, `/v2/rerank`) + - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) + - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) + - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). + +[](){ #chat-template } ## Chat Template @@ -97,10 +96,10 @@ both a `type` and a `text` field. 
An example is provided below: ```python completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} - ] + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} + ] ) ``` @@ -111,9 +110,9 @@ request. vLLM provides best-effort support to detect this automatically, which i the detected format, which can be one of: - `"string"`: A string. - - Example: `"Hello world"` + - Example: `"Hello world"` - `"openai"`: A list of dictionaries, similar to OpenAI schema. - - Example: `[{"type": "text", "text": "Hello world!"}]` + - Example: `[{"type": "text", "text": "Hello world!"}]` If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument to override which format to use. @@ -126,13 +125,13 @@ Or directly merge them into the JSON payload if you are using HTTP call directly ```python completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_body={ - "guided_choice": ["positive", "negative"] - } + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={ + "guided_choice": ["positive", "negative"] + } ) ``` @@ -148,29 +147,29 @@ with `--enable-request-id-headers`. ```python completion = client.chat.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_headers={ - "x-request-id": "sentiment-classification-00001", - } + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_headers={ + "x-request-id": "sentiment-classification-00001", + } ) print(completion._request_id) completion = client.completions.create( - model="NousResearch/Meta-Llama-3-8B-Instruct", - prompt="A robot may not injure a human being", - extra_headers={ - "x-request-id": "completion-test", - } + model="NousResearch/Meta-Llama-3-8B-Instruct", + prompt="A robot may not injure a human being", + extra_headers={ + "x-request-id": "completion-test", + } ) print(completion._request_id) ``` ## API Reference -(completions-api)= +[](){ #completions-api } ### Completions API @@ -181,23 +180,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py> #### Extra parameters -The following [sampling parameters](#sampling-params) are supported. +The following [sampling parameters][sampling-params] are supported. 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-completion-sampling-params -:end-before: end-completion-sampling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-completion-extra-params -:end-before: end-completion-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params" +``` -(chat-api)= +[](){ #chat-api } ### Chat API @@ -206,37 +201,33 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](#multimodal-inputs) guide for more information. +see our [Multimodal Inputs][multimodal-inputs] guide for more information. - *Note: `image_url.detail` parameter is not supported.* Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py> #### Extra parameters -The following [sampling parameters](#sampling-params) are supported. +The following [sampling parameters][sampling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-chat-completion-sampling-params -:end-before: end-chat-completion-sampling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-chat-completion-extra-params -:end-before: end-chat-completion-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" +``` -(embeddings-api)= +[](){ #embeddings-api } ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) +If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api]) which will be treated as a single prompt to the model. Code example: <gh-file:examples/online_serving/openai_embedding_client.py> @@ -246,138 +237,117 @@ Code example: <gh-file:examples/online_serving/openai_embedding_client.py> You can pass multi-modal inputs to embedding models by defining a custom chat template for the server and passing a list of `messages` in the request. Refer to the examples below for illustration. -:::::{tab-set} -::::{tab-item} VLM2Vec - -To serve the model: +=== "VLM2Vec" -```bash -vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ - --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja -``` + To serve the model: -:::{important} -Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` -to run this model in embedding mode instead of text generation mode. 
+ ```bash + vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja + ``` -The custom chat template is completely different from the original one for this model, -and can be found here: <gh-file:examples/template_vlm2vec.jinja> -::: + !!! warning + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` + to run this model in embedding mode instead of text generation mode. -Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: + The custom chat template is completely different from the original one for this model, + and can be found here: <gh-file:examples/template_vlm2vec.jinja> -```python -import requests - -image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - -response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - }], - "encoding_format": "float", - }, -) -response.raise_for_status() -response_json = response.json() -print("Embedding output:", response_json["data"][0]["embedding"]) -``` + Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: -:::: + ```python + import requests -::::{tab-item} DSE-Qwen2-MRL + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -To serve the model: + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) + ``` -```bash -vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ - --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja -``` +=== "DSE-Qwen2-MRL" -:::{important} -Like with VLM2Vec, we have to explicitly pass `--task embed`. + To serve the model: -Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled -by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja> -::: + ```bash + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + ``` -:::{important} -`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code -example below for details. -::: + !!! warning + Like with VLM2Vec, we have to explicitly pass `--task embed`. -:::: + Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled + by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja> -::::: + !!! warning + `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. 
See the full code + example below for details. Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py> #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-embedding-pooling-params -:end-before: end-embedding-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params" +``` The following extra parameters are supported by default: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-embedding-extra-params -:end-before: end-embedding-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" +``` For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-chat-embedding-extra-params -:end-before: end-chat-embedding-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" +``` -(transcriptions-api)= +[](){ #transcriptions-api } ### Transcriptions API Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -:::{note} -To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`. -::: +!!! note + To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`. Code example: <gh-file:examples/online_serving/openai_transcription_client.py> <!-- TODO: api enforced limits + uploading audios --> #### Extra Parameters -The following [sampling parameters](#sampling-params) are supported. +The following [sampling parameters][sampling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-transcription-sampling-params -:end-before: end-transcription-sampling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-transcription-extra-params -:end-before: end-transcription-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" +``` -(tokenizer-api)= +[](){ #tokenizer-api } ### Tokenizer API @@ -387,17 +357,17 @@ It consists of two endpoints: - `/tokenize` corresponds to calling `tokenizer.encode()`. - `/detokenize` corresponds to calling `tokenizer.decode()`. -(pooling-api)= +[](){ #pooling-api } ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. -The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. +The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats. 
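+As a quick illustration, a raw HTTP request to the Pooling API looks just like an embeddings request (a sketch using the `requests` library; the model name is a placeholder for whatever pooling model you are serving):
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:8000/pooling",
+    json={
+        "model": "intfloat/e5-mistral-7b-instruct",  # placeholder
+        "input": "vLLM is great!",
+    },
+)
+response.raise_for_status()
+
+# The returned data may be an arbitrarily nested list of floats.
+print(response.json())
+```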
Code example: <gh-file:examples/online_serving/openai_pooling_client.py> -(classification-api)= +[](){ #classification-api } ### Classification API @@ -505,23 +475,19 @@ Response: #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-classification-pooling-params -:end-before: end-classification-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-classification-extra-params -:end-before: end-classification-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params" +``` -(score-api)= +[](){ #score-api } ### Score API @@ -668,23 +634,19 @@ Response: #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-score-pooling-params -:end-before: end-score-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-score-extra-params -:end-before: end-score-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params" +``` -(rerank-api)= +[](){ #rerank-api } ### Re-rank API @@ -755,18 +717,14 @@ Response: #### Extra parameters -The following [pooling parameters](#pooling-params) are supported. +The following [pooling parameters][pooling-params] are supported. -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-rerank-pooling-params -:end-before: end-rerank-pooling-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params" +``` The following extra parameters are supported: -:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py -:language: python -:start-after: begin-rerank-extra-params -:end-before: end-rerank-extra-params -::: +```python +--8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params" +``` diff --git a/docs/serving/serve_args.md b/docs/serving/serve_args.md new file mode 100644 index 000000000000..16b4b29f45d9 --- /dev/null +++ b/docs/serving/serve_args.md @@ -0,0 +1,38 @@ +--- +title: Server Arguments +--- +[](){ #serve-args } + +The `vllm serve` command is used to launch the OpenAI-compatible server. + +## CLI Arguments + +The `vllm serve` command is used to launch the OpenAI-compatible server. +To see the available CLI arguments, run `vllm serve --help`! + +## Configuration file + +You can load CLI arguments via a [YAML](https://yaml.org/) config file. +The argument names must be the long form of those outlined [above][serve-args]. + +For example: + +```yaml +# config.yaml + +model: meta-llama/Llama-3.1-8B-Instruct +host: "127.0.0.1" +port: 6379 +uvicorn-log-level: "info" +``` + +To use the above config file: + +```bash +vllm serve --config config.yaml +``` + +!!! 
note + In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. + The order of priorities is `command line > config file values > defaults`. + e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. diff --git a/docs/source/serving/usage_stats.md b/docs/serving/usage_stats.md similarity index 100% rename from docs/source/serving/usage_stats.md rename to docs/serving/usage_stats.md diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css deleted file mode 100644 index 79bd2082b49e..000000000000 --- a/docs/source/_static/custom.css +++ /dev/null @@ -1,8 +0,0 @@ -.vertical-table-header th.head:not(.stub) { - writing-mode: sideways-lr; - white-space: nowrap; - max-width: 0; - p { - margin: 0; - } -} diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html deleted file mode 100644 index 7174431b1027..000000000000 --- a/docs/source/_templates/sections/header.html +++ /dev/null @@ -1,39 +0,0 @@ -<style> - .notification-bar { - width: 100vw; - display: flex; - justify-content: center; - align-items: center; - font-size: 16px; - padding: 0 6px 0 6px; - } - .notification-bar p { - margin: 0; - } - .notification-bar a { - font-weight: bold; - text-decoration: none; - } - - /* Light mode styles (default) */ - .notification-bar { - background-color: #fff3cd; - color: #856404; - } - .notification-bar a { - color: #d97706; - } - - /* Dark mode styles */ - html[data-theme=dark] .notification-bar { - background-color: #333; - color: #ddd; - } - html[data-theme=dark] .notification-bar a { - color: #ffa500; /* Brighter color for visibility */ - } -</style> - -<div class="notification-bar"> - <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p> -</div> diff --git a/docs/source/api/summary.md b/docs/source/api/summary.md deleted file mode 100644 index 46de545f9ded..000000000000 --- a/docs/source/api/summary.md +++ /dev/null @@ -1,133 +0,0 @@ -# Summary - -(configuration)= - -## Configuration - -API documentation for vLLM's configuration classes. - -```{autodoc2-summary} - vllm.config.ModelConfig - vllm.config.CacheConfig - vllm.config.TokenizerPoolConfig - vllm.config.LoadConfig - vllm.config.ParallelConfig - vllm.config.SchedulerConfig - vllm.config.DeviceConfig - vllm.config.SpeculativeConfig - vllm.config.LoRAConfig - vllm.config.PromptAdapterConfig - vllm.config.MultiModalConfig - vllm.config.PoolerConfig - vllm.config.DecodingConfig - vllm.config.ObservabilityConfig - vllm.config.KVTransferConfig - vllm.config.CompilationConfig - vllm.config.VllmConfig -``` - -(offline-inference-api)= - -## Offline Inference - -LLM Class. - -```{autodoc2-summary} - vllm.LLM -``` - -LLM Inputs. - -```{autodoc2-summary} - vllm.inputs.PromptType - vllm.inputs.TextPrompt - vllm.inputs.TokensPrompt -``` - -## vLLM Engines - -Engine classes for offline and online inference. - -```{autodoc2-summary} - vllm.LLMEngine - vllm.AsyncLLMEngine -``` - -## Inference Parameters - -Inference parameters for vLLM APIs. - -(sampling-params)= -(pooling-params)= - -```{autodoc2-summary} - vllm.SamplingParams - vllm.PoolingParams -``` - -(multi-modality)= - -## Multi-Modality - -vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. 
- -Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) -via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. - -Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal). - -```{autodoc2-summary} - vllm.multimodal.MULTIMODAL_REGISTRY -``` - -### Inputs - -User-facing inputs. - -```{autodoc2-summary} - vllm.multimodal.inputs.MultiModalDataDict -``` - -Internal data structures. - -```{autodoc2-summary} - vllm.multimodal.inputs.PlaceholderRange - vllm.multimodal.inputs.NestedTensors - vllm.multimodal.inputs.MultiModalFieldElem - vllm.multimodal.inputs.MultiModalFieldConfig - vllm.multimodal.inputs.MultiModalKwargsItem - vllm.multimodal.inputs.MultiModalKwargs - vllm.multimodal.inputs.MultiModalInputs -``` - -### Data Parsing - -```{autodoc2-summary} - vllm.multimodal.parse -``` - -### Data Processing - -```{autodoc2-summary} - vllm.multimodal.processing -``` - -### Memory Profiling - -```{autodoc2-summary} - vllm.multimodal.profiling -``` - -### Registry - -```{autodoc2-summary} - vllm.multimodal.registry -``` - -## Model Development - -```{autodoc2-summary} - vllm.model_executor.models.interfaces_base - vllm.model_executor.models.interfaces - vllm.model_executor.models.adapters -``` diff --git a/docs/source/autodoc2_docstring_parser.py b/docs/source/autodoc2_docstring_parser.py deleted file mode 100644 index 41c49ed1c545..000000000000 --- a/docs/source/autodoc2_docstring_parser.py +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -from docutils import nodes -from myst_parser.parsers.sphinx_ import MystParser -from sphinx.ext.napoleon import docstring - - -class NapoleonParser(MystParser): - - def parse(self, input_string: str, document: nodes.document) -> None: - # Get the Sphinx configuration - config = document.settings.env.config - - parsed_content = str( - docstring.GoogleDocstring( - str(docstring.NumpyDocstring(input_string, config)), - config, - )) - return super().parse(parsed_content, document) - - -Parser = NapoleonParser diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md deleted file mode 100644 index e8030edfa02e..000000000000 --- a/docs/source/community/blog.md +++ /dev/null @@ -1,3 +0,0 @@ -# vLLM Blog - -vLLM blog posts are published [here](https://blog.vllm.ai/). diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 5620d6de2c59..000000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,263 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
- -import datetime -import logging -import os -import re -import sys -from pathlib import Path - -import requests - -logger = logging.getLogger(__name__) -REPO_ROOT = Path(__file__).resolve().parent.parent.parent -sys.path.append(os.path.abspath(REPO_ROOT)) - -# -- Project information ----------------------------------------------------- - -project = 'vLLM' -copyright = f'{datetime.datetime.now().year}, vLLM Team' -author = 'the vLLM Team' - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.napoleon", - "sphinx.ext.linkcode", - "sphinx.ext.intersphinx", - "sphinx_copybutton", - "autodoc2", - "myst_parser", - "sphinxarg.ext", - "sphinx_design", - "sphinx_togglebutton", -] -myst_enable_extensions = [ - "colon_fence", - "fieldlist", -] -autodoc2_packages = [ - { - "path": "../../vllm", - "exclude_dirs": ["__pycache__", "third_party"], - }, -] -autodoc2_output_dir = "api" -autodoc2_render_plugin = "myst" -autodoc2_hidden_objects = ["dunder", "private", "inherited"] -autodoc2_sort_names = True -autodoc2_index_template = None - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"] - -# Exclude the prompt "$" when copying code -copybutton_prompt_text = r"\$ " -copybutton_prompt_is_regexp = True - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_title = project -html_theme = 'sphinx_book_theme' -html_logo = 'assets/logos/vllm-logo-text-light.png' -html_favicon = 'assets/logos/vllm-logo-only-light.ico' -html_theme_options = { - 'path_to_docs': 'docs/source', - 'repository_url': 'https://github.com/vllm-project/vllm', - 'use_repository_button': True, - 'use_edit_page_button': True, - # Prevents the full API being added to the left sidebar of every page. - # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB. - 'collapse_navbar': True, - # Makes API visible in the right sidebar on API reference pages. - 'show_toc_level': 3, -} -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ["_static"] -html_js_files = ["custom.js"] -html_css_files = ["custom.css"] - -myst_heading_anchors = 2 -myst_url_schemes = { - 'http': None, - 'https': None, - 'mailto': None, - 'ftp': None, - "gh-issue": { - "url": - "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}", - "title": "Issue #{{path}}", - "classes": ["github"], - }, - "gh-pr": { - "url": - "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}", - "title": "Pull Request #{{path}}", - "classes": ["github"], - }, - "gh-project": { - "url": "https://github.com/orgs/vllm-project/projects/{{path}}", - "title": "Project #{{path}}", - "classes": ["github"], - }, - "gh-dir": { - "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}", - "title": "{{path}}", - "classes": ["github"], - }, - "gh-file": { - "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}", - "title": "{{path}}", - "classes": ["github"], - }, -} - -# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa -READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') -if READTHEDOCS_VERSION_TYPE == "tag": - # remove the warning banner if the version is a tagged release - header_file = os.path.join(os.path.dirname(__file__), - "_templates/sections/header.html") - # The file might be removed already if the build is triggered multiple times - # (readthedocs build both HTML and PDF versions separately) - if os.path.exists(header_file): - os.remove(header_file) - - -# Generate additional rst documentation here. -def setup(app): - from docs.source.generate_examples import generate_examples - generate_examples() - - -_cached_base: str = "" -_cached_branch: str = "" - - -def get_repo_base_and_branch(pr_number): - global _cached_base, _cached_branch - if _cached_base and _cached_branch: - return _cached_base, _cached_branch - - url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}" - response = requests.get(url) - if response.status_code == 200: - data = response.json() - _cached_base = data['head']['repo']['full_name'] - _cached_branch = data['head']['ref'] - return _cached_base, _cached_branch - else: - logger.error("Failed to fetch PR details: %s", response) - return None, None - - -def linkcode_resolve(domain, info): - if domain != 'py': - return None - if not info['module']: - return None - - # Get path from module name - file = Path(f"{info['module'].replace('.', '/')}.py") - path = REPO_ROOT / file - if not path.exists(): - path = REPO_ROOT / file.with_suffix("") / "__init__.py" - if not path.exists(): - return None - - # Get the line number of the object - with open(path) as f: - lines = f.readlines() - name = info['fullname'].split(".")[-1] - pattern = fr"^( {{4}})*((def|class) )?{name}\b.*" - for lineno, line in enumerate(lines, 1): - if not line or line.startswith("#"): - continue - if re.match(pattern, line): - break - - # If the line number is not found, return None - if lineno == len(lines): - return None - - # If the line number is found, create the URL - filename = path.relative_to(REPO_ROOT) - if "checkouts" in path.parts: - # a PR build on readthedocs - pr_number = REPO_ROOT.name - base, branch = get_repo_base_and_branch(pr_number) - if base and branch: - return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}" - # Otherwise, link to the source file on the main branch - return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}" - - -# Mock out external dependencies here, otherwise sphinx-argparse won't 
work. -autodoc_mock_imports = [ - "huggingface_hub", - "pydantic", - "zmq", - "cloudpickle", - "aiohttp", - "starlette", - "blake3", - "cpuinfo", - "transformers", - "psutil", - "vllm._C", - "PIL", - "numpy", - "tqdm", - # The mocks below are required by - # docs/source/serving/openai_compatible_server.md's - # vllm.entrypoints.openai.cli_args - "openai", - "fastapi", - "partial_json_parser", -] - -for mock_target in autodoc_mock_imports: - if mock_target in sys.modules: - logger.info( - "Potentially problematic mock target (%s) found; " - "autodoc_mock_imports cannot mock modules that have already " - "been loaded into sys.modules when the sphinx build starts.", - mock_target) - -intersphinx_mapping = { - "python": ("https://docs.python.org/3", None), - "typing_extensions": - ("https://typing-extensions.readthedocs.io/en/latest", None), - "aiohttp": ("https://docs.aiohttp.org/en/stable", None), - "pillow": ("https://pillow.readthedocs.io/en/stable", None), - "numpy": ("https://numpy.org/doc/stable", None), - "torch": ("https://pytorch.org/docs/stable", None), - "psutil": ("https://psutil.readthedocs.io/en/stable", None), -} - -navigation_with_keys = False diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md deleted file mode 100644 index 721ee3cd2047..000000000000 --- a/docs/source/contributing/model/index.md +++ /dev/null @@ -1,27 +0,0 @@ -(new-model)= - -# Adding a New Model - -This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. - -:::{toctree} -:caption: Contents -:maxdepth: 1 - -basic -registration -tests -multimodal -::: - -:::{note} -The complexity of adding a new model depends heavily on the model's architecture. -The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. -However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. -::: - -:::{tip} -If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) -or ask on our [developer slack](https://slack.vllm.ai). -We will be happy to help you out! -::: diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md deleted file mode 100644 index b42536f054d7..000000000000 --- a/docs/source/contributing/model/multimodal.md +++ /dev/null @@ -1,834 +0,0 @@ -(supports-multimodal)= - -# Multi-Modal Support - -This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). - -## 1. Update the base vLLM model - -It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic). -Further update the model as follows: - -- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example: - - ```diff - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - ``` - - More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it. 
- -- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. - - ```python - class YourModelForImage2Seq(nn.Module): - ... - - def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: - - assert self.vision_encoder is not None - image_features = self.vision_encoder(image_input) - return self.multi_modal_projector(image_features) - - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - - # Validate the multimodal input keyword arguments - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return None - - # Run multimodal inputs through encoder and projector - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - ``` - - :::{important} - The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. - ::: - -- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. - - ```python - from .utils import merge_multimodal_embeddings - - class YourModelForImage2Seq(nn.Module): - ... - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - - # `get_input_embeddings` should already be implemented for the language - # model as one of the requirements of basic vLLM model implementation. - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=self.config.image_token_index) - - return inputs_embeds - ``` - -- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model. - - ```python - class YourModelForImage2Seq(nn.Module): - ... - - def get_language_model(self) -> torch.nn.Module: - # Change `language_model` according to your implementation. - return self.language_model - ``` - -- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - - ```diff - + from vllm.model_executor.models.interfaces import SupportsMultiModal - - - class YourModelForImage2Seq(nn.Module): - + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - ``` - - :::{note} - The model class does not have to be named {code}`*ForCausalLM`. - Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. - ::: - -## 2. 
Specify processing information - -Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo` -to provide basic information related to HF processing. - -### Maximum number of input items - -You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits` -to return the maximum number of input items for each modality supported by the model. - -For example, if the model supports any number of images but only one video per prompt: - -```python -def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": 1} -``` - -## 3. Specify dummy inputs - -Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for -HF processing as well as memory profiling. - -### For memory profiling - -Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it. - -Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. - -::::{tab-set} -:::{tab-item} Basic example: LLaVA -:sync: llava - -Looking at the code of HF's `LlavaForConditionalGeneration`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 -n_image_tokens = (input_ids == self.config.image_token_index).sum().item() -n_image_features = image_features.shape[0] * image_features.shape[1] - -if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) -special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) -) -image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) -inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) -``` - -The number of placeholder feature tokens per image is `image_features.shape[1]`. -`image_features` is calculated inside the `get_image_features` method: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 -image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - -selected_image_feature = image_outputs.hidden_states[vision_feature_layer] -if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] -elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature -else: - raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") -image_features = self.multi_modal_projector(selected_image_feature) -return image_features -``` - -We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower -(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). -Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`. 
-The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention -mechanism doesn't change the sequence length of the output hidden states. - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102 -hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) -hidden_states = self.pre_layrnorm(hidden_states) - -encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, -) -``` - -To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257 -target_dtype = self.patch_embedding.weight.dtype -patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] -patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - -class_embeds = self.class_embedding.expand(batch_size, 1, -1) -embeddings = torch.cat([class_embeds, patch_embeds], dim=1) -if interpolate_pos_encoding: - embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) -else: - embeddings = embeddings + self.position_embedding(self.position_ids) -return embeddings -``` - -We can infer that `embeddings.shape[1] == self.num_positions`, where - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196 -self.num_patches = (self.image_size // self.patch_size) ** 2 -self.num_positions = self.num_patches + 1 -``` - -Overall, the number of placeholder feature tokens for an image can be calculated as: - -```python -def get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, -) -> int: - hf_config = self.get_hf_config() - hf_processor = self.get_hf_processor() - - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - - num_image_tokens = (image_size // patch_size) ** 2 + 1 - if hf_processor.vision_feature_select_strategy == "default": - num_image_tokens -= 1 - - return num_image_tokens -``` - -Notice that the number of image tokens doesn't depend on the image width and height. -We can simply use a dummy `image_size` to calculate the multimodal profiling data: - -```python -# NOTE: In actuality, this is usually implemented as part of the -# model's subclass of `BaseProcessingInfo`, but we show it as is -# here for simplicity. -def get_image_size_with_most_features(self) -> ImageSize: - hf_config = self.get_hf_config() - width = height = hf_config.image_size - return ImageSize(width=width, height=height) - -def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> MultiModalDataDict: - num_images = mm_counts.get("image", 0) - - target_width, target_height = \ - self.info.get_image_size_with_most_features() - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } -``` - -For the text, we simply expand the multimodal image token from the model config to match the desired number of images. 
- -```python -def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - num_images = mm_counts.get("image", 0) - - processor = self.info.get_hf_processor() - image_token = processor.image_token - - return image_token * num_images -``` - -::: - -:::{tab-item} No input placeholders: Fuyu -:sync: fuyu - -Looking at the code of HF's `FuyuForCausalLM`: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322 -if image_patches is not None and past_key_values is None: - patch_embeddings = [ - self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)) - .squeeze(0) - .to(inputs_embeds.device) - for patch in image_patches - ] - inputs_embeds = self.gather_continuous_embeddings( - word_embeddings=inputs_embeds, - continuous_embeddings=patch_embeddings, - image_patch_input_indices=image_patches_indices, - ) -``` - -The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`, -which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`. - -Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information? -Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**. - -The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then -`FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`. - -In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`, -returning the dimensions after resizing (but before padding) as metadata. - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544 -image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"]) -batch_images = image_encoding["images"] -image_unpadded_heights = image_encoding["image_unpadded_heights"] -image_unpadded_widths = image_encoding["image_unpadded_widths"] - -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L -if do_resize: - batch_images = [ - [self.resize(image, size=size, input_data_format=input_data_format) for image in images] - for images in batch_images - ] - -image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] -image_unpadded_heights = [[image_size[0]] for image_size in image_sizes] -image_unpadded_widths = [[image_size[1]] for image_size in image_sizes] - -if do_pad: - batch_images = [ - [ - self.pad_image( - image, - size=size, - mode=padding_mode, - constant_values=padding_value, - input_data_format=input_data_format, - ) - for image in images - ] - for images in batch_images - ] -``` - -In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425 -model_image_input = self.image_processor.preprocess_with_tokenizer_info( - image_input=tensor_batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=image_placeholder_id, - image_newline_id=image_newline_id, - variable_sized=True, -) - -# 
https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658 -image_height, image_width = image.shape[1], image.shape[2] -if variable_sized: # variable_sized=True - new_h = min( - image_height, - math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height, - ) - new_w = min( - image_width, - math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width, - ) - image = image[:, :new_h, :new_w] - image_height, image_width = new_h, new_w - -num_patches = self.get_num_patches(image_height=image_height, image_width=image_width) -tensor_of_image_ids = torch.full( - [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device -) -patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0) -assert num_patches == patches.shape[0] -``` - -The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562 -patch_size = patch_size if patch_size is not None else self.patch_size -patch_height, patch_width = self.patch_size["height"], self.patch_size["width"] - -if image_height % patch_height != 0: - raise ValueError(f"{image_height=} must be divisible by {patch_height}") -if image_width % patch_width != 0: - raise ValueError(f"{image_width=} must be divisible by {patch_width}") - -num_patches_per_dim_h = image_height // patch_height -num_patches_per_dim_w = image_width // patch_width -num_patches = num_patches_per_dim_h * num_patches_per_dim_w -``` - -These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized -to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`. - -```python -def get_image_size_with_most_features(self) -> ImageSize: - image_processor = self.get_image_processor() - return ImageSize(width=image_processor.size["width"], - height=image_processor.size["height"]) -``` - -Fuyu does not expect image placeholders in the inputs to HF processor, so -the dummy prompt text is empty regardless of the number of images. - -```python -def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - return "" -``` - -For the multimodal image profiling data, the logic is very similar to LLaVA: - -```python -def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], -) -> MultiModalDataDict: - target_width, target_height = \ - self.info.get_image_size_with_most_features() - num_images = mm_counts.get("image", 0) - - return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } -``` - -::: - -:::: - -## 4. Specify processing details - -Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` -to fill in the missing details about HF processing. - -:::{seealso} -[Multi-Modal Data Processing](#mm-processing) -::: - -### Multi-modal fields - -Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to -return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. 
- -:::::{tab-set} -::::{tab-item} Basic example: LLaVA -:sync: llava - -The output of `CLIPImageProcessor` is a simple tensor with shape -`(num_images, num_channels, image_height, image_width)`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345 -images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in all_images -] - -data = {"pixel_values": images} -return BatchFeature(data=data, tensor_type=return_tensors) -``` - -So, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows: - -```python -def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], -) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - ) -``` - -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports -pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. -::: - -:::: - -::::{tab-item} With postprocessing: Fuyu -:sync: fuyu - -The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates -the patches from each image belonging to an item in the batch: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679 - image_input_ids.append(tensor_of_image_ids) - image_patches.append(patches) - else: - image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device)) - -batch_image_input_ids.append(image_input_ids) -batch_image_patches.append(image_patches) -``` - -The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore -`(1, num_images, num_patches, patch_width * patch_height * num_channels)`. - -In order to support the use of {func}`MultiModalFieldConfig.batched` like in LLaVA, -we remove the extra batch dimension by overriding {meth}`BaseMultiModalProcessor._call_hf_processor`: - -```python -def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], -) -> BatchFeature: - processed_outputs = super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - - image_patches = processed_outputs.get("image_patches") - if image_patches is not None: - images = mm_data["images"] - assert isinstance(images, list) - - # Original output: (1, num_images, Pn, Px * Py * C) - # New output: (num_images, Pn, Px * Py * C) - assert (isinstance(image_patches, list) - and len(image_patches) == 1) - assert (isinstance(image_patches[0], torch.Tensor) - and len(image_patches[0]) == len(images)) - - processed_outputs["image_patches"] = image_patches[0] - - return processed_outputs -``` - -:::{note} -Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling -for text-only inputs to prevent unnecessary warnings from HF processor. 
-::: - -This lets us override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows: - -```python -def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], -) -> Mapping[str, MultiModalFieldConfig]: - return dict(image_patches=MultiModalFieldConfig.batched("image")) -``` - -:::: - -::::: - -### Prompt updates - -Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to -return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances. - -Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation -(e.g.: insertion, replacement) performed by the HF processor. - -::::{tab-set} -:::{tab-item} Basic example: LLaVA -:sync: llava - -Looking at HF's `LlavaProcessor`: - -```python -# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170 -prompt_strings = [] -for sample in text: - sample = sample.replace(self.image_token, self.image_token * num_image_tokens) - prompt_strings.append(sample) -``` - -It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`). -Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows: - -```python -def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, -) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - image_token_id = hf_config.image_token_index - - def get_replacement(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - - image_size = images.get_image_size(item_idx) - num_image_tokens = self.info.get_num_image_tokens( - image_width=image_size.width, - image_height=image_size.height, - ) - - return [image_token_id] * num_image_tokens - - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement, - ), - ] -``` - -::: - -:::{tab-item} Handling additional tokens: Fuyu -:sync: fuyu - -Recall the layout of feature tokens from Step 2: - -``` -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -... 
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE| -``` - -We define a helper function to return `ncols` and `nrows` directly: - -```python -def get_image_feature_grid_size( - self, - *, - image_width: int, - image_height: int, -) -> tuple[int, int]: - image_processor = self.get_image_processor() - target_width = image_processor.size["width"] - target_height = image_processor.size["height"] - patch_width = image_processor.patch_size["width"] - patch_height = image_processor.patch_size["height"] - - if not (image_width <= target_width and image_height <= target_height): - height_scale_factor = target_height / image_height - width_scale_factor = target_width / image_width - optimal_scale_factor = min(height_scale_factor, width_scale_factor) - - image_height = int(image_height * optimal_scale_factor) - image_width = int(image_width * optimal_scale_factor) - - ncols = math.ceil(image_width / patch_width) - nrows = math.ceil(image_height / patch_height) - return ncols, nrows -``` - -Based on this, we can initially define our replacement tokens as: - -```python -def get_replacement(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, - ) - - # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|` - # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|` - return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows -``` - -However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called, -a BOS token (`<s>`) is also added to the promopt: - -```python -# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435 -model_image_input = self.image_processor.preprocess_with_tokenizer_info( - image_input=tensor_batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=image_placeholder_id, - image_newline_id=image_newline_id, - variable_sized=True, -) -prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch( - tokenizer=self.tokenizer, - prompts=prompts, - scale_factors=scale_factors, - max_tokens_to_generate=self.max_tokens_to_generate, - max_position_embeddings=self.max_position_embeddings, - add_BOS=True, - add_beginning_of_answer_token=True, -) -``` - -To assign the vision embeddings to only the image tokens, instead of a string -you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`: - -```python -hf_config = self.info.get_hf_config() -bos_token_id = hf_config.bos_token_id # `<s>` -assert isinstance(bos_token_id, int) - -def get_replacement_fuyu(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, - ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows - - return PromptUpdateDetails.select_token_id( - image_tokens + [bos_token_id], - embed_token_id=_IMAGE_TOKEN_ID, - ) -``` - -Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt, -we can search for it to conduct the replacement at the start of the string: - -```python -def _get_prompt_updates( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - 
out_mm_kwargs: MultiModalKwargs, -) -> Sequence[PromptUpdate]: - hf_config = self.info.get_hf_config() - bos_token_id = hf_config.bos_token_id - assert isinstance(bos_token_id, int) - - tokenizer = self.info.get_tokenizer() - eot_token_id = tokenizer.bos_token_id - assert isinstance(eot_token_id, int) - - def get_replacement_fuyu(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ncols, nrows = self.info.get_image_feature_grid_size( - image_width=image_size.width, - image_height=image_size.height, - ) - image_tokens = ([_IMAGE_TOKEN_ID] * ncols + - [_NEWLINE_TOKEN_ID]) * nrows - - return PromptUpdateDetails.select_token_id( - image_tokens + [bos_token_id], - embed_token_id=_IMAGE_TOKEN_ID, - ) - - return [ - PromptReplacement( - modality="image", - target=[eot_token_id], - replacement=get_replacement_fuyu, - ) - ] -``` - -::: - -:::: - -## 5. Register processor-related classes - -After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2), -{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3), -and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4), -decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>` -to register them to the multi-modal registry: - -```diff - from vllm.model_executor.models.interfaces import SupportsMultiModal -+ from vllm.multimodal import MULTIMODAL_REGISTRY - -+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, -+ info=YourProcessingInfo, -+ dummy_inputs=YourDummyInputsBuilder) - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): -``` - -## Notes - -### Inserting feature tokens without replacement - -Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use {class}`~vllm.multimodal.processing.PromptInsertion` instead of {class}`~vllm.multimodal.processing.PromptReplacement` inside {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. - -Examples: - -- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py> -- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py> -- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py> - -### Handling prompt updates unrelated to multi-modal data - -{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only` so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](#mm-processing). - -Examples: - -- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py> -- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py> -- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py> - -### Custom HF processor - -Some models don't define a HF processor class on HF Hub. 
In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor`. - -Examples: - -- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py> -- InternVL: <gh-file:vllm/model_executor/models/internvl.py> -- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py> diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md deleted file mode 100644 index ca56710bc2ef..000000000000 --- a/docs/source/deployment/docker.md +++ /dev/null @@ -1,133 +0,0 @@ -(deployment-docker)= - -# Using Docker - -(deployment-docker-pre-built-image)= - -## Use vLLM's Official Docker Image - -vLLM offers an official Docker image for deployment. -The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). - -```console -$ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 -``` - -This image can also be used with other container engines such as [Podman](https://podman.io/). - -```console -$ podman run --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 -``` - -You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`). - -:::{note} -You can either use the `ipc=host` flag or `--shm-size` flag to allow the -container to access the host's shared memory. vLLM uses PyTorch, which uses shared -memory to share data between processes under the hood, particularly for tensor parallel inference. -::: - -:::{note} -Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>). - -If you need to use those dependencies (having accepted the license terms), -create a custom Dockerfile on top of the base image with an extra layer that installs them: - -```Dockerfile -FROM vllm/vllm-openai:v0.8.3 - -# e.g. install the `audio` optional dependencies -# NOTE: Make sure the version of vLLM matches the base image! -RUN uv pip install --system vllm[audio]==0.8.3 -``` - -::: - -:::{tip} -Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers). - -To use the development version of `transformers`, create a custom Dockerfile on top of the base image -with an extra layer that installs their code from source: - -```Dockerfile -FROM vllm/vllm-openai:latest - -RUN uv pip install --system git+https://github.com/huggingface/transformers.git -``` - -::: - -(deployment-docker-build-image-from-source)= - -## Building vLLM's Docker Image from Source - -You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM: - -```console -# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile -``` - -:::{note} -By default vLLM will build for all GPU types for widest distribution. If you are just building for the -current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` -for vLLM to find the current GPU type and build for that. 
- -If you are using Podman instead of Docker, you might need to disable SELinux labeling by -adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). -::: - -## Building for Arm64/aarch64 - -A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. - -:::{note} -Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` -flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. -Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). -::: - -```console -# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) -$ python3 use_existing_torch.py -$ DOCKER_BUILDKIT=1 docker build . \ - --file docker/Dockerfile \ - --target vllm-openai \ - --platform "linux/arm64" \ - -t vllm/vllm-gh200-openai:latest \ - --build-arg max_jobs=66 \ - --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" -``` - -## Use the custom-built vLLM Docker image - -To run vLLM with the custom-built Docker image: - -```console -$ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ - vllm/vllm-openai <args...> -``` - -The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). - -:::{note} -**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . -::: diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md deleted file mode 100644 index 7320d727fbaa..000000000000 --- a/docs/source/deployment/frameworks/helm.md +++ /dev/null @@ -1,250 +0,0 @@ -(deployment-helm)= - -# Helm - -A Helm chart to deploy vLLM for Kubernetes - -Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. - -This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. 
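To make the "different configurations per namespace" point concrete, here is a hedged sketch that reuses chart values documented in the Values section below (`replicaCount`, `image.tag`); the release and namespace names are placeholders, and the secret-related `--set` flags from the install command are omitted for brevity.

```console
# Illustrative only: release/namespace names are placeholders; the keys
# come from the chart values documented below.
helm upgrade --install --create-namespace --namespace=ns-vllm-dev vllm-dev . -f values.yaml --set replicaCount=1
helm upgrade --install --create-namespace --namespace=ns-vllm-prod vllm-prod . -f values.yaml --set replicaCount=2 --set image.tag=latest
```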
- -## Prerequisites - -Before you begin, ensure that you have the following: - -- A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) -- Available GPU resources in your cluster -- S3 with the model which will be deployed - -## Installing the chart - -To install the chart with the release name `test-vllm`: - -```console -helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY -``` - -## Uninstalling the Chart - -To uninstall the `test-vllm` deployment: - -```console -helm uninstall test-vllm --namespace=ns-vllm -``` - -The command removes all the Kubernetes components associated with the -chart **including persistent volumes** and deletes the release. - -## Architecture - -:::{image} /assets/deployment/architecture_helm_deployment.png -::: - -## Values - -:::{list-table} -:widths: 25 25 25 25 -:header-rows: 1 - -- * Key - * Type - * Default - * Description -- * autoscaling - * object - * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} - * Autoscaling configuration -- * autoscaling.enabled - * bool - * false - * Enable autoscaling -- * autoscaling.maxReplicas - * int - * 100 - * Maximum replicas -- * autoscaling.minReplicas - * int - * 1 - * Minimum replicas -- * autoscaling.targetCPUUtilizationPercentage - * int - * 80 - * Target CPU utilization for autoscaling -- * configs - * object - * {} - * Configmap -- * containerPort - * int - * 8000 - * Container port -- * customObjects - * list - * [] - * Custom Objects configuration -- * deploymentStrategy - * object - * {} - * Deployment strategy configuration -- * externalConfigs - * list - * [] - * External configuration -- * extraContainers - * list - * [] - * Additional containers configuration -- * extraInit - * object - * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} - * Additional configuration for the init container -- * extraInit.pvcStorage - * string - * "50Gi" - * Storage size of the s3 -- * extraInit.s3modelpath - * string - * "relative_s3_model_path/opt-125m" - * Path of the model on the s3 which hosts model weights and config files -- * extraInit.awsEc2MetadataDisabled - * boolean - * true - * Disables the use of the Amazon EC2 instance metadata service -- * extraPorts - * list - * [] - * Additional ports configuration -- * gpuModels - * list - * ["TYPE_GPU_USED"] - * Type of gpu used -- * image - * object - * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} - * Image configuration -- * image.command - * list - * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] - * Container launch command -- * image.repository - * string - * "vllm/vllm-openai" - * Image repository -- * image.tag - * string - * "latest" - * Image tag -- * livenessProbe - * object - * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} - * Liveness probe configuration -- * livenessProbe.failureThreshold - * int - * 3 - * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is 
not alive -- * livenessProbe.httpGet - * object - * {"path":"/health","port":8000} - * Configuration of the Kubelet http request on the server -- * livenessProbe.httpGet.path - * string - * "/health" - * Path to access on the HTTP server -- * livenessProbe.httpGet.port - * int - * 8000 - * Name or number of the port to access on the container, on which the server is listening -- * livenessProbe.initialDelaySeconds - * int - * 15 - * Number of seconds after the container has started before liveness probe is initiated -- * livenessProbe.periodSeconds - * int - * 10 - * How often (in seconds) to perform the liveness probe -- * maxUnavailablePodDisruptionBudget - * string - * "" - * Disruption Budget Configuration -- * readinessProbe - * object - * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} - * Readiness probe configuration -- * readinessProbe.failureThreshold - * int - * 3 - * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready -- * readinessProbe.httpGet - * object - * {"path":"/health","port":8000} - * Configuration of the Kubelet http request on the server -- * readinessProbe.httpGet.path - * string - * "/health" - * Path to access on the HTTP server -- * readinessProbe.httpGet.port - * int - * 8000 - * Name or number of the port to access on the container, on which the server is listening -- * readinessProbe.initialDelaySeconds - * int - * 5 - * Number of seconds after the container has started before readiness probe is initiated -- * readinessProbe.periodSeconds - * int - * 5 - * How often (in seconds) to perform the readiness probe -- * replicaCount - * int - * 1 - * Number of replicas -- * resources - * object - * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} - * Resource configuration -- * resources.limits."nvidia.com/gpu" - * int - * 1 - * Number of gpus used -- * resources.limits.cpu - * int - * 4 - * Number of CPUs -- * resources.limits.memory - * string - * "16Gi" - * CPU memory configuration -- * resources.requests."nvidia.com/gpu" - * int - * 1 - * Number of gpus used -- * resources.requests.cpu - * int - * 4 - * Number of CPUs -- * resources.requests.memory - * string - * "16Gi" - * CPU memory configuration -- * secrets - * object - * {} - * Secrets configuration -- * serviceName - * string - * - * Service name -- * servicePort - * int - * 80 - * Service port -- * labels.environment - * string - * test - * Environment name -- * labels.release - * string - * test - * Release name -::: diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md deleted file mode 100644 index 3408c6c10ede..000000000000 --- a/docs/source/deployment/frameworks/index.md +++ /dev/null @@ -1,22 +0,0 @@ -# Using other frameworks - -:::{toctree} -:maxdepth: 1 - -anything-llm -bentoml -cerebrium -chatbox -dify -dstack -helm -litellm -lobe-chat -lws -modal -open-webui -retrieval_augmented_generation -skypilot -streamlit -triton -::: diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md deleted file mode 100644 index 410742b88c73..000000000000 --- a/docs/source/deployment/integrations/index.md +++ /dev/null @@ -1,11 +0,0 @@ -# External Integrations - -:::{toctree} -:maxdepth: 1 - -kserve -kubeai -llamastack -llmaz -production-stack -::: diff --git a/docs/source/features/compatibility_matrix.md 
b/docs/source/features/compatibility_matrix.md deleted file mode 100644 index 8865d26deaed..000000000000 --- a/docs/source/features/compatibility_matrix.md +++ /dev/null @@ -1,476 +0,0 @@ -(compatibility-matrix)= - -# Compatibility Matrix - -The tables below show mutually exclusive features and the support on some hardware. - -The symbols used have the following meanings: - -- ✅ = Full compatibility -- 🟠 = Partial compatibility -- ❌ = No compatibility - -:::{note} -Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination. -::: - -## Feature x Feature - -:::{raw} html -<style> - /* Make smaller to try to improve readability */ - td { - font-size: 0.8rem; - text-align: center; - } - - th { - text-align: center; - font-size: 0.8rem; - } -</style> -::: - -:::{list-table} -:header-rows: 1 -:stub-columns: 1 -:widths: auto -:class: vertical-table-header - -- * Feature - * [CP](#chunked-prefill) - * [APC](#automatic-prefix-caching) - * [LoRA](#lora-adapter) - * <abbr title="Prompt Adapter">prmpt adptr</abbr> - * [SD](#spec-decode) - * CUDA graph - * <abbr title="Pooling Models">pooling</abbr> - * <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * <abbr title="Logprobs">logP</abbr> - * <abbr title="Prompt Logprobs">prmpt logP</abbr> - * <abbr title="Async Output Processing">async output</abbr> - * multi-step - * <abbr title="Multimodal Inputs">mm</abbr> - * best-of - * beam-search - * <abbr title="Guided Decoding">guided dec</abbr> -- * [CP](#chunked-prefill) - * ✅ - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * -- * [APC](#automatic-prefix-caching) - * ✅ - * ✅ - * - * - * - * - * - * - * - * - * - * - * - * - * - * -- * [LoRA](#lora-adapter) - * ✅ - * ✅ - * ✅ - * - * - * - * - * - * - * - * - * - * - * - * - * -- * <abbr title="Prompt Adapter">prmpt adptr</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * - * - * - * - * - * - * - * - * - * - * - * -- * [SD](#spec-decode) - * ✅ - * ✅ - * ❌ - * ✅ - * ✅ - * - * - * - * - * - * - * - * - * - * - * -- * CUDA graph - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * - * - * - * - * - * - * - * - * - * -- * <abbr title="Pooling Models">pooling</abbr> - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ - * ✅ - * - * - * - * - * - * - * - * - * -- * <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * ❌ - * [❌](gh-issue:7366) - * ❌ - * ❌ - * [❌](gh-issue:7366) - * ✅ - * ✅ - * ✅ - * - * - * - * - * - * - * - * -- * <abbr title="Logprobs">logP</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ❌ - * ✅ - * ✅ - * - * - * - * - * - * - * -- * <abbr title="Prompt Logprobs">prmpt logP</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ❌ - * ✅ - * ✅ - * ✅ - * - * - * - * - * - * -- * <abbr title="Async Output Processing">async output</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ❌ - * ✅ - * ❌ - * ❌ - * ✅ - * ✅ - * ✅ - * - * - * - * - * -- * multi-step - * ❌ - * ✅ - * ❌ - * ✅ - * ❌ - * ✅ - * ❌ - * ❌ - * ✅ - * ✅ - * ✅ - * ✅ - * - * - * - * -- * <abbr title="Multimodal Inputs">mm</abbr> - * ✅ - * [🟠](gh-pr:8348) - * [🟠](gh-pr:4194) - * ❔ - * ❔ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ❔ - * ✅ - * - * - * -- * best-of - * ✅ - * ✅ - * ✅ - * ✅ - * [❌](gh-issue:6137) - * ✅ - * ❌ - * ✅ - * ✅ - * ✅ - * ❔ - * [❌](gh-issue:7968) - * ✅ - * ✅ - * - * -- * beam-search - * ✅ - * ✅ - * ✅ - * ✅ - * [❌](gh-issue:6137) - * ✅ - * ❌ - * ✅ - * ✅ - * ✅ - * ❔ - * [❌](gh-issue:7968) - * ❔ - * ✅ - * ✅ - * -- * <abbr title="Guided Decoding">guided dec</abbr> - * ✅ - * ✅ - * ❔ - * ❔ - * [❌](gh-issue:11484) - * ✅ - * ❌ - * ❔ - * ✅ - * ✅ - * ✅ - * [❌](gh-issue:9893) - * ❔ - * ✅ - * ✅ - * ✅ -::: - 
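As a concrete illustration of combining features that the table above marks as compatible, the sketch below enables chunked prefill together with automatic prefix caching on a single offline engine. The flag names correspond to the engine arguments (`enable_chunked_prefill`, `enable_prefix_caching`); the model name is only a placeholder.

```python
from vllm import LLM, SamplingParams

# Sketch: chunked prefill (CP) and automatic prefix caching (APC) are marked
# compatible in the table above, so both can be enabled on the same engine.
# The model name is a placeholder; use any supported text model.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    enable_chunked_prefill=True,
    enable_prefix_caching=True,
)

outputs = llm.generate(["The capital of France is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```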
-(feature-x-hardware)= - -## Feature x Hardware - -:::{list-table} -:header-rows: 1 -:stub-columns: 1 -:widths: auto - -- * Feature - * Volta - * Turing - * Ampere - * Ada - * Hopper - * CPU - * AMD -- * [CP](#chunked-prefill) - * [❌](gh-issue:2729) - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * [APC](#automatic-prefix-caching) - * [❌](gh-issue:3687) - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * [LoRA](#lora-adapter) - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * <abbr title="Prompt Adapter">prmpt adptr</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * [❌](gh-issue:8475) - * ✅ -- * [SD](#spec-decode) - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * CUDA graph - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ❌ - * ✅ -- * <abbr title="Pooling Models">pooling</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ❔ -- * <abbr title="Encoder-Decoder Models">enc-dec</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ❌ -- * <abbr title="Multimodal Inputs">mm</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * <abbr title="Logprobs">logP</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * <abbr title="Prompt Logprobs">prmpt logP</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * <abbr title="Async Output Processing">async output</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ❌ - * ❌ -- * multi-step - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * [❌](gh-issue:8477) - * ✅ -- * best-of - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * beam-search - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -- * <abbr title="Guided Decoding">guided dec</abbr> - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ - * ✅ -::: diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md deleted file mode 100644 index 7ad46b7094ee..000000000000 --- a/docs/source/features/quantization/index.md +++ /dev/null @@ -1,24 +0,0 @@ -(quantization-index)= - -# Quantization - -Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
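For example, running an already-quantized checkpoint usually only requires pointing vLLM at it. The sketch below assumes an AWQ checkpoint (the model name is only illustrative) and passes the quantization method explicitly; in many cases vLLM can also infer the method from the checkpoint's own quantization config, so the argument is optional.

```python
from vllm import LLM, SamplingParams

# Minimal sketch: run a pre-quantized (AWQ) checkpoint offline.
# The model name is illustrative; substitute any quantized checkpoint you use.
llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ", quantization="awq")

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```

The pages below cover each supported quantization method and the hardware it runs on.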
- -:::{toctree} -:caption: Contents -:maxdepth: 1 - -supported_hardware -auto_awq -bnb -bitblas -gguf -gptqmodel -int4 -int8 -fp8 -modelopt -quark -quantized_kvcache -torchao -::: diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md deleted file mode 100644 index f8af1ba60b12..000000000000 --- a/docs/source/features/quantization/supported_hardware.md +++ /dev/null @@ -1,153 +0,0 @@ -(quantization-supported-hardware)= - -# Supported Hardware - -The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - -:::{list-table} -:header-rows: 1 -:widths: 20 8 8 8 8 8 8 8 8 8 8 - -- * Implementation - * Volta - * Turing - * Ampere - * Ada - * Hopper - * AMD GPU - * Intel GPU - * x86 CPU - * AWS Inferentia - * Google TPU -- * AWQ - * ❌ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ✅︎ - * ✅︎ - * ❌ - * ❌ -- * GPTQ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ✅︎ - * ✅︎ - * ❌ - * ❌ -- * Marlin (GPTQ/AWQ/FP8) - * ❌ - * ❌ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ -- * INT8 (W8A8) - * ❌ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ✅︎ - * ❌ - * ✅︎ -- * FP8 (W8A8) - * ❌ - * ❌ - * ❌ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ❌ - * ❌ -- * BitBLAS (GPTQ) - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ -- * AQLM - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ -- * bitsandbytes - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ -- * DeepSpeedFP - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ -- * GGUF - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ❌ - * ❌ - * ❌ - * ❌ -- * modelopt - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎ - * ✅︎︎ - * ❌ - * ❌ - * ❌ - * ❌ - * ❌ -::: - -- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- ✅︎ indicates that the quantization method is supported on the specified hardware. -- ❌ indicates that the quantization method is not supported on the specified hardware. - -:::{note} -This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. - -For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team. -::: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py deleted file mode 100644 index f77dbefb0a01..000000000000 --- a/docs/source/generate_examples.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import itertools -import re -from dataclasses import dataclass, field -from pathlib import Path - -ROOT_DIR = Path(__file__).parent.parent.parent.resolve() -ROOT_DIR_RELATIVE = '../../../..' -EXAMPLE_DIR = ROOT_DIR / "examples" -EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples" - - -def fix_case(text: str) -> str: - subs = { - "api": "API", - "cli": "CLI", - "cpu": "CPU", - "llm": "LLM", - "mae": "MAE", - "tpu": "TPU", - "aqlm": "AQLM", - "gguf": "GGUF", - "lora": "LoRA", - "rlhf": "RLHF", - "vllm": "vLLM", - "openai": "OpenAI", - "lmcache": "LMCache", - "multilora": "MultiLoRA", - "mlpspeculator": "MLPSpeculator", - r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32 - r"int\d+": lambda x: x.group(0).upper(), # e.g. 
int8, int16 - } - for pattern, repl in subs.items(): - text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE) - return text - - -@dataclass -class Index: - """ - Index class to generate a structured document index. - - Attributes: - path (Path): The path save the index file to. - title (str): The title of the index. - description (str): A brief description of the index. - caption (str): An optional caption for the table of contents. - maxdepth (int): The maximum depth of the table of contents. Defaults to 1. - documents (list[str]): A list of document paths to include in the index. Defaults to an empty list. - - Methods: - generate() -> str: - Generates the index content as a string in the specified format. - """ # noqa: E501 - path: Path - title: str - description: str - caption: str - maxdepth: int = 1 - documents: list[str] = field(default_factory=list) - - def generate(self) -> str: - content = f"# {self.title}\n\n{self.description}\n\n" - content += ":::{toctree}\n" - content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" - content += "\n".join(self.documents) + "\n:::\n" - return content - - -@dataclass -class Example: - """ - Example class for generating documentation content from a given path. - - Attributes: - path (Path): The path to the main directory or file. - category (str): The category of the document. - main_file (Path): The main file in the directory. - other_files (list[Path]): list of other files in the directory. - title (str): The title of the document. - - Methods: - __post_init__(): Initializes the main_file, other_files, and title attributes. - determine_main_file() -> Path: Determines the main file in the given path. - determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. - determine_title() -> str: Determines the title of the document. - generate() -> str: Generates the documentation content. - """ # noqa: E501 - path: Path - category: str = None - main_file: Path = field(init=False) - other_files: list[Path] = field(init=False) - title: str = field(init=False) - - def __post_init__(self): - self.main_file = self.determine_main_file() - self.other_files = self.determine_other_files() - self.title = self.determine_title() - - def determine_main_file(self) -> Path: - """ - Determines the main file in the given path. - If the path is a file, it returns the path itself. Otherwise, it searches - for Markdown files (*.md) in the directory and returns the first one found. - Returns: - Path: The main file path, either the original path if it's a file or the first - Markdown file found in the directory. - Raises: - IndexError: If no Markdown files are found in the directory. - """ # noqa: E501 - return self.path if self.path.is_file() else list( - self.path.glob("*.md")).pop() - - def determine_other_files(self) -> list[Path]: - """ - Determine other files in the directory excluding the main file. - - This method checks if the given path is a file. If it is, it returns an empty list. - Otherwise, it recursively searches through the directory and returns a list of all - files that are not the main file. - - Returns: - list[Path]: A list of Path objects representing the other files in the directory. 
- """ # noqa: E501 - if self.path.is_file(): - return [] - is_other_file = lambda file: file.is_file() and file != self.main_file - return [file for file in self.path.rglob("*") if is_other_file(file)] - - def determine_title(self) -> str: - return fix_case(self.path.stem.replace("_", " ").title()) - - def generate(self) -> str: - # Convert the path to a relative path from __file__ - make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to( - ROOT_DIR) - - content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n" - include = "include" if self.main_file.suffix == ".md" else \ - "literalinclude" - if include == "literalinclude": - content += f"# {self.title}\n\n" - content += f":::{{{include}}} {make_relative(self.main_file)}\n" - if include == "literalinclude": - content += f":language: {self.main_file.suffix[1:]}\n" - content += ":::\n\n" - - if not self.other_files: - return content - - content += "## Example materials\n\n" - for file in sorted(self.other_files): - include = "include" if file.suffix == ".md" else "literalinclude" - content += f":::{{admonition}} {file.relative_to(self.path)}\n" - content += ":class: dropdown\n\n" - content += f":::{{{include}}} {make_relative(file)}\n:::\n" - content += ":::\n\n" - - return content - - -def generate_examples(): - # Create the EXAMPLE_DOC_DIR if it doesn't exist - if not EXAMPLE_DOC_DIR.exists(): - EXAMPLE_DOC_DIR.mkdir(parents=True) - - # Create empty indices - examples_index = Index( - path=EXAMPLE_DOC_DIR / "examples_index.md", - title="Examples", - description= - "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.", # noqa: E501 - caption="Examples", - maxdepth=2) - # Category indices stored in reverse order because they are inserted into - # examples_index.documents at index 0 in order - category_indices = { - "other": - Index( - path=EXAMPLE_DOC_DIR / "examples_other_index.md", - title="Other", - description= - "Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501 - caption="Examples", - ), - "online_serving": - Index( - path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md", - title="Online Serving", - description= - "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501 - caption="Examples", - ), - "offline_inference": - Index( - path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md", - title="Offline Inference", - description= - "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. 
We recommend starting with <project:basic.md>.", # noqa: E501 - caption="Examples", - ), - } - - examples = [] - glob_patterns = ["*.py", "*.md", "*.sh"] - # Find categorised examples - for category in category_indices: - category_dir = EXAMPLE_DIR / category - globs = [category_dir.glob(pattern) for pattern in glob_patterns] - for path in itertools.chain(*globs): - examples.append(Example(path, category)) - # Find examples in subdirectories - for path in category_dir.glob("*/*.md"): - examples.append(Example(path.parent, category)) - # Find uncategorised examples - globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns] - for path in itertools.chain(*globs): - examples.append(Example(path)) - # Find examples in subdirectories - for path in EXAMPLE_DIR.glob("*/*.md"): - # Skip categorised examples - if path.parent.name in category_indices: - continue - examples.append(Example(path.parent)) - - # Generate the example documentation - for example in sorted(examples, key=lambda e: e.path.stem): - doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md" - with open(doc_path, "w+") as f: - f.write(example.generate()) - # Add the example to the appropriate index - index = category_indices.get(example.category, examples_index) - index.documents.append(example.path.stem) - - # Generate the index files - for category_index in category_indices.values(): - if category_index.documents: - examples_index.documents.insert(0, category_index.path.name) - with open(category_index.path, "w+") as f: - f.write(category_index.generate()) - - with open(examples_index.path, "w+") as f: - f.write(examples_index.generate()) diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md deleted file mode 100644 index 44134bf01b76..000000000000 --- a/docs/source/getting_started/installation.md +++ /dev/null @@ -1,28 +0,0 @@ -(installation-index)= - -# Installation - -vLLM supports the following hardware platforms: - -:::{toctree} -:maxdepth: 1 -:hidden: - -installation/gpu -installation/cpu -installation/ai_accelerator -::: - -- <project:installation/gpu.md> - - NVIDIA CUDA - - AMD ROCm - - Intel XPU -- <project:installation/cpu.md> - - Intel/AMD x86 - - ARM AArch64 - - Apple silicon - - IBM Z (S390X) -- <project:installation/ai_accelerator.md> - - Google TPU - - Intel Gaudi - - AWS Neuron diff --git a/docs/source/getting_started/installation/ai_accelerator.md b/docs/source/getting_started/installation/ai_accelerator.md deleted file mode 100644 index 0a207af1a4c7..000000000000 --- a/docs/source/getting_started/installation/ai_accelerator.md +++ /dev/null @@ -1,299 +0,0 @@ -# Other AI accelerators - -vLLM is a Python library that supports the following AI accelerators. 
Select your AI accelerator type to see vendor specific instructions: - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:selected: -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::: - -## Requirements - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "## Requirements" -:end-before: "## Configure a new environment" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "## Requirements" -:end-before: "## Configure a new environment" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "## Requirements" -:end-before: "## Configure a new environment" -::: - -:::: - -::::: - -## Configure a new environment - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "## Configure a new environment" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "## Configure a new environment" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "## Configure a new environment" -:end-before: "## Set up using Python" -::: - -:::: - -::::: - -## Set up using Python - -### Pre-built wheels - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::: - -### Build wheel from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::: - -## Set up using Docker - -### Pre-built images - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md 
-:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::: - -### Build image from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "### Build image from source" -:end-before: "## Extra information" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "### Build image from source" -:end-before: "## Extra information" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "### Build image from source" -:end-before: "## Extra information" -::: - -:::: - -::::: - -## Extra information - -:::::{tab-set} -:sync-group: device - -::::{tab-item} Google TPU -:sync: tpu - -:::{include} ai_accelerator/tpu.inc.md -:start-after: "## Extra information" -::: - -:::: - -::::{tab-item} Intel Gaudi -:sync: hpu-gaudi - -:::{include} ai_accelerator/hpu-gaudi.inc.md -:start-after: "## Extra information" -::: - -:::: - -::::{tab-item} AWS Neuron -:sync: neuron - -:::{include} ai_accelerator/neuron.inc.md -:start-after: "## Extra information" -::: - -:::: - -::::: diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md deleted file mode 100644 index e7d8d60630dc..000000000000 --- a/docs/source/getting_started/installation/cpu/arm.inc.md +++ /dev/null @@ -1,34 +0,0 @@ -# Installation - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. - -ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. - -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: - -## Requirements - -- OS: Linux -- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) -- Instruction Set Architecture (ISA): NEON support is required - -## Set up using Python - -### Pre-built wheels - -### Build wheel from source - -:::{include} cpu/build.inc.md -::: - -Testing has been conducted on AWS Graviton3 instances for compatibility. - -## Set up using Docker - -### Pre-built images - -### Build image from source - -## Extra information diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md deleted file mode 100644 index 9ae2035db543..000000000000 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ /dev/null @@ -1,41 +0,0 @@ -# Installation - -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. - -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: - -## Requirements - -- OS: Linux -- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended) -- Instruction Set Architecture (ISA): AVX512 (optional, recommended) - -:::{tip} -[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. 
-::: - -## Set up using Python - -### Pre-built wheels - -### Build wheel from source - -:::{include} cpu/build.inc.md -::: - -:::{note} -- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. -- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. -::: - -## Set up using Docker - -### Pre-built images - -See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo) - -### Build image from source - -## Extra information diff --git a/docs/source/getting_started/installation/gpu.md b/docs/source/getting_started/installation/gpu.md deleted file mode 100644 index 22db992354fb..000000000000 --- a/docs/source/getting_started/installation/gpu.md +++ /dev/null @@ -1,301 +0,0 @@ -# GPU - -vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions: - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:selected: -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - -::::: - -## Requirements - -- OS: Linux -- Python: 3.9 -- 3.12 - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - -::::: - -## Set up using Python - -### Create a new Python environment - -:::{include} python_env_setup.inc.md -::: - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "## Create a new Python environment" -:end-before: "### Pre-built wheels" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -There is no extra information on creating a new Python environment for this device. - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -There is no extra information on creating a new Python environment for this device. 
- -:::: - -::::: - -### Pre-built wheels - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - -::::: - -(build-from-source)= - -### Build wheel from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - -::::: - -## Set up using Docker - -### Pre-built images - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - -::::: - -### Build image from source - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "### Build image from source" -:end-before: "## Supported features" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "### Build image from source" -:end-before: "## Supported features" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "### Build image from source" -:end-before: "## Supported features" -::: - -:::: - -::::: - -## Supported features - -:::::{tab-set} -:sync-group: device - -::::{tab-item} NVIDIA CUDA -:sync: cuda - -:::{include} gpu/cuda.inc.md -:start-after: "## Supported features" -::: - -:::: - -::::{tab-item} AMD ROCm -:sync: rocm - -:::{include} gpu/rocm.inc.md -:start-after: "## Supported features" -::: - -:::: - -::::{tab-item} Intel XPU -:sync: xpu - -:::{include} gpu/xpu.inc.md -:start-after: "## Supported features" -::: - -:::: - -::::: diff --git a/docs/source/index.md b/docs/source/index.md deleted file mode 100644 index db2192e87dcf..000000000000 --- a/docs/source/index.md +++ /dev/null @@ -1,217 +0,0 @@ -# Welcome to vLLM - -:::{figure} ./assets/logos/vllm-logo-text-light.png -:align: center -:alt: vLLM -:class: no-scaled-link -:width: 60% -::: - -:::{raw} html -<p style="text-align:center"> -<strong>Easy, fast, and cheap LLM serving for everyone -</strong> -</p> - -<p style="text-align:center"> -<script async defer src="https://buttons.github.io/buttons.js"></script> -<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a> -<a class="github-button" 
href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a> -<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a> -</p> -::: - -vLLM is a fast and easy-to-use library for LLM inference and serving. - -Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. - -vLLM is fast with: - -- State-of-the-art serving throughput -- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) -- Continuous batching of incoming requests -- Fast model execution with CUDA/HIP graph -- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 -- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. -- Speculative decoding -- Chunked prefill - -vLLM is flexible and easy to use with: - -- Seamless integration with popular HuggingFace models -- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more -- Tensor parallelism and pipeline parallelism support for distributed inference -- Streaming outputs -- OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators. -- Prefix caching support -- Multi-lora support - -For more information, check out the following: - -- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) -- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) -- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. -- [vLLM Meetups](#meetups) - -## Documentation - -% How to start using vLLM? - -:::{toctree} -:caption: Getting Started -:maxdepth: 1 - -getting_started/installation -getting_started/quickstart -getting_started/examples/examples_index -getting_started/troubleshooting -getting_started/faq -getting_started/v1_user_guide - -::: - -% What does vLLM support? 
- -:::{toctree} -:caption: Models -:maxdepth: 1 - -models/supported_models -models/generative_models -models/pooling_models -models/extensions/index -::: - -% Additional capabilities - -:::{toctree} -:caption: Features -:maxdepth: 1 - -features/quantization/index -features/multimodal_inputs -features/prompt_embeds -features/lora -features/tool_calling -features/reasoning_outputs -features/structured_outputs -features/automatic_prefix_caching -features/disagg_prefill -features/spec_decode -features/compatibility_matrix -::: - -% Details about running vLLM - -:::{toctree} -:caption: Training -:maxdepth: 1 - -training/trl.md -training/rlhf.md - -::: - -:::{toctree} -:caption: Inference and Serving -:maxdepth: 1 - -serving/offline_inference -serving/openai_compatible_server -serving/serve_args -serving/distributed_serving -serving/metrics -serving/engine_args -serving/env_vars -serving/usage_stats -serving/integrations/index -::: - -% Scaling up vLLM for production - -:::{toctree} -:caption: Deployment -:maxdepth: 1 - -deployment/security -deployment/docker -deployment/k8s -deployment/nginx -deployment/frameworks/index -deployment/integrations/index -::: - -% Making the most out of vLLM - -:::{toctree} -:caption: Performance -:maxdepth: 1 - -performance/optimization -performance/benchmarks -::: - -% Explanation of vLLM internals - -:::{toctree} -:caption: Design Documents -:maxdepth: 2 - -design/arch_overview -design/huggingface_integration -design/plugin_system -design/kernel/paged_attention -design/mm_processing -design/automatic_prefix_caching -design/multiprocessing -::: - -:::{toctree} -:caption: V1 Design Documents -:maxdepth: 2 - -design/v1/torch_compile -design/v1/prefix_caching -design/v1/metrics -::: - -% How to contribute to the vLLM project - -:::{toctree} -:caption: Developer Guide -:maxdepth: 2 - -contributing/overview -contributing/deprecation_policy -contributing/profiling/profiling_index -contributing/dockerfile/dockerfile -contributing/model/index -contributing/vulnerability_management -::: - -% Technical API specifications - -:::{toctree} -:caption: API Reference -:maxdepth: 2 - -api/summary -api/vllm/vllm -::: - -% Latest news and acknowledgements - -:::{toctree} -:caption: Community -:maxdepth: 1 - -community/blog -community/meetups -community/sponsors -::: - -## Indices and tables - -- {ref}`genindex` -- {ref}`modindex` diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md deleted file mode 100644 index cdcdaa5b3501..000000000000 --- a/docs/source/models/extensions/index.md +++ /dev/null @@ -1,9 +0,0 @@ -# Built-in Extensions - -:::{toctree} -:maxdepth: 1 - -runai_model_streamer -tensorizer -fastsafetensor -::: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md deleted file mode 100644 index 6022dfb9c2c6..000000000000 --- a/docs/source/models/supported_models.md +++ /dev/null @@ -1,1406 +0,0 @@ -(supported-models)= - -# Supported Models - -vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks. -If a model supports more than one task, you can set the task via the `--task` argument. - -For each task, we list the model architectures that have been implemented in vLLM. -Alongside each architecture, we include some popular models that use it. - -## Model Implementation - -### vLLM - -If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>. 
- -These models are what we list in <project:#supported-text-models> and <project:#supported-mm-models>. - -(transformers-backend)= - -### Transformers - -vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! - -To check if the modeling backend is Transformers, you can simply do this: - -```python -from vllm import LLM -llm = LLM(model=..., task="generate") # Name or path of your model -llm.apply_model(lambda model: print(type(model))) -``` - -If it is `TransformersForCausalLM` then it means it's based on Transformers! - -:::{tip} -You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for <project:#offline-inference> or `--model-impl transformers` for the <project:#openai-compatible-server>. -::: - -:::{note} -vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. -::: - -#### Custom models - -If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! - -For a model to be compatible with the Transformers backend for vLLM it must: - -- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): - * The model directory must have the correct structure (e.g. `config.json` is present). - * `config.json` must contain `auto_map.AutoModel`. -- be a Transformers backend for vLLM compatible model (see <project:#writing-custom-models>): - * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). - -If the compatible model is: - -- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for <project:#offline-inference> or `--trust-remote-code` for the <project:#openai-compatible-server>. -- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for <project:#offline-inference> or `vllm serve <MODEL_DIR>` for the <project:#openai-compatible-server>. - -This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! - -(writing-custom-models)= - -#### Writing custom models - -This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). - -To make your model compatible with the Transformers backend, it needs: - -1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. -2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. -3. `MyModel` must contain `_supports_attention_backend = True`. - -```{code-block} python -:caption: modeling_my_model.py - -from transformers import PreTrainedModel -from torch import nn - -class MyAttention(nn.Module): - - def forward(self, hidden_states, **kwargs): - ... - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - **kwargs, - ) - ... 
- -class MyModel(PreTrainedModel): - _supports_attention_backend = True -``` - -Here is what happens in the background when this model is loaded: - -1. The config is loaded. -2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. - -That's it! - -For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: - -```{code-block} python -:caption: configuration_my_model.py - -from transformers import PretrainedConfig - -class MyConfig(PretrainedConfig): - base_model_tp_plan = { - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } -``` - -- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). -- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: - * You only need to do this for layers which are not present on all pipeline stages - * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages - * The `list` in the first element of the `tuple` contains the names of the input arguments - * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code - -## Loading a Model - -### Hugging Face Hub - -By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). - -To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. -If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. - -Models do not _need_ to be natively supported to be used in vLLM. -The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!). - -:::{tip} -The easiest way to check if your model is really supported at runtime is to run the program below: - -```python -from vllm import LLM - -# For generative models (task=generate) only -llm = LLM(model=..., task="generate") # Name or path of your model -output = llm.generate("Hello, my name is") -print(output) - -# For pooling models (task={embed,classify,reward,score}) only -llm = LLM(model=..., task="embed") # Name or path of your model -output = llm.encode("Hello, my name is") -print(output) -``` - -If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. 
-::: - -Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. -Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. - -#### Download a model - -If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository: - -```console -# Download a model -huggingface-cli download HuggingFaceH4/zephyr-7b-beta - -# Specify a custom cache directory -huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache - -# Download a specific file from a model repo -huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json -``` - -#### List the downloaded models - -Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache: - -```console -# List cached models -huggingface-cli scan-cache - -# Show detailed (verbose) output -huggingface-cli scan-cache -v - -# Specify a custom cache directory -huggingface-cli scan-cache --dir ~/.cache/huggingface/hub -``` - -#### Delete a cached model - -Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache: - -```console -# The `delete-cache` command requires extra dependencies to work with the TUI. -# Please run `pip install huggingface_hub[cli]` to install them. - -# Launch the interactive TUI to select models to delete -$ huggingface-cli delete-cache -? Select revisions to delete: 1 revisions selected counting for 438.9M. - ○ None of the following (if selected, nothing will be deleted). -Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago) -❯ ◉ a5beb1e3: main # modified 1 week ago - -Model BAAI/bge-large-en-v1.5 (1.3G, used 1 week ago) - ○ d4aa6901: main # modified 1 week ago - -Model BAAI/bge-reranker-base (1.1G, used 4 weeks ago) - ○ 2cfc18c9: main # modified 4 weeks ago - -Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification. - -# Need to confirm after selected -? Select revisions to delete: 1 revision(s) selected. -? 1 revisions selected counting for 438.9M. Confirm deletion ? Yes -Start deletion. -Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M. -``` - -#### Using a proxy - -Here are some tips for loading/downloading models from Hugging Face using a proxy: - -- Set the proxy globally for your session (or set it in the profile file): - -```shell -export http_proxy=http://your.proxy.server:port -export https_proxy=http://your.proxy.server:port -``` - -- Set the proxy for just the current command: - -```shell -https_proxy=http://your.proxy.server:port huggingface-cli download <model_name> - -# or use vllm cmd directly -https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests -``` - -- Set the proxy in Python interpreter: - -```python -import os - -os.environ['http_proxy'] = 'http://your.proxy.server:port' -os.environ['https_proxy'] = 'http://your.proxy.server:port' -``` - -### ModelScope - -To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable: - -```shell -export VLLM_USE_MODELSCOPE=True -``` - -And use with `trust_remote_code=True`. 
- -```python -from vllm import LLM - -llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) - -# For generative models (task=generate) only -output = llm.generate("Hello, my name is") -print(output) - -# For pooling models (task={embed,classify,reward,score}) only -output = llm.encode("Hello, my name is") -print(output) -``` - -(feature-status-legend)= - -## Feature Status Legend - -- ✅︎ indicates that the feature is supported for the model. - -- 🚧 indicates that the feature is planned but not yet supported for the model. - -- ⚠️ indicates that the feature is available but may have known issues or limitations. - -(supported-text-models)= - -## List of Text-only Language Models - -### Generative Models - -See [this page](#generative-models) for more information on how to use generative models. - -#### Text Generation - -Specified using `--task generate`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `AquilaForCausalLM` - * Aquila, Aquila2 - * `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. - * ✅︎ - * ✅︎ -- * `ArcticForCausalLM` - * Arctic - * `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. - * - * ✅︎ -- * `BaiChuanForCausalLM` - * Baichuan2, Baichuan - * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. - * ✅︎ - * ✅︎ -- * `BambaForCausalLM` - * Bamba - * `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` - * - * -- * `BloomForCausalLM` - * BLOOM, BLOOMZ, BLOOMChat - * `bigscience/bloom`, `bigscience/bloomz`, etc. - * - * ✅︎ -- * `BartForConditionalGeneration` - * BART - * `facebook/bart-base`, `facebook/bart-large-cnn`, etc. - * - * -- * `ChatGLMModel`, `ChatGLMForConditionalGeneration` - * ChatGLM - * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. - * ✅︎ - * ✅︎ -- * `CohereForCausalLM`, `Cohere2ForCausalLM` - * Command-R - * `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. - * ✅︎ - * ✅︎ -- * `DbrxForCausalLM` - * DBRX - * `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. - * - * ✅︎ -- * `DeciLMForCausalLM` - * DeciLM - * `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. - * - * ✅︎ -- * `DeepseekForCausalLM` - * DeepSeek - * `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. - * - * ✅︎ -- * `DeepseekV2ForCausalLM` - * DeepSeek-V2 - * `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. - * - * ✅︎ -- * `DeepseekV3ForCausalLM` - * DeepSeek-V3 - * `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. - * - * ✅︎ -- * `ExaoneForCausalLM` - * EXAONE-3 - * `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - * ✅︎ - * ✅︎ -- * `FalconForCausalLM` - * Falcon - * `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. - * - * ✅︎ -- * `FalconMambaForCausalLM` - * FalconMamba - * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. - * ✅︎ - * ✅︎ -- * `FalconH1ForCausalLM` - * Falcon-H1 - * `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. - * ✅︎ - * ✅︎ -- * `GemmaForCausalLM` - * Gemma - * `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. - * ✅︎ - * ✅︎ -- * `Gemma2ForCausalLM` - * Gemma 2 - * `google/gemma-2-9b`, `google/gemma-2-27b`, etc. - * ✅︎ - * ✅︎ -- * `Gemma3ForCausalLM` - * Gemma 3 - * `google/gemma-3-1b-it`, etc. - * ✅︎ - * ✅︎ -- * `GlmForCausalLM` - * GLM-4 - * `THUDM/glm-4-9b-chat-hf`, etc. 
- * ✅︎ - * ✅︎ -- * `Glm4ForCausalLM` - * GLM-4-0414 - * `THUDM/GLM-4-32B-0414`, etc. - * ✅︎ - * ✅︎ -- * `GPT2LMHeadModel` - * GPT-2 - * `gpt2`, `gpt2-xl`, etc. - * - * ✅︎ -- * `GPTBigCodeForCausalLM` - * StarCoder, SantaCoder, WizardCoder - * `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. - * ✅︎ - * ✅︎ -- * `GPTJForCausalLM` - * GPT-J - * `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. - * - * ✅︎ -- * `GPTNeoXForCausalLM` - * GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - * `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. - * - * ✅︎ -- * `GraniteForCausalLM` - * Granite 3.0, Granite 3.1, PowerLM - * `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. - * ✅︎ - * ✅︎ -- * `GraniteMoeForCausalLM` - * Granite 3.0 MoE, PowerMoE - * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. - * ✅︎ - * ✅︎ -- * `GraniteMoeHybridForCausalLM` - * Granite 4.0 MoE Hybrid - * `ibm-granite/granite-4.0-tiny-preview`, etc. - * ✅︎ - * ✅︎ -- * `GraniteMoeSharedForCausalLM` - * Granite MoE Shared - * `ibm-research/moe-7b-1b-active-shared-experts` (test model) - * ✅︎ - * ✅︎ -- * `GritLM` - * GritLM - * `parasail-ai/GritLM-7B-vllm`. - * ✅︎ - * ✅︎ -- * `Grok1ModelForCausalLM` - * Grok1 - * `hpcai-tech/grok-1`. - * ✅︎ - * ✅︎ -- * `InternLMForCausalLM` - * InternLM - * `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. - * ✅︎ - * ✅︎ -- * `InternLM2ForCausalLM` - * InternLM2 - * `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. - * ✅︎ - * ✅︎ -- * `InternLM3ForCausalLM` - * InternLM3 - * `internlm/internlm3-8b-instruct`, etc. - * ✅︎ - * ✅︎ -- * `JAISLMHeadModel` - * Jais - * `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. - * - * ✅︎ -- * `JambaForCausalLM` - * Jamba - * `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. - * ✅︎ - * ✅︎ -- * `LlamaForCausalLM` - * Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - * `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. - * ✅︎ - * ✅︎ -- * `MambaForCausalLM` - * Mamba - * `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. - * - * ✅︎ -- * `MiniCPMForCausalLM` - * MiniCPM - * `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. - * ✅︎ - * ✅︎ -- * `MiniCPM3ForCausalLM` - * MiniCPM3 - * `openbmb/MiniCPM3-4B`, etc. - * ✅︎ - * ✅︎ -- * `MistralForCausalLM` - * Mistral, Mistral-Instruct - * `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. - * ✅︎ - * ✅︎ -- * `MixtralForCausalLM` - * Mixtral-8x7B, Mixtral-8x7B-Instruct - * `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. - * ✅︎ - * ✅︎ -- * `MPTForCausalLM` - * MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - * `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. - * - * ✅︎ -- * `NemotronForCausalLM` - * Nemotron-3, Nemotron-4, Minitron - * `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - * ✅︎ - * ✅︎ -- * `OLMoForCausalLM` - * OLMo - * `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. 
- * - * ✅︎ -- * `OLMo2ForCausalLM` - * OLMo2 - * `allenai/OLMo-2-0425-1B`, etc. - * - * ✅︎ -- * `OLMoEForCausalLM` - * OLMoE - * `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. - * ✅︎ - * ✅︎ -- * `OPTForCausalLM` - * OPT, OPT-IML - * `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. - * - * ✅︎ -- * `OrionForCausalLM` - * Orion - * `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. - * - * ✅︎ -- * `PhiForCausalLM` - * Phi - * `microsoft/phi-1_5`, `microsoft/phi-2`, etc. - * ✅︎ - * ✅︎ -- * `Phi3ForCausalLM` - * Phi-4, Phi-3 - * `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. - * ✅︎ - * ✅︎ -- * `Phi3SmallForCausalLM` - * Phi-3-Small - * `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. - * - * ✅︎ -- * `PhiMoEForCausalLM` - * Phi-3.5-MoE - * `microsoft/Phi-3.5-MoE-instruct`, etc. - * ✅︎ - * ✅︎ -- * `PersimmonForCausalLM` - * Persimmon - * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. - * - * ✅︎ -- * `Plamo2ForCausalLM` - * PLaMo2 - * `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. - * - * -- * `QWenLMHeadModel` - * Qwen - * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. - * ✅︎ - * ✅︎ -- * `Qwen2ForCausalLM` - * QwQ, Qwen2 - * `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - * ✅︎ - * ✅︎ -- * `Qwen2MoeForCausalLM` - * Qwen2MoE - * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - * - * ✅︎ -- * `Qwen3ForCausalLM` - * Qwen3 - * `Qwen/Qwen3-8B`, etc. - * ✅︎ - * ✅︎ -- * `Qwen3MoeForCausalLM` - * Qwen3MoE - * `Qwen/Qwen3-30B-A3B`, etc. - * - * ✅︎ -- * `StableLmForCausalLM` - * StableLM - * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. - * - * ✅︎ -- * `Starcoder2ForCausalLM` - * Starcoder2 - * `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. - * - * ✅︎ -- * `SolarForCausalLM` - * Solar Pro - * `upstage/solar-pro-preview-instruct`, etc. - * ✅︎ - * ✅︎ -- * `TeleChat2ForCausalLM` - * TeleChat2 - * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. - * ✅︎ - * ✅︎ -- * `TeleFLMForCausalLM` - * TeleFLM - * `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. - * ✅︎ - * ✅︎ -- * `XverseForCausalLM` - * XVERSE - * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. - * ✅︎ - * ✅︎ -- * `MiniMaxText01ForCausalLM` - * MiniMax-Text - * `MiniMaxAI/MiniMax-Text-01`, etc. - * - * ✅︎ -- * `Zamba2ForCausalLM` - * Zamba2 - * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. - * - * -- * `MiMoForCausalLM` - * MiMo - * `XiaomiMiMo/MiMo-7B-RL`, etc. - * - * -::: - -:::{note} -Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -::: - -### Pooling Models - -See [this page](pooling-models) for more information on how to use pooling models. - -:::{important} -Since some model architectures support both generative and pooling tasks, -you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -::: - -#### Text Embedding - -Specified using `--task embed`. 
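As a quick sanity check, the snippet below mirrors the offline pattern shown earlier on this page: it loads one of the BERT-based models from the table that follows with `task="embed"` and prints the embedding size. The exact output attribute (`outputs.embedding`) reflects the embedding output structure as currently understood and is included here as a working assumption.

```python
from vllm import LLM

# Minimal sketch: extract a text embedding with a pooling model.
# "BAAI/bge-base-en-v1.5" is one of the BERT-based entries listed below;
# any embedding model from the table should behave the same way.
llm = LLM(model="BAAI/bge-base-en-v1.5", task="embed")

(output,) = llm.encode("Hello, my name is")
embedding = output.outputs.embedding  # list[float]
print(f"Embedding dimension: {len(embedding)}")
```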
- -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `BertModel` - * BERT-based - * `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. - * - * -- * `Gemma2Model` - * Gemma 2-based - * `BAAI/bge-multilingual-gemma2`, etc. - * - * ✅︎ -- * `GritLM` - * GritLM - * `parasail-ai/GritLM-7B-vllm`. - * ✅︎ - * ✅︎ -- * `GteModel` - * Arctic-Embed-2.0-M - * `Snowflake/snowflake-arctic-embed-m-v2.0`. - * - * ︎ -- * `GteNewModel` - * mGTE-TRM (see note) - * `Alibaba-NLP/gte-multilingual-base`, etc. - * ︎ - * ︎ -- * `ModernBertModel` - * ModernBERT-based - * `Alibaba-NLP/gte-modernbert-base`, etc. - * ︎ - * ︎ -- * `NomicBertModel` - * Nomic BERT - * `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. - * ︎ - * ︎ -- * `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. - * Llama-based - * `intfloat/e5-mistral-7b-instruct`, etc. - * ✅︎ - * ✅︎ -- * `Qwen2Model`, `Qwen2ForCausalLM` - * Qwen2-based - * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - * ✅︎ - * ✅︎ -- * `RobertaModel`, `RobertaForMaskedLM` - * RoBERTa-based - * `sentence-transformers/all-roberta-large-v1`, etc. - * - * -- * `XLMRobertaModel` - * XLM-RoBERTa-based - * `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, `Snowflake/snowflake-arctic-embed-l-v2.0`, `jinaai/jina-embeddings-v3`(see note), etc. - * - * -::: - -:::{note} -`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. -You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. -::: - -:::{note} -The HF implementation of `Alibaba-NLP/gte-Qwen2-1.5B-instruct` is hardcoded to use causal attention despite what is shown in `config.json`. To compare vLLM vs HF results, -you should set `--hf-overrides '{"is_causal": true}'` in vLLM so that the two implementations are consistent with each other. - -For both the 1.5B and 7B variants, you also need to enable `--trust-remote-code` for the correct tokenizer to be loaded. -See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). -::: - -:::{note} -`jinaai/jina-embeddings-v3` supports multiple tasks through lora, while vllm temporarily only supports text-matching tasks by merging lora weights. -::: - -:::{note} -The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture. -::: - -If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings -of the whole prompt are extracted from the normalized hidden state corresponding to the last token. - -#### Reward Modeling - -Specified using `--task reward`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `InternLM2ForRewardModel` - * InternLM2-based - * `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. - * ✅︎ - * ✅︎ -- * `LlamaForCausalLM` - * Llama-based - * `peiyi9979/math-shepherd-mistral-7b-prm`, etc. 
- * ✅︎ - * ✅︎ -- * `Qwen2ForRewardModel` - * Qwen2-based - * `Qwen/Qwen2.5-Math-RM-72B`, etc. - * ✅︎ - * ✅︎ -- * `Qwen2ForProcessRewardModel` - * Qwen2-based - * `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. - * ✅︎ - * ✅︎ -::: - -If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. - -:::{important} -For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, -e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -::: - -#### Classification - -Specified using `--task classify`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `JambaForSequenceClassification` - * Jamba - * `ai21labs/Jamba-tiny-reward-dev`, etc. - * ✅︎ - * ✅︎ -- * `Qwen2ForSequenceClassification` - * Qwen2-based - * `jason9693/Qwen2.5-1.5B-apeach`, etc. - * ✅︎ - * ✅︎ -::: - -If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. - -#### Sentence Pair Scoring - -Specified using `--task score`. - -:::{list-table} -:widths: 25 25 50 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `BertForSequenceClassification` - * BERT-based - * `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - * - * -- * `RobertaForSequenceClassification` - * RoBERTa-based - * `cross-encoder/quora-roberta-base`, etc. - * - * -- * `XLMRobertaForSequenceClassification` - * XLM-RoBERTa-based - * `BAAI/bge-reranker-v2-m3`, etc. - * - * -- * `ModernBertForSequenceClassification` - * ModernBert-based - * `Alibaba-NLP/gte-reranker-modernbert-base`, etc. - * - * -::: - -(supported-mm-models)= - -## List of Multimodal Language Models - -The following modalities are supported depending on the model: - -- **T**ext -- **I**mage -- **V**ideo -- **A**udio - -Any combination of modalities joined by `+` are supported. - -- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs. - -On the other hand, modalities separated by `/` are mutually exclusive. - -- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. - -See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. - -:::{important} -**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) -or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: - -Offline inference: - -```python -from vllm import LLM - -llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, -) -``` - -Online serving: - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}' -``` - -**This is no longer required if you are using vLLM V1.** - -::: - -:::{note} -vLLM currently only supports adding LoRA to the language backbone of multimodal models. 
-::: - -### Generative Models - -See [this page](#generative-models) for more information on how to use generative models. - -#### Text Generation - -Specified using `--task generate`. - -:::{list-table} -:widths: 25 25 15 20 5 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Inputs - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) - * [V1](gh-issue:8779) -- * `AriaForConditionalGeneration` - * Aria - * T + I<sup>+</sup> - * `rhymes-ai/Aria` - * - * ✅︎ - * ✅︎ -- * `AyaVisionForConditionalGeneration` - * Aya Vision - * T + I<sup>+</sup> - * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. - * - * ✅︎ - * ✅︎ -- * `Blip2ForConditionalGeneration` - * BLIP-2 - * T + I<sup>E</sup> - * `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - * - * ✅︎ - * ✅︎ -- * `ChameleonForConditionalGeneration` - * Chameleon - * T + I - * `facebook/chameleon-7b` etc. - * - * ✅︎ - * ✅︎ -- * `DeepseekVLV2ForCausalLM`<sup>^</sup> - * DeepSeek-VL2 - * T + I<sup>+</sup> - * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. - * - * ✅︎ - * ✅︎ -- * `Florence2ForConditionalGeneration` - * Florence-2 - * T + I - * `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. - * - * - * -- * `FuyuForCausalLM` - * Fuyu - * T + I - * `adept/fuyu-8b` etc. - * - * ✅︎ - * ✅︎ -- * `Gemma3ForConditionalGeneration` - * Gemma 3 - * T + I<sup>+</sup> - * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. - * ✅︎ - * ✅︎ - * ⚠️ -- * `GLM4VForCausalLM`<sup>^</sup> - * GLM-4V - * T + I - * `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `GraniteSpeechForConditionalGeneration` - * Granite Speech - * T + A - * `ibm-granite/granite-speech-3.3-8b` - * ✅︎ - * ✅︎ - * ✅︎ -- * `H2OVLChatModel` - * H2OVL - * T + I<sup>E+</sup> - * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. - * - * ✅︎ - * ✅︎\* -- * `Idefics3ForConditionalGeneration` - * Idefics3 - * T + I - * `HuggingFaceM4/Idefics3-8B-Llama3` etc. - * ✅︎ - * - * ✅︎ -- * `InternVLChatModel` - * InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 - * T + I<sup>E+</sup> - * `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. - * - * ✅︎ - * ✅︎ -- * `KimiVLForConditionalGeneration` - * Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking - * T + I<sup>+</sup> - * `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` - * - * - * ✅︎ -- * `Llama4ForConditionalGeneration` - * Llama 4 - * T + I<sup>+</sup> - * `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. - * - * ✅︎ - * ✅︎ -- * `LlavaForConditionalGeneration` - * LLaVA-1.5 - * T + I<sup>E+</sup> - * `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - * - * ✅︎ - * ✅︎ -- * `LlavaNextForConditionalGeneration` - * LLaVA-NeXT - * T + I<sup>E+</sup> - * `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - * - * ✅︎ - * ✅︎ -- * `LlavaNextVideoForConditionalGeneration` - * LLaVA-NeXT-Video - * T + V - * `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - * - * ✅︎ - * ✅︎ -- * `LlavaOnevisionForConditionalGeneration` - * LLaVA-Onevision - * T + I<sup>+</sup> + V<sup>+</sup> - * `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. 
- * - * ✅︎ - * ✅︎ -- * `MiniCPMO` - * MiniCPM-O - * T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> - * `openbmb/MiniCPM-o-2_6`, etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `MiniCPMV` - * MiniCPM-V - * T + I<sup>E+</sup> + V<sup>E+</sup> - * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `MiniMaxVL01ForConditionalGeneration` - * MiniMax-VL - * T + I<sup>E+</sup> - * `MiniMaxAI/MiniMax-VL-01`, etc. - * - * ✅︎ - * ✅︎ -- * `Mistral3ForConditionalGeneration` - * Mistral3 - * T + I<sup>+</sup> - * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `MllamaForConditionalGeneration` - * Llama 3.2 - * T + I<sup>+</sup> - * `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. - * - * - * -- * `MolmoForCausalLM` - * Molmo - * T + I<sup>+</sup> - * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `NVLM_D_Model` - * NVLM-D 1.0 - * T + I<sup>+</sup> - * `nvidia/NVLM-D-72B`, etc. - * - * ✅︎ - * ✅︎ -- * `Ovis` - * Ovis2, Ovis1.6 - * T + I<sup>+</sup> - * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. - * - * - * ✅︎ -- * `PaliGemmaForConditionalGeneration` - * PaliGemma, PaliGemma 2 - * T + I<sup>E</sup> - * `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. - * - * ✅︎ - * ⚠️ -- * `Phi3VForCausalLM` - * Phi-3-Vision, Phi-3.5-Vision - * T + I<sup>E+</sup> - * `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - * - * ✅︎ - * ✅︎ -- * `Phi4MMForCausalLM` - * Phi-4-multimodal - * T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> - * `microsoft/Phi-4-multimodal-instruct`, etc. - * ✅︎ - * - * ✅︎ -- * `PixtralForConditionalGeneration` - * Pixtral - * T + I<sup>+</sup> - * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. - * - * ✅︎ - * ✅︎ -- * `QwenVLForConditionalGeneration`<sup>^</sup> - * Qwen-VL - * T + I<sup>E+</sup> - * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `Qwen2AudioForConditionalGeneration` - * Qwen2-Audio - * T + A<sup>+</sup> - * `Qwen/Qwen2-Audio-7B-Instruct` - * - * ✅︎ - * ✅︎ -- * `Qwen2VLForConditionalGeneration` - * QVQ, Qwen2-VL - * T + I<sup>E+</sup> + V<sup>E+</sup> - * `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `Qwen2_5_VLForConditionalGeneration` - * Qwen2.5-VL - * T + I<sup>E+</sup> + V<sup>E+</sup> - * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. - * ✅︎ - * ✅︎ - * ✅︎ -- * `Qwen2_5OmniThinkerForConditionalGeneration` - * Qwen2.5-Omni - * T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> - * `Qwen/Qwen2.5-Omni-7B` - * - * ✅︎ - * ✅︎\* -- * `SkyworkR1VChatModel` - * Skywork-R1V-38B - * T + I - * `Skywork/Skywork-R1V-38B` - * - * ✅︎ - * ✅︎ -- * `SmolVLMForConditionalGeneration` - * SmolVLM2 - * T + I - * `SmolVLM2-2.2B-Instruct` - * - * ✅︎ - * ✅︎ -- * `UltravoxModel` - * Ultravox - * T + A<sup>E+</sup> - * `fixie-ai/ultravox-v0_5-llama-3_2-1b` - * ✅︎ - * ✅︎ - * ✅︎ -::: - -<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM. -    • For example, to use DeepSeek-VL2 series models: -      `--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` -<sup>E</sup> Pre-computed embeddings can be inputted for this modality. -<sup>+</sup> Multiple items can be inputted per text prompt for this modality. 
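As a concrete illustration of the <sup>^</sup> footnote above, the architecture override can also be passed offline (a minimal sketch; the model name is taken from the DeepSeek-VL2 row of the table, and `hf_overrides` is used here as the offline counterpart of the `--hf-overrides` flag):

```python
from vllm import LLM

# A sketch of the DeepSeek-VL2 override from the footnote above, passed via
# the offline API instead of the CLI flag.
llm = LLM(
    model="deepseek-ai/deepseek-vl2-tiny",
    hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)
```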
- -:::{warning} -Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs. -However, there are differences in how they handle text + image inputs: - -V0 correctly implements the model's attention pattern: -- Uses bidirectional attention between the image tokens corresponding to the same image -- Uses causal attention for other tokens -- Implemented via (naive) PyTorch SDPA with masking tensors -- Note: May use significant memory for long prompts with images - -V1 currently uses a simplified attention pattern: -- Uses causal attention for all tokens, including image tokens -- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}` is set -- Will be updated in the future to support the correct behavior - -This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. -::: - -:::{note} -`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80. -::: - -:::{note} -To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. -::: - -:::{warning} -The output quality of `allenai/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates. - -For the best results, we recommend using the following dependency versions (tested on A10 and L40): - -```text -# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) -torch==2.5.1 -torchvision==0.20.1 -transformers==4.48.1 -tokenizers==0.21.0 -tiktoken==0.7.0 -vllm==0.7.0 - -# Optional but recommended for improved performance and stability -triton==3.1.0 -xformers==0.0.28.post3 -uvloop==0.21.0 -protobuf==5.29.3 -openai==1.60.2 -opencv-python-headless==4.11.0.86 -pillow==10.4.0 - -# Installed FlashAttention (for float16 only) -flash-attn>=2.5.6 # Not used in float32, but should be documented -``` - -**Note:** Make sure you understand the security implications of using outdated packages. -::: - -:::{note} -The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. -For more details, please see: <gh-pr:4087#issuecomment-2250397630> -::: - -:::{warning} -Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1. -::: - -:::{note} -To use Qwen2.5-Omni, you have to install the Hugging Face Transformers library from source via -`pip install git+https://github.com/huggingface/transformers.git`. - -Reading audio from video during pre-processing is currently supported on V0 (but not V1), because overlapping modalities are not yet supported in V1. -Enable it with `--mm-processor-kwargs '{"use_audio_in_video": true}'`. -::: - -### Pooling Models - -See [this page](pooling-models) for more information on how to use pooling models. - -:::{important} -Since some model architectures support both generative and pooling tasks, -you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -::: - -#### Text Embedding - -Specified using `--task embed`. - -Any text generation model can be converted into an embedding model by passing `--task embed`. - -:::{note} -To get the best results, you should use pooling models that are specifically trained as such. -::: - -The following table lists those that are tested in vLLM.
- -:::{list-table} -:widths: 25 25 15 25 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Inputs - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `LlavaNextForConditionalGeneration` - * LLaVA-NeXT-based - * T / I - * `royokong/e5-v` - * - * ✅︎ -- * `Phi3VForCausalLM` - * Phi-3-Vision-based - * T + I - * `TIGER-Lab/VLM2Vec-Full` - * 🚧 - * ✅︎ -- * `Qwen2VLForConditionalGeneration` - * Qwen2-VL-based - * T + I - * `MrLight/dse-qwen2-2b-mrl-v1` - * - * ✅︎ -::: - -#### Transcription - -Specified using `--task transcription`. - -Speech2Text models trained specifically for Automatic Speech Recognition. - -:::{list-table} -:widths: 25 25 25 5 5 -:header-rows: 1 - -- * Architecture - * Models - * Example HF Models - * [LoRA](#lora-adapter) - * [PP](#distributed-serving) -- * `Whisper` - * Whisper-based - * `openai/whisper-large-v3-turbo` - * 🚧 - * 🚧 -::: - -_________________ - -## Model Support Policy - -At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: - -1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! - -2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. - - :::{tip} - When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. - ::: - -3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. - -4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. - -5. 
**Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. - -Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. - -Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard. - -We have the following levels of testing for models: - -1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. -2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test. -4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md deleted file mode 100644 index 9325a2406e8c..000000000000 --- a/docs/source/serving/engine_args.md +++ /dev/null @@ -1,36 +0,0 @@ -(engine-args)= - -# Engine Arguments - -Engine arguments control the behavior of the vLLM engine. - -- For [offline inference](#offline-inference), they are part of the arguments to `LLM` class. -- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`. - -For references to all arguments available from `vllm serve` see the [serve args](#serve-args) documentation. - -Below, you can find an explanation of every engine argument: - -<!--- pyml disable-num-lines 7 no-space-in-emphasis --> -```{eval-rst} -.. argparse:: - :module: vllm.engine.arg_utils - :func: _engine_args_parser - :prog: vllm serve - :nodefaultconst: - :markdownhelp: -``` - -## Async Engine Arguments - -Additional arguments are available to the asynchronous engine which is used for online serving: - -<!--- pyml disable-num-lines 7 no-space-in-emphasis --> -```{eval-rst} -.. argparse:: - :module: vllm.engine.arg_utils - :func: _async_engine_args_parser - :prog: vllm serve - :nodefaultconst: - :markdownhelp: -``` diff --git a/docs/source/serving/env_vars.md b/docs/source/serving/env_vars.md deleted file mode 100644 index 9845241930a4..000000000000 --- a/docs/source/serving/env_vars.md +++ /dev/null @@ -1,15 +0,0 @@ -# Environment Variables - -vLLM uses the following environment variables to configure the system: - -:::{warning} -Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. 
If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. - -All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -::: - -:::{literalinclude} ../../../vllm/envs.py -:end-before: end-env-vars-definition -:language: python -:start-after: begin-env-vars-definition -::: diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md deleted file mode 100644 index e2b4c0814605..000000000000 --- a/docs/source/serving/integrations/index.md +++ /dev/null @@ -1,8 +0,0 @@ -# External Integrations - -:::{toctree} -:maxdepth: 1 - -langchain -llamaindex -::: diff --git a/docs/source/serving/serve_args.md b/docs/source/serving/serve_args.md deleted file mode 100644 index edb49f4ba6de..000000000000 --- a/docs/source/serving/serve_args.md +++ /dev/null @@ -1,47 +0,0 @@ -(serve-args)= - -# Server Arguments - -The `vllm serve` command is used to launch the OpenAI-compatible server. - -## CLI Arguments - -The following are all arguments available from the `vllm serve` command: - -<!--- pyml disable-num-lines 7 no-space-in-emphasis --> -```{eval-rst} -.. argparse:: - :module: vllm.entrypoints.openai.cli_args - :func: create_parser_for_docs - :prog: vllm serve - :nodefaultconst: - :markdownhelp: -``` - -## Configuration file - -You can load CLI arguments via a [YAML](https://yaml.org/) config file. -The argument names must be the long form of those outlined [above](#serve-args). - -For example: - -```yaml -# config.yaml - -model: meta-llama/Llama-3.1-8B-Instruct -host: "127.0.0.1" -port: 6379 -uvicorn-log-level: "info" -``` - -To use the above config file: - -```bash -vllm serve --config config.yaml -``` - -:::{note} -In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. -The order of priorities is `command line > config file values > defaults`. -e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. -::: diff --git a/docs/source/training/rlhf.md b/docs/training/rlhf.md similarity index 100% rename from docs/source/training/rlhf.md rename to docs/training/rlhf.md diff --git a/docs/source/training/trl.md b/docs/training/trl.md similarity index 66% rename from docs/source/training/trl.md rename to docs/training/trl.md index ebdf593dbde5..c7c1a5a3bbd1 100644 --- a/docs/source/training/trl.md +++ b/docs/training/trl.md @@ -6,8 +6,7 @@ Online methods such as GRPO or Online DPO require the model to generate completi See the guide [vLLM for fast generation in online methods](https://huggingface.co/docs/trl/main/en/speeding_up_training#vllm-for-fast-generation-in-online-methods) in the TRL documentation for more information. -:::{seealso} -For more information on the `use_vllm` flag you can provide to the configs of these online methods, see: -- [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm) -- [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm) -::: +!!! 
info + For more information on the `use_vllm` flag you can provide to the configs of these online methods, see: + - [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm) + - [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm) diff --git a/mkdocs.yaml b/mkdocs.yaml new file mode 100644 index 000000000000..a1c6319bb008 --- /dev/null +++ b/mkdocs.yaml @@ -0,0 +1,117 @@ +site_name: vLLM +site_url: https://docs.vllm.ai +repo_url: https://github.com/vllm-project/vllm +exclude_docs: | + *.inc.md + *.template.md +theme: + name: material + logo: assets/logos/vllm-logo-only-light.ico + favicon: assets/logos/vllm-logo-only-light.ico + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to light mode + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: white + toggle: + icon: material/brightness-7 + name: Switch to dark mode + # Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + toggle: + icon: material/brightness-2 + name: Switch to system preference + features: + - content.code.copy + - content.tabs.link + - navigation.tracking + - navigation.tabs + - navigation.sections + - navigation.prune + - navigation.top + - search.highlight + - search.share + - toc.follow + custom_dir: docs/mkdocs/overrides + +hooks: + - docs/mkdocs/hooks/remove_announcement.py + - docs/mkdocs/hooks/generate_examples.py + - docs/mkdocs/hooks/url_schemes.py + +# Required to stop api-autonav from raising an error +# https://github.com/tlambert03/mkdocs-api-autonav/issues/16 +nav: + - api + +plugins: + - meta + - search + - autorefs + - awesome-nav + # For API reference generation + - api-autonav: + modules: ["vllm"] + api_root_uri: "api" + - mkdocstrings: + handlers: + python: + options: + show_symbol_type_heading: true + show_symbol_type_toc: true + summary: + modules: true + show_if_no_docstring: true + show_signature_annotations: true + separate_signature: true + show_overloads: true + signature_crossrefs: true + inventories: + - https://docs.python.org/3/objects.inv + - https://typing-extensions.readthedocs.io/en/latest/objects.inv + - https://docs.aiohttp.org/en/stable/objects.inv + - https://pillow.readthedocs.io/en/stable/objects.inv + - https://numpy.org/doc/stable/objects.inv + - https://pytorch.org/docs/stable/objects.inv + - https://psutil.readthedocs.io/en/stable/objects.inv + +markdown_extensions: + - attr_list + - md_in_html + - admonition + - pymdownx.details + # For content tabs + - pymdownx.superfences + - pymdownx.tabbed: + slugify: !!python/object/apply:pymdownx.slugs.slugify + kwds: + case: lower + alternate_style: true + # For code highlighting + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + # For emoji and icons + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + # For in page [TOC] (not sidebar) + - toc: + permalink: true + # For math rendering + - mdx_math: + enable_dollar_delimiter: true + +extra_javascript: + - mkdocs/javascript/run_llm_widget.js + - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML diff --git a/pyproject.toml b/pyproject.toml index 3011cffb8f1e..29186d5ff027 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -165,9 +165,11 @@ markers = [ [tool.pymarkdown] plugins.md004.style = "sublist" # ul-style +plugins.md007.indent = 4 # ul-indent plugins.md013.enabled = false # line-length plugins.md041.enabled = false # first-line-h1 plugins.md033.enabled = false # inline-html +plugins.md046.enabled = false # code-block-style plugins.md024.allow_different_nesting = true # no-duplicate-headers [tool.ty] diff --git a/requirements/docs.txt b/requirements/docs.txt index 9c267edaceaf..a1f51334ed81 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,19 +1,8 @@ -sphinx==7.4.7 -sphinx-argparse==0.5.2 -sphinx-book-theme==1.1.4 -sphinx-copybutton==0.5.2 -sphinx-design==0.6.1 -sphinx-togglebutton==0.3.2 -myst-parser==3.0.1 # `myst-parser==4.0.1` breaks inline code in titles -msgspec -snowballstemmer<3 # https://github.com/snowballstem/snowball/issues/229 -commonmark # Required by sphinx-argparse when using :markdownhelp: - -# Custom autodoc2 is necessary for faster docstring processing -# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035 -git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0 - -# packages to install to build the documentation -cachetools --f https://download.pytorch.org/whl/cpu -torch \ No newline at end of file +mkdocs +mkdocs-api-autonav +mkdocs-material +mkdocstrings-python +mkdocs-gen-files +mkdocs-awesome-nav +python-markdown-math +ruff diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2a27afe9757e..c48d8a386969 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1263,12 +1263,10 @@ def _advance_to_next_step( def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. - :::{figure} https://i.imgur.com/sv2HssD.png - :alt: Overview of the step function - :align: center - - Overview of the step function. - ::: + <figure markdown="span"> + ![Overview of the step function](https://i.imgur.com/sv2HssD.png) + <figcaption>Overview of the step function</figcaption> + </figure> Details: - Step 1: Schedules the sequences to be executed in the next diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 033551d07c39..34b48f83b643 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -29,7 +29,7 @@ # to extract the metrics definitions. -# begin-metrics-definitions +# --8<-- [start:metrics-definitions] class Metrics: """ vLLM uses a multiprocessing-based frontend for the OpenAI server. @@ -293,7 +293,7 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): labelnames=labelnames)) -# end-metrics-definitions +# --8<-- [end:metrics-definitions] def _unregister_vllm_metrics(self) -> None: for collector in list(prometheus_client.REGISTRY._collector_to_names): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 52b50229b8d1..0465302c5a1c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -131,10 +131,9 @@ class LLM: **kwargs: Arguments for {class}`~vllm.EngineArgs`. (See {ref}`engine-args`) - :::{note} - This class is intended to be used for offline inference. For online - serving, use the {class}`~vllm.AsyncLLMEngine` class instead. - ::: + Note: + This class is intended to be used for offline inference. For online + serving, use the {class}`~vllm.AsyncLLMEngine` class instead. 
""" DEPRECATE_LEGACY: ClassVar[bool] = True @@ -422,11 +421,10 @@ def generate( A list of `RequestOutput` objects containing the generated completions in the same order as the input prompts. - :::{note} - Using `prompts` and `prompt_token_ids` as keyword parameters is - considered legacy and may be deprecated in the future. You should - instead pass them via the `inputs` parameter. - ::: + Note: + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the `inputs` parameter. """ runner_type = self.llm_engine.model_config.runner_type if runner_type not in ["generate", "transcription"]: @@ -502,10 +500,9 @@ def collective_rpc(self, Returns: A list containing the results from each worker. - :::{note} - It is recommended to use this API to only pass control messages, - and set up data-plane communication to pass data. - ::: + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. """ return self.llm_engine.collective_rpc(method, timeout, args, kwargs) @@ -924,11 +921,10 @@ def encode( A list of `PoolingRequestOutput` objects containing the pooled hidden states in the same order as the input prompts. - :::{note} - Using `prompts` and `prompt_token_ids` as keyword parameters is - considered legacy and may be deprecated in the future. You should - instead pass them via the `inputs` parameter. - ::: + Note: + Using `prompts` and `prompt_token_ids` as keyword parameters is + considered legacy and may be deprecated in the future. You should + instead pass them via the `inputs` parameter. """ runner_type = self.llm_engine.model_config.runner_type if runner_type != "pooling": diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 5ab2356a0898..da01eb472c44 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -251,7 +251,7 @@ class ChatCompletionRequest(OpenAIBaseModel): parallel_tool_calls: Optional[bool] = False user: Optional[str] = None - # doc: begin-chat-completion-sampling-params + # --8<-- [start:chat-completion-sampling-params] best_of: Optional[int] = None use_beam_search: bool = False top_k: Optional[int] = None @@ -266,9 +266,9 @@ class ChatCompletionRequest(OpenAIBaseModel): spaces_between_special_tokens: bool = True truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None prompt_logprobs: Optional[int] = None - # doc: end-chat-completion-sampling-params + # --8<-- [end:chat-completion-sampling-params] - # doc: begin-chat-completion-extra-params + # --8<-- [start:chat-completion-extra-params] echo: bool = Field( default=False, description=( @@ -407,7 +407,7 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description="KVTransfer parameters used for disaggregated serving.") - # doc: end-chat-completion-extra-params + # --8<-- [end:chat-completion-extra-params] # Default sampling parameters for chat completion requests _DEFAULT_SAMPLING_PARAMS: dict = { @@ -764,7 +764,7 @@ class CompletionRequest(OpenAIBaseModel): top_p: Optional[float] = None user: Optional[str] = None - # doc: begin-completion-sampling-params + # --8<-- [start:completion-sampling-params] use_beam_search: bool = False top_k: Optional[int] = None min_p: Optional[float] = None @@ -779,9 +779,9 @@ class CompletionRequest(OpenAIBaseModel): truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None allowed_token_ids: Optional[list[int]] = None 
prompt_logprobs: Optional[int] = None - # doc: end-completion-sampling-params + # --8<-- [end:completion-sampling-params] - # doc: begin-completion-extra-params + # --8<-- [start:completion-extra-params] add_special_tokens: bool = Field( default=True, description=( @@ -858,7 +858,7 @@ class CompletionRequest(OpenAIBaseModel): default=None, description="KVTransfer parameters used for disaggregated serving.") - # doc: end-completion-extra-params + # --8<-- [end:completion-extra-params] # Default sampling parameters for completion requests _DEFAULT_SAMPLING_PARAMS: dict = { @@ -1045,11 +1045,11 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): user: Optional[str] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-embedding-pooling-params + # --8<-- [start:embedding-pooling-params] additional_data: Optional[Any] = None - # doc: end-embedding-pooling-params + # --8<-- [end:embedding-pooling-params] - # doc: begin-embedding-extra-params + # --8<-- [start:embedding-extra-params] add_special_tokens: bool = Field( default=True, description=( @@ -1064,7 +1064,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-embedding-extra-params + # --8<-- [end:embedding-extra-params] def to_pooling_params(self): return PoolingParams(dimensions=self.dimensions, @@ -1080,11 +1080,11 @@ class EmbeddingChatRequest(OpenAIBaseModel): user: Optional[str] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-chat-embedding-pooling-params + # --8<-- [start:chat-embedding-pooling-params] additional_data: Optional[Any] = None - # doc: end-chat-embedding-pooling-params + # --8<-- [end:chat-embedding-pooling-params] - # doc: begin-chat-embedding-extra-params + # --8<-- [start:chat-embedding-extra-params] add_special_tokens: bool = Field( default=False, description=( @@ -1118,7 +1118,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): "default: 0). 
Any priority other than 0 will raise an error " "if the served model does not use priority scheduling."), ) - # doc: end-chat-embedding-extra-params + # --8<-- [end:chat-embedding-extra-params] @model_validator(mode="before") @classmethod @@ -1147,11 +1147,11 @@ class ScoreRequest(OpenAIBaseModel): text_2: Union[list[str], str] truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-score-pooling-params + # --8<-- [start:score-pooling-params] additional_data: Optional[Any] = None - # doc: end-score-pooling-params + # --8<-- [end:score-pooling-params] - # doc: begin-score-extra-params + # --8<-- [start:score-extra-params] priority: int = Field( default=0, description=( @@ -1160,7 +1160,7 @@ class ScoreRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-score-extra-params + # --8<-- [end:score-extra-params] def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) @@ -1173,11 +1173,11 @@ class RerankRequest(OpenAIBaseModel): top_n: int = Field(default_factory=lambda: 0) truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None - # doc: begin-rerank-pooling-params + # --8<-- [start:rerank-pooling-params] additional_data: Optional[Any] = None - # doc: end-rerank-pooling-params + # --8<-- [end:rerank-pooling-params] - # doc: begin-rerank-extra-params + # --8<-- [start:rerank-extra-params] priority: int = Field( default=0, description=( @@ -1186,7 +1186,7 @@ class RerankRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-rerank-extra-params + # --8<-- [end:rerank-extra-params] def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) @@ -1321,11 +1321,11 @@ class ClassificationRequest(OpenAIBaseModel): truncate_prompt_tokens: Optional[int] = None user: Optional[str] = None - # doc: begin-classification-pooling-params + # --8<-- [start:classification-pooling-params] additional_data: Optional[Any] = None - # doc: end-classification-pooling-params + # --8<-- [end:classification-pooling-params] - # doc: begin-classification-extra-params + # --8<-- [start:classification-extra-params] priority: int = Field( default=0, description=( @@ -1334,7 +1334,7 @@ class ClassificationRequest(OpenAIBaseModel): "if the served model does not use priority scheduling."), ) - # doc: end-classification-extra-params + # --8<-- [end:classification-extra-params] def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) @@ -1698,7 +1698,7 @@ class TranscriptionRequest(OpenAIBaseModel): timestamps incurs additional latency. """ - # doc: begin-transcription-extra-params + # --8<-- [start:transcription-extra-params] stream: Optional[bool] = False """Custom field not present in the original OpenAI definition. When set, it will enable output to be streamed in a similar fashion as the Chat @@ -1707,9 +1707,9 @@ class TranscriptionRequest(OpenAIBaseModel): # Flattened stream option to simplify form data. stream_include_usage: Optional[bool] = False stream_continuous_usage_stats: Optional[bool] = False - # doc: end-transcription-extra-params + # --8<-- [end:transcription-extra-params] - # doc: begin-transcription-sampling-params + # --8<-- [start:transcription-sampling-params] temperature: float = Field(default=0.0) """The sampling temperature, between 0 and 1. 
@@ -1743,7 +1743,7 @@ class TranscriptionRequest(OpenAIBaseModel): presence_penalty: Optional[float] = 0.0 """The presence penalty to use for sampling.""" - # doc: end-transcription-sampling-params + # --8<-- [end:transcription-sampling-params] # Default sampling parameters for transcription requests. _DEFAULT_SAMPLING_PARAMS: dict = { diff --git a/vllm/envs.py b/vllm/envs.py index dc23c8ea5314..2d330b8fbee8 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -175,7 +175,7 @@ def get_vllm_port() -> Optional[int]: # The begin-* and end* here are used by the documentation generator # to extract the used env vars. -# begin-env-vars-definition +# --8<-- [start:env-vars-definition] environment_variables: dict[str, Callable[[], Any]] = { @@ -813,7 +813,7 @@ def get_vllm_port() -> Optional[int]: lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), } -# end-env-vars-definition +# --8<-- [end:env-vars-definition] def __getattr__(name: str): diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 9b0b98731e03..8e67c7a41bb1 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -528,12 +528,12 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: ray.get(parallel_worker_tasks) def _check_ray_cgraph_installation(self): - import pkg_resources + import importlib.metadata + from packaging import version required_version = version.parse("2.43.0") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) + current_version = version.parse(importlib.metadata.version("ray")) if current_version < required_version: raise ValueError(f"Ray version {required_version} is " f"required, but found {current_version}") diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 2ff7e394a416..db0dd2051d52 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -681,9 +681,8 @@ def forward( batch. pixel_values: The pixels in each input image. - :::{seealso} - {class}`Blip2ImageInputs` - ::: + Info: + [Blip2ImageInputs][] """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 95c1a0ca0b98..ced71b6dcdeb 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -721,9 +721,8 @@ def forward( batch. pixel_values: The pixels in each input image. - :::{seealso} - {class}`LlavaImageInputs` - ::: + Info: + [LlavaImageInputs][] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 581a32325d4c..10261aa423c0 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -551,9 +551,8 @@ def forward( pixel_values: The pixels in each grid patch for each input image. image_sizes: The original `(height, width)` for each input image. - :::{seealso} - {class}`LlavaNextImageInputs` - ::: + Info: + [LlavaNextImageInputs][] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 2b9cbf10440a..051a73120838 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -559,9 +559,8 @@ def forward( batch. pixel_values: The pixels in each input image. 
- :::{seealso} - {class}`Mistral3ImagePixelInputs` - ::: + Info: + [Mistral3ImagePixelInputs][] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 756ea11311da..70568a195fd8 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -11,9 +11,8 @@ The global {class}`~MultiModalRegistry` is used by model runners to dispatch data processing according to the target model. -:::{seealso} -{ref}`mm-processing` -::: +Info: + {ref}`mm-processing` """ __all__ = [ diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2335af843ed5..71ef1a98e0d0 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -289,9 +289,8 @@ def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - :::{seealso} - {func}`MultiModalFieldConfig.batched` - ::: + Info: + [MultiModalFieldConfig.batched][] """ def build_elems( @@ -320,10 +319,9 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - :::{seealso} - {func}`MultiModalFieldConfig.flat` - {func}`MultiModalFieldConfig.flat_from_sizes` - ::: + Info: + [MultiModalFieldConfig.flat][] + [MultiModalFieldConfig.flat_from_sizes][] """ slices: Union[Sequence[slice], Sequence[Sequence[slice]]] dim: int = 0 @@ -363,9 +361,8 @@ def _expect_same_shape(tensor: torch.Tensor): @dataclass(frozen=True) class MultiModalSharedField(BaseMultiModalField): """ - :::{seealso} - {func}`MultiModalFieldConfig.shared` - ::: + Info: + [MultiModalFieldConfig.shared][] """ batch_size: int @@ -510,9 +507,8 @@ def flat_from_sizes(modality: str, Element 3: [[C],[C]] ``` - :::{seealso} - {func}`MultiModalFieldConfig.flat` - ::: + Info: + [MultiModalFieldConfig.flat][] """ if size_per_item.ndim != 1: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 67d0d7fc1183..8a27d866e88e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -214,9 +214,8 @@ def register_processor( When the model receives multi-modal data, the provided function is invoked to transform the data into a dictionary of model inputs. - :::{seealso} - {ref}`mm-processing` - ::: + Info: + {ref}`mm-processing` """ def wrapper(model_cls: N) -> N: @@ -260,9 +259,8 @@ def create_processor( """ Create a multi-modal processor for a specific model and tokenizer. - :::{seealso} - {ref}`mm-processing` - ::: + Info: + {ref}`mm-processing` """ if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") diff --git a/vllm/utils.py b/vllm/utils.py index bfc01972bbd2..fcc0ab3b237a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1926,9 +1926,8 @@ class _PlaceholderBase: We need to explicitly override each dunder method because {meth}`__getattr__` is not called when they are accessed. 
- :::{seealso} - [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) - ::: + Info: + [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) """ def __getattr__(self, key: str) -> Never: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2b945cc4111a..a7c70fec042c 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -172,10 +172,9 @@ def determine_available_memory(self) -> int: Then, it calculate the free memory that can be used for KV cache in bytes. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 42882992f2da..d7fe0fe0fe4c 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -201,10 +201,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 41546462e5c4..5e3b6e4b62ea 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -234,10 +234,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 65085f80f97a..a78a41e03ea3 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -93,10 +93,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: Then, it calculate the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. - :::{tip} - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - ::: + Tip: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. 
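The `gpu_memory_utilization` parameter referenced in the docstring tips above is an ordinary engine argument; a minimal offline sketch (the model name is illustrative only):

```python
from vllm import LLM

# A sketch: limit vLLM to ~80% of the device memory, the knob named in the
# worker docstrings above when profiling free memory for the KV cache.
llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.8)
```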
From fbb13a2c15bf505914667ae5db9d5cde0847c7b8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Fri, 23 May 2025 17:18:22 +0800 Subject: [PATCH 087/192] Revert "[V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (#18034)" (#18600) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/transformers_utils/configs/eagle.py | 3 ++- vllm/v1/spec_decode/eagle.py | 6 +----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 31e3172c61eb..377523efefc3 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -70,7 +70,8 @@ def __init__(self, if self.model is not None: for k, v in self.model.to_dict().items(): - setattr(self, k, v) + if not hasattr(self, k): + setattr(self, k, v) @classmethod def from_pretrained( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 671b98544387..460d645a1a6c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -9,7 +9,6 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.triton_utils import tl, triton from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata @@ -311,10 +310,7 @@ def load_model(self, target_model: nn.Module) -> None: if self.vllm_config.speculative_config.method != "eagle3" and \ hasattr(target_model, "lm_head"): logger.info("Loading EAGLE LM head weights from the target model.") - if supports_multimodal(target_model): - self.model.lm_head = target_model.get_language_model().lm_head - else: - self.model.lm_head = target_model.lm_head + self.model.lm_head = target_model.lm_head @torch.inference_mode() def dummy_run( From 4ce64e2df48649c4873f828b8bf71790aa1e56ee Mon Sep 17 00:00:00 2001 From: Mengqing Cao <cmq0113@163.com> Date: Fri, 23 May 2025 17:39:05 +0800 Subject: [PATCH 088/192] [Bugfix][Model] Fix baichuan model loader for tp (#18597) Signed-off-by: Mengqing Cao <cmq0113@163.com> --- vllm/model_executor/models/baichuan.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 077e36176430..bcff6eb3fd31 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -42,7 +42,8 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, row_parallel_weight_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -384,7 +385,7 @@ def __init__( lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - + self.tp_size = get_tensor_model_parallel_world_size() self.quant_config = quant_config self.model = BaiChuanModel(vllm_config=vllm_config, prefix=prefix, @@ -438,8 +439,10 @@ def lm_head_weight_loader(self, param: nn.Parameter, is_baichuan2 = self.config.vocab_size == 125696 if is_baichuan2: loaded_weight = 
torch.nn.functional.normalize(loaded_weight) - - default_weight_loader(param, loaded_weight) + if self.tp_size > 1: + row_parallel_weight_loader(param, loaded_weight) + else: + default_weight_loader(param, loaded_weight) class BaichuanForCausalLM(BaiChuanBaseForCausalLM): From e493e48524e9e78ab33eafec6461b3940e361189 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan <shadeMe@users.noreply.github.com> Date: Fri, 23 May 2025 12:38:23 +0200 Subject: [PATCH 089/192] [V0][Bugfix] Fix parallel sampling performance regression when guided decoding is enabled (#17731) Signed-off-by: Madeesh Kannan <shadeMe@users.noreply.github.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> --- .../guidance_logits_processors.py | 26 ++++++++++++++++--- .../outlines_logits_processors.py | 12 +++++++++ .../guided_decoding/xgrammar_decoding.py | 8 +++--- vllm/sequence.py | 2 +- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py index 4b45c272adc5..e17df68b4b4d 100644 --- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py +++ b/vllm/model_executor/guided_decoding/guidance_logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import copy import os from typing import Any @@ -34,9 +35,24 @@ def __init__( self.grammar = grammar self.tokenizer = tokenizer self.tokenizer_name = tokenizer.name_or_path + self.ll_tokenizer = None + self.ll_matcher = None + self.bitmask = None self.new_sampling = False self.initialized = False + def clone(self) -> "GuidanceLogitsProcessor": + cloned = copy.copy(self) + if self.initialized: + cloned.ll_matcher = llguidance.LLMatcher( + self.ll_tokenizer, # type: ignore[assignment] + self.grammar, + log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")), + ) + self.bitmask = llguidance.torch.allocate_token_bitmask( + 1, self.ll_tokenizer.vocab_size) # type: ignore[attr-defined] + return cloned + def _initialize(self): if self.initialized: return @@ -56,7 +72,7 @@ def _initialize(self): # create reusable bitmask self.bitmask = llguidance.torch.allocate_token_bitmask( - 1, self.ll_tokenizer.vocab_size) + 1, self.ll_tokenizer.vocab_size) # type: ignore[attr-defined] self.initialized = True @@ -70,15 +86,17 @@ def __call__( self._initialize() if self.new_sampling and len(input_ids) > 0: - self.ll_matcher.consume_token(input_ids[-1]) - err = self.ll_matcher.get_error() + self.ll_matcher.consume_token( # type: ignore[attr-defined] + input_ids[-1]) + err = self.ll_matcher.get_error() # type: ignore[attr-defined] if err: logger.warning("Error in LLMatcher: %s", err) llguidance.torch.fill_next_token_bitmask(self.ll_matcher, self.bitmask, 0) llguidance.torch.apply_token_bitmask_inplace( - scores, self.bitmask.to(scores.device)) + scores, + self.bitmask.to(scores.device)) # type: ignore[attr-defined] self.new_sampling = True diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 8ae7c7b6b2c7..6986b6554c23 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -56,6 +56,12 @@ def __init__(self, guide: Guide, reasoner: Optional[ReasoningParser]): self._fsm_state: defaultdict[int, Union[int, CFGState]] = defaultdict(int) + def clone(self) -> "BaseLogitsProcessor": + cloned = copy.copy(self) + cloned._guide = self._guide.copy() 
+ cloned._fsm_state = copy.deepcopy(self._fsm_state) + return cloned + def __call__(self, input_ids: list[int], scores: torch.Tensor) -> torch.Tensor: """Use the FSM to bias the logits before sampling the next token.""" @@ -218,6 +224,12 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase, reasoner) self._guide = self._guide.copy() + def clone(self) -> "CFGLogitsProcessor": + cloned = copy.copy(self) + cloned._fsm_state = copy.deepcopy(self._fsm_state) + cloned._guide = self._guide.copy() + return cloned + @lru_cache(maxsize=32) def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase): diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 8e40da4b3aa9..7ca7bab818fc 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -302,8 +302,9 @@ class XGrammarLogitsProcessor: prefilled: bool = field(default=False) def __post_init__(self): - self.tokenizer_info = self.config.tokenizer_info( - self.config.tokenizer_data) + if self.tokenizer_info is None: + self.tokenizer_info = self.config.tokenizer_info( + self.config.tokenizer_data) def __getstate__(self) -> dict[str, Any]: return {'config': self.config, 'reasoner': self.reasoner} @@ -400,7 +401,8 @@ def __call__(self, input_ids: list[int], def clone(self) -> XGrammarLogitsProcessor: """Create a new instance with shared compiled grammar but separate state""" - new_processor = XGrammarLogitsProcessor(self.config, self.reasoner) + new_processor = XGrammarLogitsProcessor(self.config, self.reasoner, + None, self.tokenizer_info) # Share the compiled grammar context (immutable after compilation) new_processor.ctx = self.ctx diff --git a/vllm/sequence.py b/vllm/sequence.py index f5f9c56a7db2..f3dfd32d9169 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1494,7 +1494,7 @@ def add_request(request_id: str, engine, params, **kwargs): for i in range(original_params.n): request_id_i = f"{request_id}_parallel_sample_{i}" group.seq_id_to_index[request_id_i] = i - params = copy.deepcopy(original_params) + params = params.clone() params.n = 1 if params.seed is not None: params.seed += i From 6526e0511163b8660250f843eddd74475760f675 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 23 May 2025 13:08:31 +0200 Subject: [PATCH 090/192] Add myself as docs code owner (#18605) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a37bdb0f4d9e..4452ce22d504 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -42,3 +42,7 @@ CMakeLists.txt @tlrmchlsmth /tests/v1/structured_output @mgoin @russellb /tests/weight_loading @mgoin @youkaichao /tests/lora @jeejeelee + +# Docs +/docs @hmellor +mkdocs.yaml @hmellor \ No newline at end of file From 7ab056c273dbf89ac1e157e539a8b2c1967fb263 Mon Sep 17 00:00:00 2001 From: Kay Yan <kay.yan@daocloud.io> Date: Fri, 23 May 2025 19:38:42 +0800 Subject: [PATCH 091/192] [Hardware][CPU] Update intel_extension_for_pytorch 2.7.0 and move to `requirements/cpu.txt` (#18542) Signed-off-by: Kay Yan <kay.yan@daocloud.io> --- docker/Dockerfile.cpu | 3 --- requirements/cpu.txt | 4 ++++ vllm/model_executor/layers/quantization/ipex_quant.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 
c647d9036f40..5395b3884fb5 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -51,9 +51,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --upgrade pip && \ uv pip install -r requirements/cpu.txt -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0 - ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" RUN echo 'ulimit -c 0' >> ~/.bashrc diff --git a/requirements/cpu.txt b/requirements/cpu.txt index d4191888382c..c064ecbb9b1f 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -20,3 +20,7 @@ datasets # for benchmark scripts # cpu cannot use triton 3.3.0 triton==3.2.0; platform_machine == "x86_64" + +# Intel Extension for PyTorch, only for x86_64 CPUs +intel-openmp; platform_machine == "x86_64" +intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64" diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index b7baa3d3363b..8108c797637d 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.platforms import current_platform -MIN_IPEX_VERSION = "2.5.0" +MIN_IPEX_VERSION = "2.7.0" class IPEXConfig(QuantizationConfig): From cd821ea5d29f66ee0fd3a6c4580a089011e95343 Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Fri, 23 May 2025 19:49:18 +0800 Subject: [PATCH 092/192] [CI] fix kv_cache_type argument (#18594) Signed-off-by: Andy Xie <andy.xning@gmail.com> --- tests/plugins_tests/test_platform_plugins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 9d6872e0e077..207de53abd8d 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -29,5 +29,5 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): # ignore the backend env variable if it is set with monkeypatch.context() as m: m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + backend = get_attn_backend(16, torch.float16, "auto", 16, False) assert backend.get_name() == "Dummy_Backend" From 38a95cb4a81d4a23924b453041174b399fab5632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Fri, 23 May 2025 21:50:07 +0900 Subject: [PATCH 093/192] [Doc] Fix indent of contributing to vllm (#18611) Signed-off-by: Zerohertz <ohg3417@gmail.com> --- docs/contributing/overview.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/contributing/overview.md b/docs/contributing/overview.md index 7dbf8bfdcf24..48f0bab5e9b3 100644 --- a/docs/contributing/overview.md +++ b/docs/contributing/overview.md @@ -16,9 +16,9 @@ Finally, one of the most impactful ways to support us is by raising awareness ab Unsure on where to start? 
Check out the following links for tasks to work on: - [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) - - [Selected onboarding tasks](gh-project:6) + - [Selected onboarding tasks](gh-project:6) - [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22) - - [Models with multi-modal capabilities](gh-project:10) + - [Models with multi-modal capabilities](gh-project:10) ## License From 2edb533af26d2cdf7e4b7bdd3da0df11c009f654 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 23 May 2025 14:51:38 +0200 Subject: [PATCH 094/192] Replace `{func}` with mkdocs style links (#18610) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/llava_next.py | 2 +- vllm/multimodal/processing.py | 2 +- vllm/platforms/interface.py | 4 ++-- vllm/sequence.py | 2 +- vllm/v1/worker/utils.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 10261aa423c0..2fb79f57a67f 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -540,7 +540,7 @@ def forward( Unlike in LLaVA-1.5, the number of image tokens inputted to the language model depends on the original size of the input image. Including the original image token in the input, the required number of image tokens - is given by {func}`get_llava_next_image_feature_size`. + is given by [get_llava_next_image_feature_size][]. This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 320a26f37555..f56110d94ab2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -387,7 +387,7 @@ def modality(self) -> str: def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: - """Convenience function to apply {func}`full_groupby` based on modality.""" + """Convenience function to apply [full_groupby][] based on modality.""" return full_groupby(values, key=lambda x: x.modality) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 20284b4e1801..646faa944565 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -157,7 +157,7 @@ def is_out_of_tree(self) -> bool: return self._enum == PlatformEnum.OOT def is_cuda_alike(self) -> bool: - """Stateless version of {func}`torch.cuda.is_available`.""" + """Stateless version of [torch.cuda.is_available][].""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) def is_sleep_mode_available(self) -> bool: @@ -194,7 +194,7 @@ def get_device_capability( cls, device_id: int = 0, ) -> Optional[DeviceCapability]: - """Stateless version of {func}`torch.cuda.get_device_capability`.""" + """Stateless version of [torch.cuda.get_device_capability][].""" return None @classmethod diff --git a/vllm/sequence.py b/vllm/sequence.py index f3dfd32d9169..e9212a82506e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -27,7 +27,7 @@ def array_full(token_id: int, count: int): - """{class}`array` equivalent of {func}`numpy.full`.""" + """{class}`array` equivalent of [numpy.full][].""" return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 267754036b31..28503a0a926d 100644 --- a/vllm/v1/worker/utils.py 
+++ b/vllm/v1/worker/utils.py @@ -66,7 +66,7 @@ def gather_mm_placeholders( """ Reconstructs the embeddings from the placeholder tokens. - This is the operation of {func}`scatter_mm_placeholders`. + This is the operation of [scatter_mm_placeholders][]. """ if is_embed is None: return placeholders From 6dd51c7ef11ff06ce589c41ae0c3aef30f3f13b8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Fri, 23 May 2025 20:51:53 +0800 Subject: [PATCH 095/192] [CI/Build] Fix V1 flag being set in entrypoints tests (#18598) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .buildkite/test-pipeline.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 774a5df16d7f..181fbda57b3f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -125,9 +125,8 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Distributed Tests (4 GPUs) # 10min @@ -390,10 +389,12 @@ steps: source_file_dependencies: - vllm/model_executor/model_loader - tests/tensorizer_loader + - tests/entrypoints/openai/test_tensorizer_entrypoint.py commands: - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s tensorizer_loader + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - label: Benchmarks # 9min mirror_hardwares: [amdexperimental, amdproduction] From 52fb23f47e043b9621d37322c1ca72b40bf721f7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 23 May 2025 14:53:44 +0200 Subject: [PATCH 096/192] Fix examples with code blocks in docs (#18609) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/hooks/generate_examples.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 9144f6824b09..a2131c342e8c 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -107,12 +107,15 @@ def generate(self) -> str: content = f"---\ntitle: {self.title}\n---\n\n" content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n" + # Use long code fence to avoid issues with + # included files containing code fences too + code_fence = "``````" is_code = self.main_file.suffix != ".md" if is_code: - content += f"```{self.main_file.suffix[1:]}\n" + content += f"{code_fence}{self.main_file.suffix[1:]}\n" content += f'--8<-- "{self.main_file}"\n' if is_code: - content += "```\n" + content += f"{code_fence}\n" content += "\n" if not 
self.other_files: @@ -122,10 +125,10 @@ def generate(self) -> str: for file in sorted(self.other_files): content += f'??? abstract "{file.relative_to(self.path)}"\n' if file.suffix != ".md": - content += f" ```{file.suffix[1:]}\n" + content += f" {code_fence}{file.suffix[1:]}\n" content += f' --8<-- "{file}"\n' if file.suffix != ".md": - content += " ```\n" + content += f" {code_fence}\n" return content From 6220f3c6b0a1f96bc494a8f927ff490e8b12dde0 Mon Sep 17 00:00:00 2001 From: Tristan Leclercq <49700633+tristanleclercq@users.noreply.github.com> Date: Fri, 23 May 2025 14:54:13 +0200 Subject: [PATCH 097/192] [Bugfix] Fix transformers model impl ignored for mixtral quant (#18602) Signed-off-by: Tristan Leclercq <tristanleclercq@gmail.com> --- vllm/model_executor/model_loader/utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 39e380f07297..9c8d647a24fe 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -225,17 +225,16 @@ def get_model_architecture( "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark" ] - if (model_config.quantization is not None - and model_config.quantization not in mixtral_supported - and "MixtralForCausalLM" in architectures): - architectures = ["QuantMixtralForCausalLM"] - vllm_supported_archs = ModelRegistry.get_supported_archs() vllm_not_supported = not any(arch in vllm_supported_archs for arch in architectures) if (model_config.model_impl == ModelImpl.TRANSFORMERS or model_config.model_impl != ModelImpl.VLLM and vllm_not_supported): architectures = resolve_transformers_arch(model_config, architectures) + elif (model_config.quantization is not None + and model_config.quantization not in mixtral_supported + and "MixtralForCausalLM" in architectures): + architectures = ["QuantMixtralForCausalLM"] model_cls, arch = ModelRegistry.resolve_model_cls(architectures) if model_config.task == "embed": From d4c291976085e4bdfabe22ed7c69534263c4cc7b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 23 May 2025 15:18:31 +0200 Subject: [PATCH 098/192] Include private attributes in API documentation (#18614) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- mkdocs.yaml | 1 + .../layers/rejection_sampler.py | 35 ++++++------ .../layers/typical_acceptance_sampler.py | 56 ++++++++----------- 3 files changed, 43 insertions(+), 49 deletions(-) diff --git a/mkdocs.yaml b/mkdocs.yaml index a1c6319bb008..b6fabbeed15a 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -66,6 +66,7 @@ plugins: options: show_symbol_type_heading: true show_symbol_type_toc: true + filters: [] summary: modules: true show_if_no_docstring: true diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index af82b9dc93b7..3db73495827c 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -262,16 +262,16 @@ def _get_accepted( True, then a token can be accepted, else it should be rejected. 
- Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of - {math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according - to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the + Given $q(\hat{x}_{n+1}|x_1, \dots, x_n)$, the probability of + $\hat{x}_{n+1}$ given context $x_1, \dots, x_n$ according + to the target model, and $p(\hat{x}_{n+1}|x_1, \dots, x_n)$, the same conditional probability according to the draft model, the token is accepted with probability: - :::{math} + $$ \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)} {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right) - ::: + $$ This implementation does not apply causality. When using the output, if a token is rejected, subsequent tokens should not be used. @@ -314,30 +314,31 @@ def _get_recovered_probs( target model is recovered (within hardware numerics). The probability distribution used in this rejection case is constructed - as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of - {math}`x` given context {math}`x_1, \dots, x_n` according to the target - model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability + as follows. Given $q(x|x_1, \dots, x_n)$, the probability of + $x$ given context $x_1, \dots, x_n$ according to the target + model and $p(x|x_1, \dots, x_n)$, the same conditional probability according to the draft model: - :::{math} + $$ x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ - ::: + $$ - where {math}`(f(x))_+` is defined as: + where $(f(x))_+$ is defined as: - :::{math} + $$ (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} - ::: + $$ See https://github.com/vllm-project/vllm/pull/2336 for a visualization of the draft, target, and recovered probability distributions. Returns a tensor of shape [batch_size, k, vocab_size]. - Note: This batches operations on GPU and thus constructs the recovered - distribution for all tokens, even if they are accepted. This causes - division-by-zero errors, so we use self._smallest_positive_value to - avoid that. This introduces some drift to the distribution. + Note: + This batches operations on GPU and thus constructs the recovered + distribution for all tokens, even if they are accepted. This causes + division-by-zero errors, so we use self._smallest_positive_value to + avoid that. This introduces some drift to the distribution. """ _, k, _ = draft_probs.shape diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index 527a301cd8e2..a14c86148e73 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -93,29 +93,27 @@ def _evaluate_accepted_tokens(self, target_probs, draft_token_ids): Evaluates and returns a mask of accepted tokens based on the posterior probabilities. - Parameters: - ---------- - target_probs : torch.Tensor - A tensor of shape (batch_size, k, vocab_size) representing - the probabilities of each token in the vocabulary for each - position in the proposed sequence. This is the distribution - generated by the target model. - draft_token_ids : torch.Tensor - A tensor of shape (batch_size, k) representing the proposed - token ids. + Args: + target_probs (torch.Tensor): A tensor of shape + (batch_size, k, vocab_size) representing the probabilities of + each token in the vocabulary for each position in the proposed + sequence. This is the distribution generated by the target + model. 
+ draft_token_ids (torch.Tensor): A tensor of shape (batch_size, k) + representing the proposed token ids. A draft token_id x_{n+k} is accepted if it satisfies the following condition - :::{math} + $$ p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > \min \left( \epsilon, \delta * \exp \left( -H(p_{\text{original}}( \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) - ::: + $$ - where {math}`p_{\text{original}}` corresponds to target_probs - and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters + where $p_{\text{original}}$ corresponds to target_probs + and $\epsilon$ and $\delta$ correspond to hyperparameters specified using self._posterior_threshold and self._posterior_alpha This method computes the posterior probabilities for the given @@ -126,13 +124,10 @@ def _evaluate_accepted_tokens(self, target_probs, draft_token_ids): returns a boolean mask indicating which tokens can be accepted. Returns: - ------- - torch.Tensor - A boolean tensor of shape (batch_size, k) where each element - indicates whether the corresponding draft token has been accepted - or rejected. True indicates acceptance and false indicates - rejection. - + torch.Tensor: A boolean tensor of shape (batch_size, k) where each + element indicates whether the corresponding draft token has + been accepted or rejected. True indicates acceptance and false + indicates rejection. """ device = target_probs.device candidates_prob = torch.gather( @@ -156,17 +151,14 @@ def _get_recovered_token_ids(self, target_probs): The recovered token ids will fill the first unmatched token by the target token. - Parameters - ---------- - target_probs : torch.Tensor - A tensor of shape (batch_size, k, vocab_size) containing - the target probability distribution - - Returns - ------- - torch.Tensor - A tensor of shape (batch_size, k) with the recovered token - ids which are selected from target probs. + Args: + target_probs (torch.Tensor): A tensor of shape + (batch_size, k, vocab_size) containing the target probability + distribution. + + Returns: + torch.Tensor: A tensor of shape (batch_size, k) with the recovered + token ids which are selected from target probs. """ max_indices = torch.argmax(target_probs, dim=-1) From 2cd1fa4556fe1edc1be32be99214024393e2d7b5 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Fri, 23 May 2025 21:21:19 +0800 Subject: [PATCH 099/192] [Misc] add Haystack integration (#18601) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- docs/deployment/frameworks/haystack.md | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 docs/deployment/frameworks/haystack.md diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md new file mode 100644 index 000000000000..2eac4a5279fd --- /dev/null +++ b/docs/deployment/frameworks/haystack.md @@ -0,0 +1,60 @@ +--- +title: Haystack +--- +[](){ #deployment-haystack } + +# Haystack + +[Haystack](https://github.com/deepset-ai/haystack) is an end-to-end LLM framework that allows you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform retrieval-augmented generation (RAG), document search, question answering or answer generation, Haystack can orchestrate state-of-the-art embedding models and LLMs into pipelines to build end-to-end NLP applications and solve your use case. 
+ +It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. + +## Prerequisites + +- Setup vLLM and Haystack environment + +```console +pip install vllm haystack-ai +``` + +## Deploy + +- Start the vLLM server with the supported chat completion model, e.g. + +```console +vllm serve mistralai/Mistral-7B-Instruct-v0.1 +``` + +- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server. + +```python +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage +from haystack.utils import Secret + +generator = OpenAIChatGenerator( + # for compatibility with the OpenAI API, a placeholder api_key is needed + api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), + model="mistralai/Mistral-7B-Instruct-v0.1", + api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", + generation_kwargs = {"max_tokens": 512} +) + +response = generator.run( + messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")] +) + +print("-"*30) +print(response) +print("-"*30) +``` + +Output e.g.: + +```console +------------------------------ +{'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text=' Of course! Where in Italy would you like to go and what type of trip are you looking to plan?')], _name=None, _meta={'model': 'mistralai/Mistral-7B-Instruct-v0.1', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 23, 'prompt_tokens': 21, 'total_tokens': 44, 'completion_tokens_details': None, 'prompt_tokens_details': None}})]} +------------------------------ +``` + +For details, see the tutorial [Using vLLM in Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/vllm.md). From 1068556b2ca6c136000fa48db7d62ce1b5250dea Mon Sep 17 00:00:00 2001 From: Simon Mo <simon.mo@hey.com> Date: Fri, 23 May 2025 07:43:58 -0700 Subject: [PATCH 100/192] [Bugfix][Build/CI] Fixup CUDA compiler version check for CUDA_SUPPORTED_ARCHS (#18579) --- CMakeLists.txt | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ffb801d62619..6a1ed588749a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,13 +29,6 @@ set(ignoreMe "${VLLM_PYTHON_PATH}") # set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") -# Supported NVIDIA architectures. -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL) - set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") -else() - set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") -endif() - # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") @@ -83,6 +76,15 @@ endif() # find_package(Torch REQUIRED) +# Supported NVIDIA architectures. +# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined +if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) + set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") +else() + set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") +endif() + # # Forward the non-CUDA device extensions to external CMake scripts. 
# From 5221815a0045e9baeaede0f2530ffecfd6950503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Sat, 24 May 2025 00:23:21 +0900 Subject: [PATCH 101/192] [Doc] Fix markdown list indentation for MkDocs rendering (#18620) Signed-off-by: Zerohertz <ohg3417@gmail.com> --- docs/design/huggingface_integration.md | 15 ++++++--------- pyproject.toml | 1 + 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/design/huggingface_integration.md b/docs/design/huggingface_integration.md index 68cc27ea768c..2d462ccb6535 100644 --- a/docs/design/huggingface_integration.md +++ b/docs/design/huggingface_integration.md @@ -8,17 +8,15 @@ This document describes how vLLM integrates with HuggingFace libraries. We will Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`. 1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process: - - - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. - - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. - - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. + - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. + - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. 
In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. 2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation. 3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that: - - - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. - - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. + - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. + - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. 4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation. @@ -29,8 +27,7 @@ Beyond that, there are two more things vLLM depends on HuggingFace for. 1. 
**Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). 2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. - - - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that: This completes the integration between vLLM and HuggingFace. 
diff --git a/pyproject.toml b/pyproject.toml index 29186d5ff027..762ac9e11566 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -166,6 +166,7 @@ markers = [ [tool.pymarkdown] plugins.md004.style = "sublist" # ul-style plugins.md007.indent = 4 # ul-indent +plugins.md007.start_indented = true # ul-indent plugins.md013.enabled = false # line-length plugins.md041.enabled = false # first-line-h1 plugins.md033.enabled = false # inline-html From 022d8abe29531d0023ea022a64af230719df93ca Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Fri, 23 May 2025 23:25:03 +0800 Subject: [PATCH 102/192] [Doc] Use a different color for the announcement (#18616) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/mkdocs/stylesheets/extra.css | 4 ++++ mkdocs.yaml | 3 +++ 2 files changed, 7 insertions(+) create mode 100644 docs/mkdocs/stylesheets/extra.css diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css new file mode 100644 index 000000000000..1c57779c52c9 --- /dev/null +++ b/docs/mkdocs/stylesheets/extra.css @@ -0,0 +1,4 @@ +.md-banner { + background-color: var(--md-warning-bg-color); + color: var(--md-warning-fg-color); +} diff --git a/mkdocs.yaml b/mkdocs.yaml index b6fabbeed15a..8468b2bd9690 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -113,6 +113,9 @@ markdown_extensions: - mdx_math: enable_dollar_delimiter: true +extra_css: + - mkdocs/stylesheets/extra.css + extra_javascript: - mkdocs/javascript/run_llm_widget.js - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML From 6a7988c55bb0a54240f479033313eb98344e6431 Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Fri, 23 May 2025 23:43:43 +0800 Subject: [PATCH 103/192] Refactor pplx init logic to make it modular (prepare for deepep) (#18200) Signed-off-by: youkaichao <youkaichao@gmail.com> --- .../device_communicators/all2all.py | 101 +++++--- .../base_device_communicator.py | 89 ++++++- .../device_communicators/cuda_communicator.py | 45 ++-- vllm/distributed/parallel_state.py | 56 +--- vllm/envs.py | 3 + vllm/model_executor/layers/fused_moe/layer.py | 240 +++++++----------- .../layers/fused_moe/pplx_prepare_finalize.py | 1 - .../model_executor/layers/quantization/fp8.py | 21 +- vllm/platforms/cuda.py | 1 + vllm/v1/worker/gpu_worker.py | 3 +- vllm/v1/worker/tpu_worker.py | 3 +- vllm/worker/cpu_worker.py | 3 +- vllm/worker/hpu_worker.py | 6 +- vllm/worker/tpu_worker.py | 3 +- vllm/worker/worker.py | 3 +- vllm/worker/xpu_worker.py | 3 +- 16 files changed, 297 insertions(+), 284 deletions(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index b69647b00586..a250ec89cd5b 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,44 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 +import importlib.util +from typing import TYPE_CHECKING + import torch +import torch.distributed as dist from vllm.forward_context import get_forward_context +from vllm.logger import init_logger +from .base_device_communicator import All2AllManagerBase, Cache -class All2AllBase: - - def __init__(self, cpu_group, model): - self.cpu_group = cpu_group - - # compute some common properties - from vllm.distributed.parallel_state import (get_dp_group, - get_ep_group, - get_tp_group, - in_the_same_node_as) - - # all2all lives in ep group, which is merged from dp and tp group - self.dp_group = get_dp_group() - self.tp_group = get_tp_group() - self.ep_group 
= get_ep_group() - self.dp_rank = self.dp_group.rank_in_group - self.dp_world_size = self.dp_group.world_size - - # all2all communication often has separate implementations for - # intra-node and inter-node communication - self.intranode = in_the_same_node_as(cpu_group, source_rank=0) - self.internode = not self.intranode - - def dispatch(self, hidden_states: torch.Tensor, - router_logits: torch.Tensor): - raise NotImplementedError - - def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: - raise NotImplementedError +logger = init_logger(__name__) - def destroy(self): - pass +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.layer import FusedMoE +else: + FusedMoE = None -class NaiveAll2All(All2AllBase): +class NaiveAll2AllManager(All2AllManagerBase): """ A naive implementation of all2all communication. It uses all-reduce under the hood, which is not @@ -46,8 +26,8 @@ class NaiveAll2All(All2AllBase): debugging. """ - def __init__(self, cpu_group, model): - super().__init__(cpu_group, model) + def __init__(self, cpu_group): + super().__init__(cpu_group) def naive_multicast(self, x: torch.Tensor, cu_tokens_across_dp_cpu: torch.Tensor): @@ -91,3 +71,56 @@ def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: def destroy(self): pass + + +class PPLXAll2AllManager(All2AllManagerBase): + """ + All2All communication based on PPLX kernels. + """ + + def __init__(self, cpu_group): + has_pplx = importlib.util.find_spec("pplx_kernels") is not None + assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." # noqa + super().__init__(cpu_group) + + if self.internode: + # inter-node communication needs nvshmem, + # intra-node communication uses p2p mapping directly + from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, + nvshmem_get_unique_id, + nvshmem_init) + logger.debug( + "Initialize NVSHMEM for pplx_kernels: " + "rank=%d, world size=%d", self.rank, self.world_size) + uid = nvshmem_get_unique_id( + ) if self.rank == 0 else nvshmem_alloc_empty_unique_id() + dist.broadcast(uid, + src=dist.get_process_group_ranks(self.cpu_group)[0], + group=self.cpu_group) + logger.debug("PPLX NVSHMEM UID = %s", uid) + nvshmem_init(uid, self.rank, self.world_size) + + self.handle_cache = Cache() + + def get_handle(self, kwargs): + import pplx_kernels as pplx + return self.handle_cache.get_or_create( + kwargs, pplx.AllToAll.internode + if self.internode else pplx.AllToAll.intranode) + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + raise NotImplementedError + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + def destroy(self): + with self.handle_cache._lock: + for _, handle in self.handle_cache._cache.items(): + handle.destroy() + + if self.internode: + from pplx_kernels.nvshmem import nvshmem_finalize + logger.debug("PPLX NVSHMEM finalize") + nvshmem_finalize() diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index ead79872bd49..52b970949144 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -1,11 +1,76 @@ # SPDX-License-Identifier: Apache-2.0 +import threading from typing import Optional +from weakref import WeakValueDictionary import torch import torch.distributed as dist from torch.distributed 
import ProcessGroup +class Cache: + + def __init__(self): + self._cache: WeakValueDictionary = WeakValueDictionary() + self._lock = threading.RLock() # Reentrant lock for thread safety + + def get_or_create(self, kwargs, func): + # Create a hashable key from the kwargs + key = tuple(sorted((k, v) for k, v in kwargs.items())) + + with self._lock: + instance = self._cache.get(key) + if instance is None: + instance = func(**kwargs) + self._cache[key] = instance + return instance + + +class All2AllManagerBase: + + def __init__(self, cpu_group): + self.cpu_group = cpu_group + + # compute some common properties + from vllm.distributed.parallel_state import (get_dp_group, + get_tp_group, + in_the_same_node_as) + + # all2all lives in ep group, which is merged from dp and tp group + self.dp_group = get_dp_group() + self.tp_group = get_tp_group() + # no self.ep_group since self.ep_group is still in construction + # when we create this object + self.dp_rank = self.dp_group.rank_in_group + self.dp_world_size = self.dp_group.world_size + self.rank = dist.get_rank(cpu_group) + self.world_size = dist.get_world_size(cpu_group) + + # all2all communication often has separate implementations for + # intra-node and inter-node communication + self.intranode = in_the_same_node_as(cpu_group, source_rank=0) + self.internode = not self.intranode + + def get_handle(self, kwargs): + # get a handle for the all2all communication, + # based on the kwargs. + # different layers can have different configs, + # e.g. one layer has hidden size 1024, another has 2048. + # usually the underlying implementation caches the handle + # and reuse it for the same config. + raise NotImplementedError + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + raise NotImplementedError + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + def destroy(self): + pass + + class DeviceCommunicatorBase: """ Base class for device-specific communicator. @@ -31,6 +96,18 @@ def __init__(self, self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank) + use_ep = False + from vllm.config import get_current_vllm_config + config = get_current_vllm_config() + if config is not None: + # as long as we use data parallel (coupled data parallel + # where all data parallel ranks execute forward together), + # we initialize the all2all manager used in expert parallel. + use_ep = config.parallel_config.data_parallel_size > 1 + + self.use_all2all = "ep" in unique_name and use_ep + self.all2all_manager: Optional[All2AllManagerBase] = None + def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: dist.all_reduce(input_, group=self.device_group) return input_ @@ -154,9 +231,17 @@ def prepare_communication_buffer_for_model(self, model: torch.nn.Module) -> None: """ Prepare the communication buffer for the model. - This is a no-op in the base class. 
""" - pass + if not self.use_all2all: + return + + moe_modules = [ + module for module in model.modules() + if module.__class__.__name__ == "FusedMoE" + ] + for module in moe_modules: + module.quant_method.init_prepare_finalize(module.moe_config, + module.quant_config) def dispatch( self, hidden_states: torch.Tensor, diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 13303f94b8ea..a05a13f51d4b 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -6,10 +6,12 @@ from torch.distributed import ProcessGroup import vllm.envs as envs +from vllm.logger import init_logger -from .all2all import All2AllBase from .base_device_communicator import DeviceCommunicatorBase +logger = init_logger(__name__) + class CudaCommunicator(DeviceCommunicatorBase): @@ -31,8 +33,6 @@ def __init__(self, use_pynccl = "ep" not in unique_name self.use_pynccl = use_pynccl - self.use_all2all = "ep" in unique_name - self.all2all_impl: Optional[All2AllBase] = None self.use_custom_allreduce = use_custom_allreduce # lazy import to avoid documentation build error @@ -56,6 +56,19 @@ def __init__(self, device=self.device, ) + if self.use_all2all: + all2all_backend = envs.VLLM_ALL2ALL_BACKEND + if all2all_backend == "naive": + from .all2all import NaiveAll2AllManager + self.all2all_manager = NaiveAll2AllManager(self.cpu_group) + logger.info("Using naive all2all manager.") + elif all2all_backend == "pplx": + from .all2all import PPLXAll2AllManager + self.all2all_manager = PPLXAll2AllManager(self.cpu_group) + logger.info("Using PPLX all2all manager.") + else: + raise ValueError(f"Unknown all2all backend: {all2all_backend}") + def all_reduce(self, input_): # always try custom allreduce first, # and then pynccl. @@ -136,31 +149,19 @@ def destroy(self): self.pynccl_comm = None if self.ca_comm is not None: self.ca_comm = None - if self.all2all_impl is not None: - self.all2all_impl.destroy() - self.all2all_impl = None - - def prepare_communication_buffer_for_model(self, - model: torch.nn.Module) -> None: - """ - Prepare the communication buffer for the model. 
- """ - if not self.use_all2all: - return - all2all_backend = envs.VLLM_ALL2ALL_BACKEND - if all2all_backend == "naive": - from .all2all import NaiveAll2All - self.all2all_impl = NaiveAll2All(self.cpu_group, model) + if self.all2all_manager is not None: + self.all2all_manager.destroy() + self.all2all_manager = None def dispatch( self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - assert self.all2all_impl is not None - hidden_states, router_logits = self.all2all_impl.dispatch( + assert self.all2all_manager is not None + hidden_states, router_logits = self.all2all_manager.dispatch( hidden_states, router_logits) return hidden_states, router_logits def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: - assert self.all2all_impl is not None - hidden_states = self.all2all_impl.combine(hidden_states) + assert self.all2all_manager is not None + hidden_states = self.all2all_manager.combine(hidden_states) return hidden_states diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 10da6ad59246..b674d05a7771 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -23,7 +23,6 @@ """ import contextlib import gc -import importlib.util import pickle import weakref from collections import namedtuple @@ -43,7 +42,7 @@ from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, - run_once, supports_custom_op) + supports_custom_op) @dataclass @@ -791,10 +790,14 @@ def dispatch( if self.device_communicator is not None: return self.device_communicator.dispatch(hidden_states, router_logits) + else: + return hidden_states, router_logits def combine(self, hidden_states) -> torch.Tensor: if self.device_communicator is not None: return self.device_communicator.combine(hidden_states) + else: + return hidden_states _WORLD: Optional[GroupCoordinator] = None @@ -959,49 +962,9 @@ def init_distributed_environment( "world group already initialized with a different world size") -PPLX_DID_INIT: bool = False - - -@run_once -def pplx_init(rank, world_size): - has_pplx = importlib.util.find_spec("pplx_kernels") is not None - - if has_pplx and world_size > 1: - from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, - nvshmem_get_unique_id, nvshmem_init) - try: - global PPLX_DID_INIT - logger.debug( - "Initialize NVSHMEM for PPLX kernels: rank=%d, " - "world size=%d", rank, world_size) - uid = nvshmem_get_unique_id( - ) if rank == 0 else nvshmem_alloc_empty_unique_id() - uid_gpu = uid.cuda() - get_world_group().broadcast(uid_gpu, src=0) - uid = uid_gpu.to(device='cpu') - logger.debug("PPLX NVSHMEM UID = %s", uid) - nvshmem_init(uid, rank, world_size) - PPLX_DID_INIT = True - except Exception as ex: - logger.error("Failed to initialize NVSHMEM for PPLX: %s", ex) - - -@run_once -def pplx_finalize(): - global PPLX_DID_INIT - if PPLX_DID_INIT: - from pplx_kernels.nvshmem import nvshmem_finalize - logger.debug("PPLX NVSHMEM finalize") - from vllm.model_executor.layers.fused_moe.layer import ( - _all_to_all_cache) - _all_to_all_cache.destroy() - nvshmem_finalize() - - def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, - enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """ @@ -1104,14 +1067,10 @@ def initialize_model_parallel( _DP.rank_in_group, _PP.rank_in_group, _TP.rank_in_group, _EP.rank_in_group) - 
if enable_expert_parallel: - pplx_init(rank, world_size) - def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, - enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """Helper to initialize model parallel groups if they are not initialized, @@ -1122,8 +1081,7 @@ def ensure_model_parallel_initialized( get_world_group().device_group) if not model_parallel_is_initialized(): initialize_model_parallel(tensor_model_parallel_size, - pipeline_model_parallel_size, - enable_expert_parallel, backend) + pipeline_model_parallel_size, backend) return assert ( @@ -1202,8 +1160,6 @@ def destroy_model_parallel(): """Set the groups to none and destroy them.""" global _TP - pplx_finalize() - if _TP: _TP.destroy() _TP = None diff --git a/vllm/envs.py b/vllm/envs.py index 2d330b8fbee8..363ba14ce4c8 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -809,6 +809,9 @@ def get_vllm_port() -> Optional[int]: lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")), # all2all backend for vllm's expert parallel communication + # Available options: + # - "naive": naive all2all implementation using all-reduce + # - "pplx": use pplx kernels "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), } diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 31efe16d1c27..31295582c1b1 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,12 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import importlib -import threading from abc import abstractmethod from dataclasses import dataclass from enum import Enum from typing import Callable, Optional -from weakref import WeakValueDictionary import torch import torch.nn.functional as F @@ -73,7 +71,8 @@ class FusedMoEParallelConfig: @property def use_pplx_kernels(self): - return self.dp_size > 1 and self.use_ep and has_pplx + return self.dp_size > 1 and self.use_ep and \ + envs.VLLM_ALL2ALL_BACKEND == "pplx" @staticmethod def make(tp_size_: int, dp_size_: int, @@ -196,6 +195,8 @@ class MoEConfig: # TODO: add more quantization params, blocked, per-token, etc. 
block_size: int = 128 + max_num_tokens: int = MOE_DP_CHUNK_SIZE + @property def tp_size(self): return self.moe_parallel_config.tp_size @@ -244,13 +245,59 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError - def set_prepare_finalize( - self, - dp_size: int, - world_size: int, - prepare_finalize: FusedMoEPrepareAndFinalize, - ) -> bool: - return False + def init_prepare_finalize(self, moe: MoEConfig, + quant_config: Optional[QuantizationConfig]): + all2all_manager = get_ep_group().device_communicator.all2all_manager + assert all2all_manager is not None + + prepare_finalize = None + if moe.use_pplx_kernels: + all_to_all_args = dict( + max_num_tokens=moe.max_num_tokens, + num_experts=moe.num_experts, + experts_per_token=moe.experts_per_token, # topk + rank=all2all_manager.rank, + world_size=all2all_manager.world_size, + # dp_size actually means tp_size, bug in pplx kernels + dp_size=all2all_manager.tp_group.world_size, + hidden_dim=moe.hidden_dim, + hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize, + # For blocked per token: set to + # ceil_div(hidden_dim, block_size) * sizeof(float32) + # For per-token: set to sizeof(float32) + hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else ( + (moe.hidden_dim + moe.block_size - 1) // moe.block_size * + torch.float32.itemsize)), + group_name=all2all_manager.cpu_group.group_name, + ) + + handle = all2all_manager.get_handle(all_to_all_args) + + prepare_finalize = PplxPrepareAndFinalize( + handle, + max_num_tokens=moe.max_num_tokens, + world_size=all2all_manager.world_size, + rank=all2all_manager.rank, + # dp_size actually means tp_size, bug in pplx kernels + dp_size=all2all_manager.tp_group.world_size, + quant_dtype=moe.in_dtype, + ) + + if prepare_finalize is not None: + experts = self.select_gemm_impl(prepare_finalize) + self.fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + def select_gemm_impl( + self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize] + ) -> FusedMoEPermuteExpertsUnpermute: + # based on the all2all implementation, select the appropriate + # gemm implementation + raise NotImplementedError( + "Subclass must select appropriate gemm implementation" + " based on the prepare_finalize") @abstractmethod def apply( @@ -274,53 +321,13 @@ def apply( raise NotImplementedError -class AllToAllCache: - - def __init__(self): - self._cache: WeakValueDictionary = WeakValueDictionary() - self._lock = threading.RLock() # Reentrant lock for thread safety - - def destroy(self): - with self._lock: - # TODO: can we do del self._cache? - for _, a2a in self._cache.items(): - a2a.destroy() - - def get_or_create(self, **kwargs): - assert has_pplx - import pplx_kernels as pplx - - # Create a hashable key from the kwargs - key = tuple(sorted((k, v) for k, v in kwargs.items())) - - with self._lock: - instance = self._cache.get(key) - if instance is None: - # TODO (varun): Add support to switch to intranode - # when all communications are within the same - # node. 
- logger.debug("Create AllToAll %s", kwargs) - instance = pplx.AllToAll.internode(**kwargs) - self._cache[key] = instance - return instance - - -# Global singleton -_all_to_all_cache = AllToAllCache() - - -# Factory function as a cleaner interface -def get_all_to_all(**kwargs): - return _all_to_all_cache.get_or_create(**kwargs) - - @CustomOp.register("unquantized_fused_moe") class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" def __init__(self, moe: MoEConfig): super().__init__() - self.fused_experts = fused_experts + self.fused_experts = fused_experts # type: ignore self.moe = moe self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -330,6 +337,42 @@ def __init__(self, moe: MoEConfig): else: self.rocm_aiter_fused_experts = None # type: ignore + def select_gemm_impl( + self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]): + + assert self.fused_experts == fused_experts + + all2all_manager = get_ep_group().device_communicator.all2all_manager + assert all2all_manager is not None + + experts: Optional[FusedMoEPermuteExpertsUnpermute] = None + + if isinstance(prepare_finalize, + (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): + logger.debug("BatchedTritonExperts %s", self.moe) + experts = BatchedTritonExperts( + max_num_tokens=MOE_DP_CHUNK_SIZE, + world_size=all2all_manager.world_size, + # dp_size actually means tp_size, bug in pplx kernels + dp_size=all2all_manager.tp_group.world_size, + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + ) + else: + logger.debug("TritonExperts %s", self.moe) + experts = TritonExperts( + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + per_channel_quant=False, + ) + return experts + def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -429,47 +472,6 @@ def apply( activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) - def set_prepare_finalize( - self, - dp_size: int, - world_size: int, - prepare_finalize: FusedMoEPrepareAndFinalize, - ) -> bool: - assert self.fused_experts == fused_experts - - experts: Optional[FusedMoEPermuteExpertsUnpermute] = None - - if isinstance(prepare_finalize, - (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): - logger.debug("BatchedTritonExperts %s", self.moe) - experts = BatchedTritonExperts( - max_num_tokens=MOE_DP_CHUNK_SIZE, - world_size=world_size, - dp_size=dp_size, - use_fp8_w8a8=False, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - block_shape=None, - ) - else: - logger.debug("TritonExperts %s", self.moe) - experts = TritonExperts( - use_fp8_w8a8=False, - use_int8_w8a8=False, - use_int8_w8a16=False, - use_int4_w4a16=False, - block_shape=None, - per_channel_quant=False, - ) - - self.fused_experts = FusedMoEModularKernel( - prepare_finalize, - experts, - ) - - return True - def forward_cuda( self, layer: torch.nn.Module, @@ -679,45 +681,6 @@ def determine_expert_map( return (local_num_experts, expert_map) -def _construct_prepare_finalize( - moe: MoEConfig, quant_config: Optional[QuantizationConfig] -) -> Optional[FusedMoEPrepareAndFinalize]: - max_num_tokens = MOE_DP_CHUNK_SIZE - world_size = moe.ep_size - dp_size = moe.ep_size // moe.dp_size # dp_size actually means TP. 
- rank = moe.ep_rank - - if moe.use_pplx_kernels: - logger.debug("using PplxPrepareAndFinalize") - - all_to_all = get_all_to_all( - max_num_tokens=max_num_tokens, - num_experts=moe.num_experts, - experts_per_token=moe.experts_per_token, # topk - rank=rank, - world_size=world_size, - dp_size=dp_size, - hidden_dim=moe.hidden_dim, - hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize, - # For blocked per token: set to - # ceil_div(hidden_dim, block_size) * sizeof(float32) - # For per-token: set to sizeof(float32) - hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else - ((moe.hidden_dim + moe.block_size - 1) // - moe.block_size * torch.float32.itemsize))) - - return PplxPrepareAndFinalize( - all_to_all, - max_num_tokens=max_num_tokens, - world_size=world_size, - rank=rank, - dp_size=dp_size, - quant_dtype=moe.in_dtype, - ) - - return None - - class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. @@ -831,7 +794,10 @@ def __init__( moe_parallel_config=self.moe_parallel_config, # TODO (bnell): this needs to be fixed for quantized types. in_dtype=params_dtype, + max_num_tokens=MOE_DP_CHUNK_SIZE, ) + self.moe_config = moe + self.quant_config = quant_config # Note: get_quant_method will look at the layer's local_num_experts # for heuristic purposes, so it must be initialized first. @@ -839,25 +805,13 @@ def __init__( if quant_config is None: quant_method = UnquantizedFusedMoEMethod(moe) - prepare_finalize = _construct_prepare_finalize(moe, quant_config) else: quant_method = quant_config.get_quant_method(self, prefix) - # No pplx for quantized types yet. - prepare_finalize = None assert quant_method is not None assert isinstance(quant_method, FusedMoEMethodBase) self.quant_method = quant_method - if prepare_finalize is not None: - world_size = moe.ep_size - dp_size = int(moe.ep_size // moe.dp_size) - success = self.quant_method.set_prepare_finalize( - dp_size, world_size, prepare_finalize) - if not success: - logger.warning("DP+EP not supported for %s.", - type(self.quant_method)) - moe_quant_params = { "num_experts": self.local_num_experts, "hidden_size": hidden_size, diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index b1126b94e45a..783ebebbfec9 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -9,7 +9,6 @@ moe_kernel_quantize_input) -# Note use: layer.get_all_to_all() to get an AllToAll instance # The max_num_tokens, world_size and dp_size must be the same # as the ones used to create the AllToAll. 
class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f4cdc3db1a0d..652bf76673c5 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter import vllm.envs as envs -import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -461,7 +460,7 @@ def __init__(self, quant_config: Fp8Config): logger.warning_once( "DeepGemm not supported on the current platform.") - self.fused_experts = functools.partial( + self.fused_experts = functools.partial( # type: ignore fused_experts, block_shape=self.quant_config.weight_block_size, allow_deep_gemm=self.allow_deep_gemm) @@ -791,17 +790,12 @@ def process_weights_after_loading(self, layer: Module) -> None: del layer.w13_input_scale del layer.w2_input_scale - def set_prepare_finalize( - self, - dp_size: int, - world_size: int, - prepare_finalize: mk.FusedMoEPrepareAndFinalize, - ) -> bool: + def select_gemm_impl(self, prepare_finalize): from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts) - if self.use_marlin or self.rocm_aiter_moe_enabled: - return False + assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( + "Marlin and ROCm AITER are not supported with all2all yet.") experts = TritonOrDeepGemmExperts( use_fp8_w8a8=True, @@ -809,12 +803,7 @@ def set_prepare_finalize( allow_deep_gemm=self.allow_deep_gemm, ) - self.fused_experts = mk.FusedMoEModularKernel( - prepare_finalize, - experts, - ) - - return True + return experts def apply( self, diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 0bdf15959302..8bb3dfe7457a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -158,6 +158,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: "currently not supported with CUDA Graphs.") vllm_config.model_config.enforce_eager = True compilation_config.use_cudagraph = False + # FIXME: inductor breaks cudagraph (from @bnell) compilation_config.use_inductor = False @classmethod diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a7c70fec042c..bce5cbb5f9d0 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -348,8 +348,7 @@ def init_worker_distributed_environment( distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index ae3735ab0255..fa4eb30ccd9a 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -265,8 +265,7 @@ def init_tpu_worker_distributed_environment( backend="gloo", ) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) try: diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index a92cf1e5a3b3..1436a404335a 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -390,8 +390,7 @@ def init_distributed_environment(self) -> None: 
ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) def get_cache_block_size_bytes(self) -> int: """Return the size in bytes of a single KV cache block. diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index d7fe0fe0fe4c..533fead0e669 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -415,8 +415,7 @@ def init_worker_distributed_environment( backend='hccl') ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() @@ -442,8 +441,7 @@ def init_worker_distributed_environment( torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 891ed66599dc..4bb9bea022f9 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -76,8 +76,7 @@ def init_device(self) -> None: ) ensure_model_parallel_initialized( self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size, - self.parallel_config.enable_expert_parallel) + self.parallel_config.pipeline_parallel_size) # Device initialization should happen after initializing the distributed # runtime. 
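Every worker backend touched by this patch makes the same mechanical change: the third positional argument to `ensure_model_parallel_initialized` is dropped, because expert-parallel all2all setup now lives in the device communicator and is selected through the `VLLM_ALL2ALL_BACKEND` environment variable rather than being threaded through model-parallel initialization. A minimal sketch of the resulting call pattern, assuming the `parallel_state` API exactly as modified above (the `init_worker_parallel` wrapper below is hypothetical, for illustration only):

```python
# Sketch only: mirrors the two-argument call sites in this patch.
# `init_worker_parallel` is a hypothetical helper, not part of vLLM.
from vllm.config import ParallelConfig
from vllm.distributed.parallel_state import ensure_model_parallel_initialized


def init_worker_parallel(parallel_config: ParallelConfig) -> None:
    # The former enable_expert_parallel argument is gone; all2all backend
    # selection happens via VLLM_ALL2ALL_BACKEND ("naive" or "pplx") and is
    # wired up inside the device communicator, not here.
    ensure_model_parallel_initialized(
        parallel_config.tensor_parallel_size,
        parallel_config.pipeline_parallel_size,
    )
```

This keeps the call sites uniform across the CUDA, TPU, CPU, HPU and XPU workers and confines the expert-parallel wiring to the communicator layer.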
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5e3b6e4b62ea..6e45b8423e5e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -529,8 +529,7 @@ def init_worker_distributed_environment( init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index a78a41e03ea3..a5109a982cbf 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -175,8 +175,7 @@ def init_worker_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.enable_expert_parallel) + parallel_config.pipeline_parallel_size) # global all_reduce needed for overall oneccl warm up torch.distributed.all_reduce(torch.zeros(1).xpu()) From 3d28ad343f72c950be36aed7fb8c18ab39f14dd2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 23 May 2025 17:09:54 +0100 Subject: [PATCH 104/192] Fix figures in design doc (#18612) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/design/kernel/paged_attention.md | 38 +++++++++------------------ 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md index ad8b5c9264d2..fdfa38a29f83 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -140,22 +140,18 @@ title: vLLM Paged Attention const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` - <figure markdown="span"> - ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" } - <figcaption> -</figcaption> - </figure> +<figure markdown="span"> + ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" } +</figure> - Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. - <figure markdown="span"> - ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" } - <figcaption> -</figcaption> - </figure> +<figure markdown="span"> + ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" } +</figure> ```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -192,11 +188,9 @@ title: vLLM Paged Attention points to key token data based on `k_cache` at assigned block, assigned head and assigned token. - <figure markdown="span"> - ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" } - <figcaption> -</figcaption> - </figure> +<figure markdown="span"> + ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" } +</figure> - The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -209,11 +203,9 @@ title: vLLM Paged Attention elements for one token) that will be processed by 2 threads (one thread group) separately. 
- <figure markdown="span"> - ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" } - <figcaption> -</figcaption> - </figure> +<figure markdown="span"> + ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" } +</figure> ```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -372,20 +364,14 @@ title: vLLM Paged Attention <figure markdown="span"> ![](../../assets/kernel/value.png){ align="center" alt="value" width="70%" } - <figcaption> -</figcaption> </figure> <figure markdown="span"> ![](../../assets/kernel/logits_vec.png){ align="center" alt="logits_vec" width="50%" } - <figcaption> -</figcaption> </figure> <figure markdown="span"> ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" } - <figcaption> -</figcaption> </figure> - Now we need to retrieve the value data and perform dot multiplication From 9520a989dfdf3a1db36798458ce525e9755f7438 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Fri, 23 May 2025 12:33:21 -0400 Subject: [PATCH 105/192] [Docs] Change mkdocs to not use directory urls (#18622) Signed-off-by: mgoin <mgoin64@gmail.com> --- mkdocs.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mkdocs.yaml b/mkdocs.yaml index 8468b2bd9690..b33998bb4084 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -119,3 +119,8 @@ extra_css: extra_javascript: - mkdocs/javascript/run_llm_widget.js - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML + +# Makes the url format end in .html rather than act as a dir +# So index.md generates as index.html and is available under URL /index.html +# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls +use_directory_urls: false From 6550114c9cd23198f3a88094ebe3e0d2bc8cb8de Mon Sep 17 00:00:00 2001 From: Chen Zhang <zhangch99@outlook.com> Date: Sat, 24 May 2025 00:39:47 +0800 Subject: [PATCH 106/192] [v1] Redo "Support multiple KV cache groups in GPU model runner (#17945)" (#18593) Signed-off-by: Chen Zhang <zhangch99@outlook.com> --- tests/v1/core/test_kv_cache_utils.py | 71 ++++- tests/v1/core/test_prefix_caching.py | 36 +-- tests/v1/worker/test_gpu_input_batch.py | 39 ++- tests/v1/worker/test_gpu_model_runner.py | 57 ++-- .../v1/shared_storage_connector.py | 6 +- .../attention/backends/mla/rocm_aiter_mla.py | 4 +- vllm/v1/core/kv_cache_manager.py | 34 ++- vllm/v1/core/kv_cache_utils.py | 13 +- vllm/v1/core/sched/output.py | 12 +- vllm/v1/core/sched/scheduler.py | 16 +- vllm/v1/kv_cache_interface.py | 42 +++ vllm/v1/worker/block_table.py | 41 +++ vllm/v1/worker/gpu_input_batch.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 253 +++++++++++------- vllm/v1/worker/tpu_model_runner.py | 36 +-- 15 files changed, 469 insertions(+), 203 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 1e2767e2d198..43a27da2dbe4 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -19,7 +19,8 @@ hash_request_tokens, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor) + KVCacheGroupSpec, KVCacheTensor, + SlidingWindowSpec) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -54,12 +55,14 @@ def new_kv_cache_spec(block_size=16, num_kv_heads=2, head_size=64, dtype=torch.float32, - use_mla=False): + use_mla=False, + sliding_window=None): return FullAttentionSpec(block_size=block_size, num_kv_heads=num_kv_heads, head_size=head_size, dtype=dtype, - 
use_mla=use_mla) + use_mla=use_mla, + sliding_window=sliding_window) def test_none_hash(monkeypatch): @@ -492,6 +495,68 @@ def test_unify_kv_cache_configs(): unify_kv_cache_configs(diff_kv_cache_config) +def test_merge_kv_cache_spec(): + same_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=32), + ] + merged_layer_spec = same_layer_specs[0].merge(same_layer_specs) + assert merged_layer_spec.block_size == 16 + assert merged_layer_spec.num_kv_heads == 32 + assert merged_layer_spec.head_size == 64 + assert merged_layer_spec.dtype == torch.float32 + assert merged_layer_spec.sliding_window is None + + different_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=16), + ] + with pytest.raises(AssertionError): + different_layer_specs[0].merge(different_layer_specs) + + full_spec = new_kv_cache_spec(num_kv_heads=32) + different_type_layer_specs = [ + full_spec, + SlidingWindowSpec( + block_size=full_spec.block_size, + num_kv_heads=full_spec.num_kv_heads, + head_size=full_spec.head_size, + dtype=full_spec.dtype, + use_mla=full_spec.use_mla, + sliding_window=1, + ), + ] + with pytest.raises(AssertionError): + different_type_layer_specs[0].merge(different_type_layer_specs) + with pytest.raises(AssertionError): + different_type_layer_specs[1].merge(different_type_layer_specs) + + different_sliding_window_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=2), + ] + with pytest.raises(ValueError): + different_sliding_window_layer_specs[0].merge( + different_sliding_window_layer_specs) + + same_sliding_window_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + ] + merged_layer_spec = same_sliding_window_layer_specs[0].merge( + same_sliding_window_layer_specs) + assert merged_layer_spec.sliding_window == 1 + + same_sliding_window_layer_spec_with_none = [ + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=None), + ] + merged_layer_spec = same_sliding_window_layer_spec_with_none[0].merge( + same_sliding_window_layer_spec_with_none) + assert merged_layer_spec.sliding_window == 1 + + @pytest.mark.parametrize( ("model_id", "max_model_len", "want_estimated_max_len"), [ ("Qwen/Qwen1.5-7B", 16385, 16384), diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 2d7411381e16..3da27786b1f2 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -84,7 +84,7 @@ def test_prefill(hash_algo): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] # Check full block metadata parent_block_hash = None @@ -107,13 +107,13 @@ def test_prefill(hash_algo): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] 
for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -141,13 +141,13 @@ def test_prefill(hash_algo): req2 = make_request("2", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(manager.req_to_block_hashes[req2.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req2, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [6] + assert blocks.get_block_ids() == [[6]] # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. @@ -171,7 +171,7 @@ def test_prefill(hash_algo): len(computed_blocks.blocks) * 16, computed_blocks) # This block ID order also checks the eviction order. - assert blocks.get_block_ids() == [7, 8, 9, 10, 4, 5, 6, 3, 2, 1] + assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]] assert manager.block_pool.free_block_queue.num_free_blocks == 0 assert manager.block_pool.free_block_queue.free_list_head is None assert manager.block_pool.free_block_queue.free_list_tail is None @@ -208,7 +208,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0_block_hashes = [b.block_hash for b in blocks.blocks] # Check full block metadata @@ -233,13 +233,13 @@ def test_prefill_plp(): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -277,11 +277,11 @@ def test_prefill_plp(): block_ids = blocks.get_block_ids() # Duplicate cached blocks have different ids but same hashes vs request #0 assert [b.block_hash for b in blocks.blocks] == req0_block_hashes - assert block_ids != [1, 2, 3, 4] + assert block_ids != [[1, 2, 3, 4]] # Request #2 block hashes are valid since request #0 hashes are. # Check block reference counts. - for block_id in block_ids: + for block_id in block_ids[0]: assert manager.block_pool.blocks[block_id].ref_cnt == 1 manager.free(req2) @@ -307,7 +307,7 @@ def test_decode(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] # Append slots without allocating a new block. req0.num_computed_tokens = 55 @@ -379,12 +379,12 @@ def test_evict(): # Touch the first 2 blocks. 
req2 = make_request("2", list(range(2 * 16 + 3))) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert computed_blocks.get_block_ids() == [1, 2] + assert computed_blocks.get_block_ids() == [[1, 2]] assert num_computed_tokens == 2 * 16 blocks = manager.allocate_slots(req2, 3, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [10] + assert blocks.get_block_ids() == [[10]] assert manager.block_pool.free_block_queue.num_free_blocks == 7 @@ -625,7 +625,7 @@ def test_mm_prefix_caching(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0.num_computed_tokens = 59 # Append slots without allocating a new block. @@ -686,7 +686,7 @@ def test_cache_key_salting(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0.num_computed_tokens = 59 # Append slots without allocating a new block. @@ -797,7 +797,7 @@ def test_reset_prefix_cache(): all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) blocks = manager.allocate_slots(req0, 55) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids @@ -808,7 +808,7 @@ def test_reset_prefix_cache(): blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] # Failed to reset prefix cache because some blocks are not freed yet. 
assert not manager.reset_prefix_cache() diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 7b1359c8576f..27741bd156be 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -9,9 +9,11 @@ from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.worker.gpu_input_batch import (BlockTable, CachedRequestState, - InputBatch) +from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 @@ -22,6 +24,27 @@ MAX_NUM_PROMPT_TOKENS = 64 +def get_kv_cache_config() -> KVCacheConfig: + return KVCacheConfig( + num_blocks=10, + tensors={ + "layer.0": KVCacheTensor(size=1024), + }, + kv_cache_groups=[ + KVCacheGroupSpec( + layer_names=["layer.0"], + kv_cache_spec=FullAttentionSpec( + block_size=1, + num_kv_heads=1, + head_size=16, + dtype=torch.float16, + use_mla=False, + ), + ), + ], + ) + + def _compare_objs(obj1, obj2): attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a))) attr_names = set([ @@ -41,6 +64,10 @@ def _compare_objs(obj1, obj2): elif isinstance(a, np.ndarray): if np.allclose(a, b): is_same = True + elif isinstance(a, MultiGroupBlockTable): + for a_i, b_i in zip(a.block_tables, b.block_tables): + _compare_objs(a_i, b_i) + is_same = True elif isinstance(a, (BlockTable, SamplingMetadata)): _compare_objs(a, b) is_same = True # if we make it here must be same @@ -198,7 +225,7 @@ def _construct_cached_request_state(req_id_suffix: int): sampling_params=_create_sampling_params(), mm_inputs=[], mm_positions=[], - block_ids=[], + block_ids=[[]], generator=None, num_computed_tokens=len(output_token_ids), output_token_ids=output_token_ids, @@ -220,11 +247,11 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + block_size=1, ) reqs: list[CachedRequestState] = [] req_id_reqs = {} @@ -310,20 +337,20 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + block_size=1, ) ref_input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + block_size=1, ) reqs: list[CachedRequestState] = [] diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 725747294fd8..b8c3d88617d0 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 -import weakref import pytest -import torch -from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, + SchedulerConfig, VllmConfig) from 
vllm.sampling_params import SamplingParams from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) -from vllm.v1.kv_cache_interface import FullAttentionSpec +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -17,13 +18,34 @@ def initialize_kv_cache(runner: GPUModelRunner): """ Only perform necessary steps in GPUModelRunner.initialize_kv_cache() """ - kv_cache_spec = FullAttentionSpec(block_size=16, - num_kv_heads=1, - head_size=64, - dtype=torch.float16, - use_mla=False) - runner.attn_metadata_builder = runner.attn_backend.get_builder_cls()( - weakref.proxy(runner), kv_cache_spec, runner.input_batch.block_table) + kv_cache_config = KVCacheConfig( + num_blocks=10, + tensors={ + "layer.0": KVCacheTensor(size=1024), + }, + kv_cache_groups=[ + KVCacheGroupSpec( + layer_names=["layer.0"], + kv_cache_spec=FullAttentionSpec( + block_size=16, + num_kv_heads=runner.model_config.get_num_kv_heads( + runner.parallel_config), + head_size=runner.model_config.get_head_size(), + dtype=runner.kv_cache_dtype, + use_mla=False, + )) + ]) + runner.kv_cache_config = kv_cache_config + runner.input_batch = InputBatch( + max_num_reqs=runner.max_num_reqs, + max_model_len=runner.max_model_len, + max_num_batched_tokens=runner.max_num_tokens, + device=runner.device, + pin_memory=runner.pin_memory, + vocab_size=runner.model_config.get_vocab_size(), + block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size, + ) + runner.initialize_attn_backend(kv_cache_config) @pytest.fixture @@ -48,10 +70,12 @@ def model_runner(): swap_space=0, cache_dtype="auto", ) + parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, cache_config=cache_config, scheduler_config=scheduler_config, + parallel_config=parallel_config, ) device = "cuda" @@ -73,7 +97,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[0], + block_ids=[[0]], num_computed_tokens=0, lora_request=None, )) @@ -111,13 +135,14 @@ def _is_sampling_metadata_changed(model_runner, def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: req_index = model_runner.input_batch.req_id_to_index[req_id] - block_table = model_runner.input_batch.block_table + block_table = model_runner.input_batch.block_table[0] req_state = model_runner.requests[req_id] - if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids): + if block_table.num_blocks_per_row[req_index] != len( + req_state.block_ids[0]): return False num_blocks = block_table.num_blocks_per_row[req_index] return (block_table.block_table_np[req_index, :num_blocks] == - req_state.block_ids).all() + req_state.block_ids[0]).all() def test_update_states_new_request(model_runner): @@ -200,7 +225,7 @@ def test_update_states_request_resumed(model_runner): req_id=req_id, resumed_from_preemption=False, new_token_ids=[], - new_block_ids=[], + new_block_ids=[[]], num_computed_tokens=0, ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 0fedb6fd5ed9..0421a65a2c81 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -288,7 +288,7 @@ def build_connector_meta( for new_req in scheduler_output.scheduled_new_reqs: if new_req.req_id in self._requests_need_load: meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size, is_store=False) total_need_load += 1 @@ -299,7 +299,7 @@ def build_connector_meta( # the original prompt tokens. if not self._found_match_for_request(new_req): meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size, is_store=True) @@ -319,7 +319,7 @@ def build_connector_meta( # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. - block_ids = cached_req.new_block_ids + block_ids = cached_req.new_block_ids[0] meta.add_request(token_ids=token_ids, block_ids=block_ids, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 56ac834b4d7e..31980e94a037 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -69,13 +69,13 @@ def __init__(self, runner, kv_cache_spec: AttentionSpec, max_model_len = self.runner.model_config.max_model_len assert max_model_len == 32768,\ "AITER MLA requires max_model_len=32768" - assert self.runner.block_size == 1, "AITER MLA" \ + assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ "only supports block size 1." def _get_paged_kv_tensors( self, block_table: torch.Tensor, seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]: - page_size = self.runner.block_size + page_size = self.kv_cache_spec.block_size block_table_bounds = (seq_lens + page_size - 1) // page_size device = self.runner.device diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 598fc871110e..da18ece7555a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -32,9 +32,16 @@ def create_empty(cls) -> "KVCacheBlocks": """Creates a new KVCacheBlocks instance with no blocks.""" return cls([]) - def get_block_ids(self) -> list[int]: - """Converts the KVCacheBlocks instance to a list of block IDs.""" - return [block.block_id for block in self.blocks] + def get_block_ids(self) -> list[list[int]]: + """ + Converts the KVCacheBlocks instance to block_ids. + + Returns: + list[list[int]]: A two-level list where + * the outer list corresponds to KV cache groups (only 1 group now) + * each inner list contains the block_ids of the blocks in that group + """ + return [[block.block_id for block in self.blocks]] def get_unhashed_block_ids(self) -> list[int]: """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" @@ -300,9 +307,9 @@ def get_num_common_prefix_blocks( self, request: Request, num_running_requests: int, - ) -> int: + ) -> list[int]: """Calculate the number of common prefix blocks shared by all requests - in the RUNNING state. + in the RUNNING state for each kv cache group. The function determines this by selecting any request and iterating through its blocks. A block is considered a common prefix block if its @@ -332,11 +339,14 @@ def get_num_common_prefix_blocks( requests in the current step. Returns: - int: The number of common prefix blocks. + list[int]: The number of common prefix blocks for each kv cache + group. 
""" assert request.status == RequestStatus.RUNNING - return self.single_type_manager.get_num_common_prefix_blocks( - request.request_id, num_running_requests) + return [ + self.single_type_manager.get_num_common_prefix_blocks( + request.request_id, num_running_requests) + ] def free_block_hashes(self, request: Request) -> None: """Discard the block hashes for the request. @@ -354,10 +364,8 @@ def take_events(self) -> list[KVCacheEvent]: """ return self.block_pool.take_events() - def get_block_ids(self, request_id: str) -> list[int]: + def get_block_ids(self, request_id: str) -> list[list[int]]: """Get the block ids of a request.""" assert request_id in self.single_type_manager.req_to_blocks - return [ - block.block_id - for block in self.single_type_manager.req_to_blocks[request_id] - ] + return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id] + ).get_block_ids() diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 27c515835087..403b5401be75 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -577,14 +577,12 @@ def create_kv_cache_group_specs( """ kv_cache_groups = [] for layer_names_one_group in grouped_layer_names: - layer_spec = kv_cache_spec[layer_names_one_group[0]] - assert all( - kv_cache_spec[layer_name] == layer_spec - for layer_name in layer_names_one_group[1:]), ( - "All layers in the same KV cache group must share the same " - "KVCacheSpec.") + layer_specs = [ + kv_cache_spec[layer_name] for layer_name in layer_names_one_group + ] + merged_layer_spec = layer_specs[0].merge(layer_specs) kv_cache_groups.append( - KVCacheGroupSpec(layer_names_one_group, layer_spec)) + KVCacheGroupSpec(layer_names_one_group, merged_layer_spec)) return kv_cache_groups @@ -683,6 +681,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): head_size=spec.head_size, dtype=spec.dtype, use_mla=spec.use_mla, + sliding_window=spec.sliding_window, ) diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 24032498e50b..257234430983 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -26,7 +26,7 @@ class NewRequestData: mm_hashes: list[str] mm_positions: list[PlaceholderRange] sampling_params: SamplingParams - block_ids: list[int] + block_ids: list[list[int]] num_computed_tokens: int lora_request: Optional[LoRARequest] @@ -34,7 +34,7 @@ class NewRequestData: def from_request( cls, request: Request, - block_ids: list[int], + block_ids: list[list[int]], ) -> NewRequestData: return cls( req_id=request.request_id, @@ -85,7 +85,7 @@ class CachedRequestData: # request's block IDs instead of appending to the existing block IDs. resumed_from_preemption: bool new_token_ids: list[int] - new_block_ids: list[int] + new_block_ids: list[list[int]] num_computed_tokens: int @classmethod @@ -94,7 +94,7 @@ def from_request( request: Request, resumed_from_preemption: bool, new_token_ids: list[int], - new_block_ids: list[int], + new_block_ids: list[list[int]], ) -> CachedRequestData: return cls( req_id=request.request_id, @@ -131,9 +131,9 @@ class SchedulerOutput: # E.g., if a request has [0, 1], it could mean the vision encoder needs # to process that the request's 0-th and 1-th images in the current step. scheduled_encoder_inputs: dict[str, list[int]] - # Number of common prefix blocks for all requests. + # Number of common prefix blocks for all requests in each KV cache group. # This can be used for cascade attention. 
- num_common_prefix_blocks: int + num_common_prefix_blocks: list[int] # Request IDs that are finished in between the previous and the current # steps. This is used to notify the workers about the finished requests diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c873ced343bf..1f54560a10a7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -173,7 +173,7 @@ def schedule(self) -> SchedulerOutput: # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, list[int]] = {} + req_to_new_block_ids: dict[str, list[list[int]]] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -486,7 +486,8 @@ def schedule(self) -> SchedulerOutput: # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. - num_common_prefix_blocks = 0 + num_common_prefix_blocks = [0] * len( + self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] num_common_prefix_blocks = ( @@ -573,7 +574,7 @@ def _make_cached_request_data( request: Request, num_scheduled_tokens: int, num_scheduled_spec_tokens: int, - new_block_ids: list[int], + new_block_ids: list[list[int]], resumed_from_preemption: bool, ) -> CachedRequestData: # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -949,7 +950,9 @@ def _connector_finished( """ if self.connector is None: return False, None - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + assert len(self.kv_cache_config.kv_cache_groups + ) == 1, "KV connector only supports one KV cache group now" + block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] return self.connector.request_finished(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: @@ -966,9 +969,10 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: """ if request.request_id not in self.finished_recving_kv_req_ids: return False - + assert len(self.kv_cache_config.kv_cache_groups + ) == 1, "KV connector only supports one KV cache group now" # Now that the blocks are ready, actually cache them. - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] num_computed_tokens = len(block_ids) * self.block_size if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 4fc0844cd1f4..2747fc7fabd1 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,8 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +import copy from dataclasses import dataclass +from typing import Optional import torch +from typing_extensions import Self from vllm.config import VllmConfig from vllm.logger import init_logger @@ -53,6 +56,16 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: """ raise NotImplementedError + @classmethod + def merge(cls, specs: list[Self]) -> Self: + """ + Merge a list of KVCacheSpec objects into a single KVCacheSpec object. 
+ """ + assert all(spec.type_id == specs[0].type_id for spec in specs[1:]), ( + "All layers in the same KV cache group must share the same " + "type_id.") + return copy.deepcopy(specs[0]) + @dataclass class AttentionSpec(KVCacheSpec): @@ -71,6 +84,16 @@ def page_size_bytes(self) -> int: @dataclass class FullAttentionSpec(AttentionSpec): + sliding_window: Optional[int] = None + """ + When hybrid allocator is disabled and the model contains both full + attention layers and sliding window attention layers, sliding + window attention are regarded as full attention in KV cache manager + (blocks are allocated for all tokens), while computed as sliding window + attention in model runner. + In this case, we use FullAttentionSpec and record the sliding window size. + Default to None for not using sliding window attention. + """ @property def type_id(self) -> str: @@ -80,6 +103,25 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len return cdiv(max_model_len, self.block_size) * self.page_size_bytes + @classmethod + def merge(cls, specs: list[Self]) -> Self: + """ + Merge a list of FullAttentionSpec objects into a single + FullAttentionSpec object. + """ + merged_spec = super().merge(specs) + sliding_window = set(spec.sliding_window for spec in specs + if spec.sliding_window is not None) + if len(sliding_window) == 0: + merged_spec.sliding_window = None + elif len(sliding_window) == 1: + merged_spec.sliding_window = sliding_window.pop() + else: + raise ValueError( + "All sliding window layers in the same KV cache group " + "must have the same window size.") + return merged_spec + @dataclass class SlidingWindowSpec(AttentionSpec): diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 581d3d9bd11b..576086ebeb7f 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,6 +4,7 @@ import torch from vllm.logger import init_logger +from vllm.utils import cdiv logger = init_logger(__name__) @@ -96,3 +97,43 @@ def get_cpu_tensor(self) -> torch.Tensor: def get_numpy_array(self) -> np.ndarray: """Returns the numpy array of the block table.""" return self.block_table_np + + +class MultiGroupBlockTable: + """The BlockTables for each KV cache group.""" + + def __init__(self, max_num_reqs: int, max_model_len: int, + max_num_batched_tokens: int, pin_memory: bool, + device: torch.device, block_size: int) -> None: + self.block_tables = [ + BlockTable(max_num_reqs, cdiv(max_model_len, block_size), + max_num_batched_tokens, pin_memory, device) + ] + + def append_row(self, block_ids: list[list[int]], row_idx: int) -> None: + for i, block_table in enumerate(self.block_tables): + block_table.append_row(block_ids[i], row_idx) + + def add_row(self, block_ids: list[list[int]], row_idx: int) -> None: + for i, block_table in enumerate(self.block_tables): + block_table.add_row(block_ids[i], row_idx) + + def move_row(self, src: int, tgt: int) -> None: + for block_table in self.block_tables: + block_table.move_row(src, tgt) + + def swap_row(self, src: int, tgt: int) -> None: + for block_table in self.block_tables: + block_table.swap_row(src, tgt) + + def commit(self, num_reqs: int) -> None: + for block_table in self.block_tables: + block_table.commit(num_reqs) + + def clear(self) -> None: + for block_table in self.block_tables: + block_table.clear() + + def __getitem__(self, idx: int) -> "BlockTable": + """Returns the BlockTable for the i-th KV cache group.""" + return self.block_tables[idx] diff --git 
a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 871654fca366..b3e65917d3cc 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -14,7 +14,7 @@ from vllm.v1.outputs import LogprobsTensors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import copy_slice -from vllm.v1.worker.block_table import BlockTable +from vllm.v1.worker.block_table import MultiGroupBlockTable _SAMPLING_EPS = 1e-5 @@ -29,7 +29,7 @@ class CachedRequestState: sampling_params: SamplingParams generator: Optional[torch.Generator] - block_ids: list[int] + block_ids: list[list[int]] num_computed_tokens: int output_token_ids: list[int] @@ -58,15 +58,14 @@ def __init__( self, max_num_reqs: int, max_model_len: int, - max_num_blocks_per_req: int, max_num_batched_tokens: int, device: torch.device, pin_memory: bool, vocab_size: int, + block_size: int, ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len - self.max_num_blocks_per_req = max_num_blocks_per_req self.max_num_batched_tokens = max_num_batched_tokens self.device = device self.pin_memory = pin_memory @@ -99,12 +98,13 @@ def __init__( self.num_computed_tokens_cpu_tensor.numpy() # Block table. - self.block_table = BlockTable( + self.block_table = MultiGroupBlockTable( max_num_reqs=max_num_reqs, - max_num_blocks_per_req=max_num_blocks_per_req, + max_model_len=max_model_len, max_num_batched_tokens=max_num_batched_tokens, pin_memory=pin_memory, device=device, + block_size=block_size, ) # Sampling-related. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6d4888363d50..c7072d0ca330 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -12,6 +12,8 @@ import torch.nn as nn from vllm.attention import AttentionType, get_attn_backend +from vllm.attention.backends.abstract import (AttentionBackend, + AttentionMetadataBuilder) from vllm.attention.layer import Attention from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import (CompilationLevel, VllmConfig, @@ -32,8 +34,8 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LayerBlockType, LazyLoader, cdiv, - check_use_alibi, is_pin_memory_available) + GiB_bytes, LazyLoader, cdiv, check_use_alibi, + is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -51,6 +53,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.utils import is_spec_decode_supported from vllm.v1.utils import bind_kv_cache +from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin @@ -103,59 +106,17 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - # NOTE(woosuk): sliding_window is None for models with interleaved - # attention. Use interleaved_sliding_window instead. 
- self.sliding_window = model_config.get_sliding_window() - self.interleaved_sliding_window = getattr( - model_config.hf_text_config, "interleaved_sliding_window", None) - self.window_size = (self.sliding_window - or self.interleaved_sliding_window) - self.is_multimodal_model = model_config.is_multimodal_model - self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len - self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs # Model-related. - self.num_attn_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) self.num_query_heads = model_config.get_num_attention_heads( parallel_config) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() self.attention_chunk_size = model_config.attention_chunk_size - self.attn_backend = get_attn_backend( - self.head_size, - self.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) - if self.attn_backend is None: - error_msg = ( - f"Error with get_att_backend: {self.head_size=}, " - f"{self.dtype=}, {self.kv_cache_dtype=}, {self.block_size=}, " - f"{self.model_config.is_attention_free=}, " - f"{self.model_config.use_mla=}") - logger.error(error_msg) - raise NotImplementedError( - "Non-Attention backend is not supported by V1 GPUModelRunner.") - - if self.vllm_config.compilation_config.full_cuda_graph: - attn_backend_name = self.attn_backend.__name__ - flash_attn_version = get_flash_attn_version() - if attn_backend_name != "FlashAttentionBackend" or \ - flash_attn_version != 3: - raise ValueError( - f"full_cuda_graph is only supported with " - f"FA3. Current attention backend is {attn_backend_name}, " - f"FlashAttention version is {flash_attn_version}.") - self.cascade_attn_enabled = not self.model_config.disable_cascade_attn # Multi-modal data support @@ -177,8 +138,10 @@ def __init__( # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] + self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] + self.attn_backends: list[type[AttentionBackend]] = [] # self.kv_cache_config: KVCacheConfig - # self.attn_metadata_builder: type[AttentionMetadataBuilder] + # self.input_batch: InputBatch # Persistent batch. # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} @@ -207,15 +170,15 @@ def __init__( # Request states. self.requests: dict[str, CachedRequestState] = {} - # Persistent batch. + self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, max_model_len=self.max_model_len, - max_num_blocks_per_req=self.max_num_blocks_per_req, max_num_batched_tokens=self.max_num_tokens, device=self.device, pin_memory=self.pin_memory, - vocab_size=model_config.get_vocab_size(), + vocab_size=self.model_config.get_vocab_size(), + block_size=self.cache_config.block_size, ) self.use_cuda_graph = (self.vllm_config.compilation_config.level @@ -311,6 +274,31 @@ def __init__( pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: + """ + Update the order of requests in the batch based on the attention + backend's needs. 
For example, some attention backends (namely MLA) may + want to separate requests based on if the attention computation will be + compute-bound or memory-bound. + + Args: + scheduler_output: The scheduler output. + + Returns: + True if the batch was reordered, False otherwise. + """ + batch_reordered = self.attn_metadata_builders[0].reorder_batch( + self.input_batch, scheduler_output) + + # For models with multiple KV cache groups, the groups should agree on + # the same order of requests. We ensure this by only allowing the first + # group to reorder the batch and asserting that all other groups do not + # reorder the batch. + for i in range(1, len(self.kv_cache_config.kv_cache_groups)): + assert not self.attn_metadata_builders[i].reorder_batch( + self.input_batch, scheduler_output) + return batch_reordered + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -447,7 +435,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - req_state.block_ids.extend(req_data.new_block_ids) + for i in range(len(self.kv_cache_config.kv_cache_groups)): + req_state.block_ids[i].extend(req_data.new_block_ids[i]) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones. @@ -505,11 +494,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - # Some attention backends (namely MLA) may want to separate requests - # based on if the attention computation will be compute-bound or - # memory-bound. This gives them a hook to do that. - batch_reordered = self.attn_metadata_builder.reorder_batch( - self.input_batch, scheduler_output) + batch_reordered = self._may_reorder_batch(scheduler_output) if batch_changed or batch_reordered: self.input_batch.refresh_sampling_metadata() @@ -577,21 +562,29 @@ def _prepare_inputs( torch.from_numpy(token_indices), out=self.input_ids_cpu[:total_num_scheduled_tokens]) - # Calculate the slot mapping. - # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] - # where K is the max_num_blocks_per_req and the block size is 2. - # NOTE(woosuk): We can't simply use `token_indices // block_size` here - # because M (max_model_len) is not necessarily divisible by block_size. - block_table_indices = (req_indices * self.max_num_blocks_per_req + - positions_np // self.block_size) - block_table_cpu = self.input_batch.block_table.get_cpu_tensor() - block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() - block_offsets = positions_np % self.block_size - np.add(block_numbers * self.block_size, - block_offsets, - out=self.input_batch.block_table. - slot_mapping_np[:total_num_scheduled_tokens]) + # Calculate the slot mapping for each KV cache group. + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + block_size = kv_cache_group_spec.kv_cache_spec.block_size + block_table: BlockTable = self.input_batch.block_table[ + kv_cache_group_id] + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. 
+ # NOTE(woosuk): We can't simply use `token_indices // block_size` + # here because M (max_model_len) is not necessarily divisible by + # block_size. + block_table_indices = ( + req_indices * block_table.max_num_blocks_per_req + + positions_np // block_size) + block_table_cpu = block_table.get_cpu_tensor() + block_numbers = block_table_cpu.flatten( + )[block_table_indices].numpy() + block_offsets = positions_np % block_size + np.add( + block_numbers * block_size, + block_offsets, + out=block_table.slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. self.query_start_loc_np[0] = 0 @@ -633,10 +626,6 @@ def _prepare_inputs( attn_metadata: dict[str, FlashAttentionMetadata] = {} # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. - # NOTE(Chen): there is exactly one KV cache group that contains all - # attetnion layers in the model for now, so the current logic for - # getting attn_metadata is not related to kv_cache_group information. - # Will extend this part to support multiple KV cache groups later. for kv_cache_group_id, kv_cache_group_spec in enumerate( self.kv_cache_config.kv_cache_groups): @@ -645,15 +634,19 @@ def _prepare_inputs( if self.cascade_attn_enabled: common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, - scheduler_output.num_common_prefix_blocks, + scheduler_output. + num_common_prefix_blocks[kv_cache_group_id], + kv_cache_group_spec.kv_cache_spec, + self.attn_metadata_builders[kv_cache_group_id], ) - attn_metadata_i = self.attn_metadata_builder.build( - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, - common_prefix_len=common_prefix_len, - common_attn_metadata=common_attn_metadata) + attn_metadata_i = ( + self.attn_metadata_builders[kv_cache_group_id].build( + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata)) for layer_name in kv_cache_group_spec.layer_names: attn_metadata[layer_name] = attn_metadata_i @@ -691,6 +684,8 @@ def _compute_cascade_attn_prefix_len( self, num_scheduled_tokens: np.ndarray, num_common_prefix_blocks: int, + kv_cache_spec: KVCacheSpec, + attn_metadata_builder: AttentionMetadataBuilder, ) -> int: """Compute the length of the common prefix for cascade attention. @@ -709,7 +704,7 @@ def _compute_cascade_attn_prefix_len( Returns: int: Length of common prefix in tokens. """ - common_prefix_len = num_common_prefix_blocks * self.block_size + common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size if common_prefix_len == 0: # Common case. return 0 @@ -758,15 +753,19 @@ def _compute_cascade_attn_prefix_len( common_prefix_len, self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) # common_prefix_len should be a multiple of the block size. 
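The per-group slot-mapping loop above is plain integer arithmetic over a flattened block table. Below is a self-contained NumPy sketch of the same computation, with a toy block table and sizes made up for illustration; in the multi-group version this simply runs once per KV cache group, each time with that group's own block table and block size:

```python
import numpy as np

# Toy setup: 3 requests, block_size = 2, room for K = 4 blocks per request.
block_size = 2
max_num_blocks_per_req = 4
block_table_cpu = np.array([
    [10, 11, 0, 0],   # request 0 owns physical blocks 10, 11
    [20, 21, 22, 0],  # request 1 owns physical blocks 20, 21, 22
    [30, 31, 0, 0],   # request 2 owns physical blocks 30, 31
])

# One entry per scheduled token: owning request and token position.
req_indices = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 2])
positions_np = np.array([0, 1, 0, 1, 2, 3, 4, 0, 1, 2])

# Same arithmetic as the hunk above: index into the flattened block table,
# then turn (physical block, in-block offset) into a flat slot id.
block_table_indices = (req_indices * max_num_blocks_per_req +
                       positions_np // block_size)
block_numbers = block_table_cpu.flatten()[block_table_indices]
block_offsets = positions_np % block_size
slot_mapping = block_numbers * block_size + block_offsets
print(slot_mapping)  # [20 21 40 41 42 43 44 60 61 62]
```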
- common_prefix_len = (common_prefix_len // self.block_size * - self.block_size) - use_cascade = self.attn_metadata_builder.use_cascade_attention( + common_prefix_len = (common_prefix_len // kv_cache_spec.block_size * + kv_cache_spec.block_size) + use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or + (isinstance(kv_cache_spec, FullAttentionSpec) + and kv_cache_spec.sliding_window is not None)) + assert isinstance(kv_cache_spec, AttentionSpec) + use_cascade = attn_metadata_builder.use_cascade_attention( common_prefix_len=common_prefix_len, query_lens=num_scheduled_tokens, num_query_heads=self.num_query_heads, - num_kv_heads=self.num_kv_heads, + num_kv_heads=kv_cache_spec.num_kv_heads, use_alibi=self.use_alibi, - use_sliding_window=self.window_size is not None, + use_sliding_window=use_sliding_window, num_sms=self.num_sms, ) return common_prefix_len if use_cascade else 0 @@ -1661,7 +1660,7 @@ def _dummy_run( dtype=np.int32) if skip_attn: - attn_metadata = None + attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] seq_lens = self.seq_lens[:num_reqs] @@ -1669,13 +1668,19 @@ def _dummy_run( common_attn_metadata = CommonAttentionMetadata( query_start_loc=query_start_loc, seq_lens=seq_lens) - attn_metadata = self.attn_metadata_builder.build( - num_reqs=num_tokens, - num_actual_tokens=num_tokens, - max_query_len=num_tokens, - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - ) + attn_metadata = {} + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + attn_metadata_i = ( + self.attn_metadata_builders[kv_cache_group_id].build( + num_reqs=num_tokens, + num_actual_tokens=num_tokens, + max_query_len=num_tokens, + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + )) + for layer_name in kv_cache_group_spec.layer_names: + attn_metadata[layer_name] = attn_metadata_i with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): @@ -1909,6 +1914,56 @@ def capture_model(self) -> None: logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) + def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize the attention backends and attention metadata builders. 
+ """ + assert len(self.attn_backends) == 0 and len( + self.attn_metadata_builders + ) == 0, "Attention backends are already initialized" + for i, kv_cache_group_spec in enumerate( + kv_cache_config.kv_cache_groups): + kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if not isinstance(kv_cache_spec, AttentionSpec): + raise NotImplementedError( + "Only AttentionSpec is supported for now.") + attn_backend_i = get_attn_backend( + kv_cache_spec.head_size, + self.dtype, + kv_cache_spec.dtype, + kv_cache_spec.block_size, + self.model_config.is_attention_free, + use_mla=kv_cache_spec.use_mla, + ) + if attn_backend_i is None: + error_msg = ( + f"Error with get_attn_backend: {kv_cache_spec.head_size=}, " + f"{self.dtype=}, {kv_cache_spec.dtype=}, " + f"{kv_cache_spec.block_size=}, " + f"{self.model_config.is_attention_free=}, " + f"{kv_cache_spec.use_mla=}") + logger.error(error_msg) + raise NotImplementedError( + "Non-Attention backend is not supported by V1 " + "GPUModelRunner.") + + if self.vllm_config.compilation_config.full_cuda_graph: + attn_backend_name = attn_backend_i.__name__ + flash_attn_version = get_flash_attn_version() + if attn_backend_name != "FlashAttentionBackend" or \ + flash_attn_version != 3: + raise ValueError( + f"full_cuda_graph is only supported with " + f"FA3. Current attention backend is " + f"{attn_backend_name}, FlashAttention version is " + f"{flash_attn_version}.") + + block_table_i = self.input_batch.block_table[i] + attn_metadata_builder_i = attn_backend_i.get_builder_cls()( + weakref.proxy(self), kv_cache_spec, block_table_i) + self.attn_backends.append(attn_backend_i) + self.attn_metadata_builders.append(attn_metadata_builder_i) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. @@ -1921,10 +1976,11 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: "Hybrid models with more than one KV cache type are not " "supported yet.") self.kv_cache_config = kv_cache_config + self.initialize_attn_backend(kv_cache_config) kv_caches: dict[str, torch.Tensor] = {} - for kv_cache_group in kv_cache_config.kv_cache_groups: + for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups): kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: tensor_config = kv_cache_config.tensors[layer_name] @@ -1939,7 +1995,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: # the min of all `num_blocks`. Verify it here. 
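`initialize_attn_backend` above replaces the single model-wide attention backend with one backend and one metadata builder per KV cache group, kept in parallel lists indexed by group id. A condensed sketch of that pattern with made-up stand-in types (`AttnSpecSketch`, the two backend classes, and `select_backend` are illustrative only, not vLLM APIs):

```python
from dataclasses import dataclass


@dataclass
class AttnSpecSketch:
    head_size: int
    block_size: int
    use_mla: bool = False


class FlashBackendSketch:
    def __init__(self, spec: AttnSpecSketch):
        self.spec = spec


class MLABackendSketch:
    def __init__(self, spec: AttnSpecSketch):
        self.spec = spec


def select_backend(spec: AttnSpecSketch):
    # Stand-in for get_attn_backend(): the choice is now driven by each
    # group's spec rather than by a single model-wide head size / block size.
    return MLABackendSketch(spec) if spec.use_mla else FlashBackendSketch(spec)


kv_cache_group_specs = [
    AttnSpecSketch(head_size=128, block_size=16),
    AttnSpecSketch(head_size=576, block_size=64, use_mla=True),
]

# Parallel per-group lists, mirroring self.attn_backends /
# self.attn_metadata_builders in the diff.
attn_backends = [select_backend(spec) for spec in kv_cache_group_specs]
assert isinstance(attn_backends[0], FlashBackendSketch)
assert isinstance(attn_backends[1], MLABackendSketch)
```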
assert num_blocks >= kv_cache_config.num_blocks if isinstance(kv_cache_spec, AttentionSpec): - kv_cache_shape = self.attn_backend.get_kv_cache_shape( + kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype @@ -1959,11 +2015,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: if has_kv_transfer_group(): get_kv_transfer_group().register_kv_caches(kv_caches) - self.attn_metadata_builder = self.attn_backend.get_builder_cls()( - weakref.proxy(self), - kv_cache_config.kv_cache_groups[0].kv_cache_spec, - self.input_batch.block_table) - def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index b4daf5a34678..b13ff9f97e6f 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -171,19 +171,10 @@ def __init__( self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} + # self.input_batch: InputBatch # Persistent batch. # Request states. self.requests: dict[str, CachedRequestState] = {} - # Persistent batch. - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_blocks_per_req=self.max_num_blocks_per_req, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.vocab_size, - ) # Cached torch/numpy tensor # The pytorch tensor and numpy array share the same buffer. @@ -199,7 +190,7 @@ def __init__( self.block_table_cpu = torch.zeros( (self.max_num_reqs, self.max_num_blocks_per_req), - dtype=self.input_batch.block_table.get_cpu_tensor().dtype, + dtype=torch.int32, device="cpu") self.query_start_loc_cpu = torch.zeros(self.max_num_tokens + 1, @@ -524,12 +515,12 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. - block_table_cpu = self.input_batch.block_table.get_cpu_tensor() + block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, - out=self.input_batch.block_table. + out=self.input_batch.block_table[0]. slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. @@ -554,15 +545,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): self.position_ids = self.positions_cpu[: padded_total_num_scheduled_tokens].to( self.device) - self.input_batch.block_table.slot_mapping_cpu[ + self.input_batch.block_table[0].slot_mapping_cpu[ total_num_scheduled_tokens:] = _PAD_SLOT_ID slot_mapping = ( - self.input_batch.block_table. + self.input_batch.block_table[0]. 
slot_mapping_cpu[:padded_total_num_scheduled_tokens].to( self.device)) block_tables = self.block_table_cpu[:self.max_num_reqs] block_tables[:num_reqs, :self.max_num_blocks_per_req] = ( - self.input_batch.block_table.get_cpu_tensor()[:num_reqs]) + self.input_batch.block_table[0].get_cpu_tensor()[:num_reqs]) block_tables = block_tables.to(self.device) query_start_loc = self.query_start_loc_cpu[:self.max_num_reqs + 1].to( self.device) @@ -1263,6 +1254,19 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: "Hybrid models with more than one KV cache type are not " "supported yet.") + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec. + block_size, + ) + assert self.block_table_cpu.dtype == self.input_batch.block_table[ + 0].get_cpu_tensor().dtype + kv_caches: dict[str, torch.Tensor] = {} for kv_cache_group in kv_cache_config.kv_cache_groups: From 8ddd1cf26ae484481126f6f9af198879b2da8a41 Mon Sep 17 00:00:00 2001 From: David Xia <david@davidxia.com> Date: Fri, 23 May 2025 12:41:17 -0400 Subject: [PATCH 107/192] [Doc] fix list formatting (#18624) Signed-off-by: David Xia <david@davidxia.com> --- docs/getting_started/installation/gpu/cuda.inc.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 8653f980501f..a76ef1ccf32e 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -102,10 +102,11 @@ VLLM_USE_PRECOMPILED=1 pip install --editable . ``` This command will do the following: + 1. Look for the current branch in your vLLM clone. -2. Identify the corresponding base commit in the main branch. -3. Download the pre-built wheel of the base commit. -4. Use its compiled libraries in the installation. +1. Identify the corresponding base commit in the main branch. +1. Download the pre-built wheel of the base commit. +1. Use its compiled libraries in the installation. !!! note 1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol. From 273cb3b4d9fb3d8454e8662ad42ded2f096bbf99 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 00:46:56 +0800 Subject: [PATCH 108/192] [Doc] Fix top-level API links/docs (#18621) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/benchmarks/datasets.py | 22 ++++--- vllm/config.py | 2 +- .../kv_transfer/kv_connector/v1/base.py | 9 +-- .../kv_transfer/kv_pipe/pynccl_pipe.py | 40 ++++++------ vllm/engine/llm_engine.py | 40 +++++------- vllm/entrypoints/llm.py | 63 ++++++++++--------- vllm/multimodal/__init__.py | 2 +- vllm/multimodal/registry.py | 4 +- vllm/outputs.py | 9 --- 9 files changed, 88 insertions(+), 103 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 13c37c979dac..74a9b2b03391 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -129,16 +129,17 @@ def get_random_lora_request( Args: tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no - LoRA is selected. max_loras (Optional[int]): The maximum number of - LoRAs available. If None, LoRA is not used. 
lora_path - (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA - is not used. + LoRA is selected. + max_loras (Optional[int]): The maximum number of LoRAs available. + If `None`, LoRA is not used. + lora_path (Optional[str]): Path to the LoRA parameters on disk. + If `None`, LoRA is not used. Returns: - tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first - element is a LoRARequest (or None if not applicable) and the second - element is the tokenizer associated with the LoRA request (or the - base tokenizer). + A tuple with the following elements: + - A new [LoRARequest][] (or `None` if not applicable). + - The tokenizer associated with the LoRA request + (or the base tokenizer). """ if max_loras is None or lora_path is None: return None, tokenizer @@ -167,7 +168,7 @@ def sample(self, tokenizer: PreTrainedTokenizerBase, Args: tokenizer (PreTrainedTokenizerBase): The tokenizer to be used - for processing the dataset's text. + for processing the dataset's text. num_requests (int): The number of sample requests to generate. Returns: @@ -184,7 +185,8 @@ def maybe_oversample_requests(self, requests: list[SampleRequest], Args: requests (List[SampleRequest]): The current list of sampled - requests. num_requests (int): The target number of requests. + requests. + num_requests (int): The target number of requests. """ if len(requests) < num_requests: random.seed(self.random_seed) diff --git a/vllm/config.py b/vllm/config.py index cd2eb4508de3..40dbc2824bcb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4552,7 +4552,7 @@ def contains_object_print(text): text (str): The text to check Returns: - bool: True if a match is found, False otherwise + result (bool): `True` if a match is found, `False` otherwise. """ pattern = r'at 0x[a-fA-F0-9]{2,16}>' match = re.search(pattern, text) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index ef4460a592bd..bc9258e9d07b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -210,10 +210,11 @@ def get_num_new_matched_tokens( computed tokens for this request Returns: - * the number of tokens that can be loaded from the - external KV cache beyond what is already computed. - * true if external KV cache tokens will be loaded - asynchronously (between scheduler steps). + A tuple with the following elements: + - The number of tokens that can be loaded from the + external KV cache beyond what is already computed. + - `True` if external KV cache tokens will be loaded + asynchronously (between scheduler steps). """ pass diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index fcc38d7fbd12..761c56f7e41f 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -118,11 +118,11 @@ def _make_metadata(self, tensor: Optional[torch.Tensor]) -> Metadata: """ Create the metadata as a dictionary based on the input tensor. - Parameters: - - tensor: The input tensor or None if no tensor is provided. + Args: + tensor: The input tensor or None if no tensor is provided. Returns: - - metadata: A dictionary with the following keys: + metadata: A dictionary with the following keys: - "dtype": The data type of the tensor or None. - "shape": The shape of the tensor or None. 
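The `_make_metadata` / `_prepare_recv_buffer` docstrings above describe a small header protocol: a dict with `"dtype"` and `"shape"` keys (both `None` when no tensor is sent) that the receiver uses to allocate a matching buffer. A minimal sketch of just that protocol, not the actual `PyNcclPipe` methods:

```python
import torch


def make_metadata(tensor):
    # Both fields are None when there is no payload.
    if tensor is None:
        return {"dtype": None, "shape": None}
    return {"dtype": tensor.dtype, "shape": tensor.shape}


def prepare_recv_buffer(metadata, device="cpu"):
    # The receiver allocates an empty tensor matching the advertised header.
    return torch.empty(metadata["shape"], dtype=metadata["dtype"], device=device)


md = make_metadata(torch.ones(2, 3, dtype=torch.float16))
buf = prepare_recv_buffer(md)
assert buf.shape == (2, 3) and buf.dtype == torch.float16
```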
""" @@ -135,13 +135,13 @@ def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor: """ Create a buffer to receive the tensor based on the provided metadata. - Parameters: - - metadata: A dictionary with keys "dtype" and "shape", describing - the tensor's data type and shape. + Args: + metadata: A dictionary with keys "dtype" and "shape", + describing the tensor's data type and shape. Returns: - - buffer: A tensor of the specified type and shape, allocated on - self.device. + buffer: A tensor of the specified type and shape, + allocated on `self.device`. """ return torch.empty(metadata["shape"], dtype=metadata["dtype"], @@ -151,8 +151,8 @@ def _send_metadata(self, metadata: Metadata): """ Send the metadata dictionary to the target rank. - Parameters: - - metadata: A dictionary with keys "dtype" and "shape". + Args: + metadata: A dictionary with keys "dtype" and "shape". """ self.group.send_obj(metadata, self.target_rank_for_send) @@ -161,8 +161,8 @@ def _recv_metadata(self) -> Metadata: Receive the metadata dictionary from the target rank. Returns: - - metadata: A dictionary with keys "dtype" and "shape" describing - the tensor. + metadata: A dictionary with keys "dtype" and "shape" + describing the tensor. """ return self.group.recv_obj(self.target_rank_for_recv) @@ -171,9 +171,9 @@ def _send_impl(self, tensor: Optional[torch.Tensor]) -> None: The actual implementation of sending the tensor and its metadata to the target rank. - Parameters: - - tensor: The input tensor to be sent, or None if no tensor is - being sent. + Args: + tensor: The input tensor to be sent, or `None` if no tensor is + being sent. """ metadata = self._make_metadata(tensor) self._send_metadata(metadata) @@ -187,7 +187,7 @@ def _recv_impl(self) -> Optional[torch.Tensor]: the target rank. Returns: - - buffer: The received tensor, or None if no tensor is received. + buffer: The received tensor, or `None` if no tensor is received. """ metadata = self._recv_metadata() if metadata["dtype"] is None: @@ -227,8 +227,8 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: Sends a tensor and its metadata to the destination rank in a non-blocking way. - Parameters: - - tensor: The tensor to send, or None if no tensor is being sent. + Args: + tensor: The tensor to send, or `None` if no tensor is being sent. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) @@ -250,8 +250,8 @@ def recv_tensor(self) -> Optional[torch.Tensor]: """ Receives a tensor and its metadata from the source rank. Blocking call. - Returns: - - tensor: The received tensor, or None if no tensor is received. + Args: + tensor: The received tensor, or `None` if no tensor is received. """ if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c48d8a386969..2e5361c4891b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -130,26 +130,16 @@ class LLMEngine: iteration-level scheduling and efficient memory management to maximize the serving throughput. - The {class}`~vllm.LLM` class wraps this class for offline batched inference - and the {class}`AsyncLLMEngine` class wraps this class for online serving. + The [LLM][vllm.LLM] class wraps this class for offline batched inference + and the [AsyncLLMEngine][] class wraps this class for online serving. - The config arguments are derived from {class}`~vllm.EngineArgs`. 
(See - {ref}`engine-args`) + The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See + [engine-args][]) Args: - model_config: The configuration related to the LLM model. - cache_config: The configuration related to the KV cache memory - management. - parallel_config: The configuration related to distributed execution. - scheduler_config: The configuration related to the request scheduler. - device_config: The configuration related to the device. - lora_config (Optional): The configuration related to serving multi-LoRA. - speculative_config (Optional): The configuration related to speculative - decoding. + vllm_config: The configuration for initializing and running vLLM. executor_class: The model executor class for managing distributed execution. - prompt_adapter_config (Optional): The configuration related to serving - prompt adapters. log_stats: Whether to log statistics. usage_context: Specified entry point, used for usage info collection. """ @@ -695,11 +685,12 @@ def add_request( Args: request_id: The unique ID of the request. - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` + prompt: The prompt to the LLM. See + [PromptType][vllm.inputs.PromptType] for more details about the format of each input. params: Parameters for sampling or pooling. - {class}`~vllm.SamplingParams` for text generation. - {class}`~vllm.PoolingParams` for pooling. + [SamplingParams][vllm.SamplingParams] for text generation. + [PoolingParams][vllm.PoolingParams] for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. lora_request: The LoRA request to add. @@ -711,10 +702,11 @@ def add_request( Details: - Set arrival_time to the current time if it is None. - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of {class}`~vllm.Sequence` objects. - - Create a {class}`~vllm.SequenceGroup` object - from the list of {class}`~vllm.Sequence`. - - Add the {class}`~vllm.SequenceGroup` object to the scheduler. + - Create `n` number of [Sequence][vllm.Sequence] objects. + - Create a [SequenceGroup][vllm.SequenceGroup] object + from the list of [Sequence][vllm.Sequence]. + - Add the [SequenceGroup][vllm.SequenceGroup] object to the + scheduler. Example: >>> # initialize engine @@ -861,9 +853,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: request_id: The ID(s) of the request to abort. Details: - - Refer to the - {meth}`~vllm.core.scheduler.Scheduler.abort_seq_group` - from class {class}`~vllm.core.scheduler.Scheduler`. + - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][]. Example: >>> # initialize engine and add a request with request_id diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0465302c5a1c..f818e1737975 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -116,7 +116,8 @@ class LLM: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. - disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig` + disable_custom_all_reduce: See + [ParallelConfig][vllm.config.ParallelConfig]. disable_async_output_proc: Disable async output processing. This may result in lower performance. hf_token: The token to use as HTTP bearer authorization for remote files @@ -128,12 +129,12 @@ class LLM: compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. 
If it is a dictionary, it can specify the full compilation configuration. - **kwargs: Arguments for {class}`~vllm.EngineArgs`. (See - {ref}`engine-args`) + **kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See + [engine-args][]) Note: This class is intended to be used for offline inference. For online - serving, use the {class}`~vllm.AsyncLLMEngine` class instead. + serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead. """ DEPRECATE_LEGACY: ClassVar[bool] = True @@ -142,7 +143,7 @@ class LLM: DEPRECATE_INIT_POSARGS: ClassVar[bool] = True """ A flag to toggle whether to deprecate positional arguments in - {meth}`LLM.__init__`. + [LLM.__init__][]. """ @classmethod @@ -403,7 +404,7 @@ def generate( Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. @@ -669,7 +670,7 @@ def chat( Generate responses for a chat conversation. The chat conversation is converted into a text prompt using the - tokenizer and calls the {meth}`generate` method to generate the + tokenizer and calls the [generate][] method to generate the responses. Multi-modal inputs can be passed in the same way you would pass them @@ -678,8 +679,8 @@ def chat( Args: messages: A list of conversations or a single conversation. - - Each conversation is represented as a list of messages. - - Each message is a dictionary with 'role' and 'content' keys. + - Each conversation is represented as a list of messages. + - Each message is a dictionary with 'role' and 'content' keys. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it @@ -689,27 +690,27 @@ def chat( use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. chat_template: The template to use for structuring the chat. - If not provided, the model's default chat template will be used. + If not provided, the model's default chat template will be used. chat_template_content_format: The format to render message content. - - "string" will render the content as a string. - Example: ``"Who are you?"`` - - "openai" will render the content as a list of dictionaries, - similar to OpenAI schema. - Example: ``[{"type": "text", "text": "Who are you?"}]`` + - "string" will render the content as a string. + Example: `"Who are you?"` + - "openai" will render the content as a list of dictionaries, + similar to OpenAI schema. + Example: `[{"type": "text", "text": "Who are you?"}]` add_generation_prompt: If True, adds a generation template to each message. continue_final_message: If True, continues the final message in the conversation instead of starting a new one. Cannot be - ``True`` if ``add_generation_prompt`` is also ``True``. + `True` if `add_generation_prompt` is also `True`. chat_template_kwargs: Additional kwargs to pass to the chat template. mm_processor_kwargs: Multimodal processor kwarg overrides for this chat request. Only used for offline requests. Returns: - A list of ``RequestOutput`` objects containing the generated + A list of `RequestOutput` objects containing the generated responses in the same order as the input messages. 
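A short usage sketch of the `chat()` API documented above; the model name is only an example, and any chat-tuned model served by vLLM works the same way:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # example model, offline inference

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]

# chat() applies the model's chat template, then generates like generate().
outputs = llm.chat(conversation, SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)
```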
""" list_of_messages: list[list[ChatCompletionMessageParam]] @@ -908,7 +909,7 @@ def encode( Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -997,7 +998,7 @@ def embed( Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. @@ -1007,7 +1008,7 @@ def embed( generation, if any. Returns: - A list of ``EmbeddingRequestOutput`` objects containing the + A list of `EmbeddingRequestOutput` objects containing the embedding vectors in the same order as the input prompts. """ if self.llm_engine.model_config.task != "embed": @@ -1041,7 +1042,7 @@ def classify( Args: prompts: The prompts to the LLM. You may pass a sequence of prompts - for batch inference. See {class}`~vllm.inputs.PromptType` + for batch inference. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1049,7 +1050,7 @@ def classify( generation, if any. Returns: - A list of ``ClassificationRequestOutput`` objects containing the + A list of `ClassificationRequestOutput` objects containing the embedding vectors in the same order as the input prompts. """ if self.llm_engine.model_config.task != "classify": @@ -1159,11 +1160,11 @@ def score( lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: - """Generate similarity scores for all pairs ``<text,text_pair>``. + """Generate similarity scores for all pairs `<text,text_pair>`. - The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``. - In the ``1 - N`` case the ``text_1`` sentence will be replicated ``N`` - times to pair with the ``text_2`` sentences. + The inputs can be `1 -> 1`, `1 -> N` or `N -> N`. + In the `1 - N` case the `text_1` sentence will be replicated `N` + times to pair with the `text_2` sentences. The input pairs are used to build a list of prompts for the cross encoder model. This class automatically batches the prompts, considering the memory constraint. For the best performance, put all @@ -1171,9 +1172,9 @@ def score( Args: text_1: can be a single prompt or a list of prompts, in which - case it has to have the same length as the ``text_2`` list + case it has to have the same length as the `text_2` list text_2: The texts to pair with the query to form the input - to the LLM. See {class}`~vllm.inputs.PromptType` for + to the LLM. See [PromptType][vllm.inputs.PromptType] for more details about the format of each prompts. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. @@ -1181,7 +1182,7 @@ def score( generation, if any. Returns: - A list of ``ScoringRequestOutput`` objects containing the + A list of `ScoringRequestOutput` objects containing the generated scores in the same order as the input prompts. 
""" runner_type = self.llm_engine.model_config.runner_type @@ -1282,13 +1283,13 @@ def sleep(self, level: int = 1): def wake_up(self, tags: Optional[list[str]] = None): """ - Wake up the engine from sleep mode. See the {meth}`sleep` method + Wake up the engine from sleep mode. See the [sleep][] method for more details. Args: tags: An optional list of tags to reallocate the engine memory for specific memory allocations. Values must be in - ("weights", "kv_cache",). If None, all memory is reallocated. + `("weights", "kv_cache")`. If None, all memory is reallocated. wake_up should be called with all tags (or None) before the engine is used again. """ diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 70568a195fd8..22fee2f74712 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -12,7 +12,7 @@ dispatch data processing according to the target model. Info: - {ref}`mm-processing` + [mm-processing][] """ __all__ = [ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 8a27d866e88e..0d0d4a4363f4 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -215,7 +215,7 @@ def register_processor( invoked to transform the data into a dictionary of model inputs. Info: - {ref}`mm-processing` + [mm-processing][] """ def wrapper(model_cls: N) -> N: @@ -260,7 +260,7 @@ def create_processor( Create a multi-modal processor for a specific model and tokenizer. Info: - {ref}`mm-processing` + [mm-processing][] """ if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") diff --git a/vllm/outputs.py b/vllm/outputs.py index 05026b569691..33cc50c872b6 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -391,15 +391,6 @@ def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": prompt_token_ids, finished) def __repr__(self): - """ - Returns a string representation of an PoolingRequestOutput instance. - - The representation includes the request_id and the number of outputs, - providing a quick overview of the pooling request's results. - - Returns: - str: A string representation of the PoolingRequestOutput instance. 
- """ return (f"{type(self).__name__}(request_id={self.request_id!r}, " f"outputs={self.outputs!r}, " f"prompt_token_ids={self.prompt_token_ids}, " From 15b45ffb9accfbc160217d51232669ab6c3461be Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 00:58:02 +0800 Subject: [PATCH 109/192] [Doc] Avoid documenting dynamic / internal modules (#18626) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- mkdocs.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mkdocs.yaml b/mkdocs.yaml index b33998bb4084..52de643f5e2b 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -60,6 +60,10 @@ plugins: - api-autonav: modules: ["vllm"] api_root_uri: "api" + exclude: + - "re:vllm\\._.*" # Internal modules + - "vllm.third_party" + - "vllm.vllm_flash_attn" - mkdocstrings: handlers: python: From 371f7e4ca2a44fbd4a63cd641efb279274a717f4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 01:22:40 +0800 Subject: [PATCH 110/192] [Doc] Fix broken links and unlinked docs, add shortcuts to home sidebar (#18627) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/.nav.yml | 11 ++++++++--- docs/contributing/model/tests.md | 4 ++-- docs/features/spec_decode.md | 2 +- docs/models/supported_models.md | 6 +++--- docs/serving/openai_compatible_server.md | 2 +- docs/{ => serving}/seed_parameter_behavior.md | 2 +- 6 files changed, 16 insertions(+), 11 deletions(-) rename docs/{ => serving}/seed_parameter_behavior.md (98%) diff --git a/docs/.nav.yml b/docs/.nav.yml index c410b6b8223b..e2b0ed560700 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -9,8 +9,13 @@ nav: - getting_started/examples/offline_inference - getting_started/examples/online_serving - getting_started/examples/other - - Roadmap: https://roadmap.vllm.ai - - Releases: https://github.com/vllm-project/vllm/releases + - Quick Links: + - User Guide: serving/offline_inference.md + - Developer Guide: contributing/overview.md + - API Reference: api/README.md + - Timeline: + - Roadmap: https://roadmap.vllm.ai + - Releases: https://github.com/vllm-project/vllm/releases - User Guide: - Inference and Serving: - serving/offline_inference.md @@ -38,7 +43,7 @@ nav: - contributing/overview.md - glob: contributing/* flatten_single_child_sections: true - - contributing/model + - Model Implementation: contributing/model - Design Documents: - V0: design - V1: design/v1 diff --git a/docs/contributing/model/tests.md b/docs/contributing/model/tests.md index 26880986181d..67f8eda61dc5 100644 --- a/docs/contributing/model/tests.md +++ b/docs/contributing/model/tests.md @@ -33,14 +33,14 @@ These tests compare the model outputs of vLLM against [HF Transformers](https:// #### Generative models -For [generative models][generative-models], there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>: +For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>: - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF. - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa. #### Pooling models -For [pooling models][pooling-models], we simply check the cosine similarity, as defined in <gh-file:tests/models/embedding/utils.py>. 
+For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>. [](){ #mm-processing-tests } diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index dce87c27896c..ee871823b078 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -170,7 +170,7 @@ A variety of speculative models of this type are available on HF hub: ## Speculating using EAGLE based draft models The following code configures vLLM to use speculative decoding where proposals are generated by -an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](<gh-file:examples/offline_inference/eagle.py>). +an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). ```python from vllm import LLM, SamplingParams diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 416fe42fcb79..5a402ee88c61 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -3,7 +3,7 @@ title: Supported Models --- [](){ #supported-models } -vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks. +vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks. If a model supports more than one task, you can set the task via the `--task` argument. For each task, we list the model architectures that have been implemented in vLLM. @@ -376,7 +376,7 @@ Specified using `--task generate`. ### Pooling Models -See [this page](pooling-models) for more information on how to use pooling models. +See [this page](./pooling_models.md) for more information on how to use pooling models. !!! warning Since some model architectures support both generative and pooling tasks, @@ -628,7 +628,7 @@ Specified using `--task generate`. ### Pooling Models -See [this page](pooling-models) for more information on how to use pooling models. +See [this page](./pooling_models.md) for more information on how to use pooling models. !!! warning Since some model architectures support both generative and pooling tasks, diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 27cb9310c516..012bddf3d9c9 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -5,7 +5,7 @@ title: OpenAI-Compatible Server vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client. -In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.) +In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.) 
```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 diff --git a/docs/seed_parameter_behavior.md b/docs/serving/seed_parameter_behavior.md similarity index 98% rename from docs/seed_parameter_behavior.md rename to docs/serving/seed_parameter_behavior.md index ff17525cf8e2..301847292b83 100644 --- a/docs/seed_parameter_behavior.md +++ b/docs/serving/seed_parameter_behavior.md @@ -1,4 +1,4 @@ -# Seed Parameter Behavior in vLLM +# Seed Parameter Behavior ## Overview From 2628a69e3529c0bfff6f6bcb0e5456ccef872fb2 Mon Sep 17 00:00:00 2001 From: Jiayi Yao <82156730+YaoJiayi@users.noreply.github.com> Date: Fri, 23 May 2025 12:26:28 -0500 Subject: [PATCH 111/192] [V1] Support Deepseek MTP (#18435) Signed-off-by: Rui Qiao <ruisearch42@gmail.com> Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn> Co-authored-by: Rui Qiao <ruisearch42@gmail.com> --- vllm/config.py | 13 ++- vllm/engine/arg_utils.py | 2 +- vllm/model_executor/models/deepseek_mtp.py | 3 +- vllm/v1/spec_decode/eagle.py | 122 +++++++++++---------- vllm/v1/spec_decode/utils.py | 27 +++++ vllm/v1/worker/gpu_model_runner.py | 19 +++- 6 files changed, 120 insertions(+), 66 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 40dbc2824bcb..5cd08db43712 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2255,7 +2255,7 @@ def __post_init__(self): SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator", - "draft_model"] + "draft_model", "deepseek_mtp"] SpeculativeAcceptanceMethod = Literal["rejection_sampler", "typical_acceptance_sampler"] @@ -2519,6 +2519,15 @@ def __post_init__(self): elif (self.draft_model_config.hf_config.model_type == "mlp_speculator"): self.method = "mlp_speculator" + elif (self.draft_model_config.hf_config.model_type == + "deepseek_mtp"): + self.method = "deepseek_mtp" + if self.num_speculative_tokens > 1: + logger.warning( + "All Deepseek MTP models only have " \ + "one layer. Might need some code changes " \ + "to support multiple layers." 
+ ) else: self.method = "draft_model" @@ -2738,7 +2747,7 @@ def num_lookahead_slots(self) -> int: return self.num_speculative_tokens def use_eagle(self) -> bool: - return self.method in ("eagle", "eagle3") + return self.method in ("eagle", "eagle3", "deepseek_mtp") def __repr__(self) -> str: method = self.method diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 12c306e98048..b561a1a77487 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1338,7 +1338,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: is_ngram_enabled = True elif speculative_method == "medusa": is_medusa_enabled = True - elif speculative_method in ("eagle", "eagle3"): + elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"): is_eagle_enabled = True else: speculative_model = self.speculative_config.get("model") diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 6d7b52aba5f9..03ef7bed0edc 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -19,6 +19,7 @@ from .deepseek_v2 import (DeepseekV2DecoderLayer, get_spec_layer_idx_from_weight_name) +from .interfaces import SupportsPP from .utils import maybe_prefix @@ -145,7 +146,7 @@ def compute_logits( return logits -class DeepSeekMTP(nn.Module): +class DeepSeekMTP(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 460d645a1a6c..3926a86ee591 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -10,9 +10,10 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM -from vllm.triton_utils import tl, triton -from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata, + FlashAttentionMetadata) from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.spec_decode.utils import prepare_eagle_input_kernel logger = init_logger(__name__) @@ -25,12 +26,15 @@ def __init__( self, vllm_config: VllmConfig, device: torch.device, + runner=None, ): self.vllm_config = vllm_config self.speculative_config = vllm_config.speculative_config self.draft_model_config = self.speculative_config.draft_model_config self.method = self.speculative_config.method + self.runner = runner + self.dtype = vllm_config.model_config.dtype self.max_model_len = vllm_config.model_config.max_model_len self.block_size = vllm_config.cache_config.block_size @@ -106,24 +110,46 @@ def propose( # FA requires seq_len to have dtype int32. seq_lens = (target_positions[last_token_indices] + 1).int() - # FIXME(woosuk): The below two ops cause synchronization. Optimize. - max_seq_len = seq_lens.max().item() - max_num_tokens = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item() - attn_metadata = FlashAttentionMetadata( - num_actual_tokens=num_tokens, - max_query_len=max_num_tokens, - query_start_loc=cu_num_tokens, - max_seq_len=max_seq_len, - seq_lens=seq_lens, - block_table=block_table, - slot_mapping=target_slot_mapping, - # TODO(woosuk): Support cascade attention. - use_cascade=False, - common_prefix_len=0, - cu_prefix_query_lens=None, - prefix_kv_lens=None, - suffix_kv_lens=None, - ) + if self.method in ["eagle", "eagle3"]: + # FIXME(woosuk): The below two ops cause synchronization. 
Optimize. + max_seq_len = seq_lens.max().item() + max_num_tokens = (cu_num_tokens[1:] - + cu_num_tokens[:-1]).max().item() + attn_metadata = FlashAttentionMetadata( + num_actual_tokens=num_tokens, + max_query_len=max_num_tokens, + query_start_loc=cu_num_tokens, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table=block_table, + slot_mapping=target_slot_mapping, + # TODO(woosuk): Support cascade attention. + use_cascade=False, + common_prefix_len=0, + cu_prefix_query_lens=None, + prefix_kv_lens=None, + suffix_kv_lens=None, + ) + elif self.method == "deepseek_mtp": + query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1] + max_query_len = query_lens.max().item() + + common_attn_metadata = CommonAttentionMetadata( + query_start_loc=cu_num_tokens, seq_lens=seq_lens) + + assert self.runner is not None + + # FIXME: need to consider multiple kv_cache_groups + attn_metadata = self.runner.attn_metadata_builder.build( + num_reqs=batch_size, + num_actual_tokens=num_tokens, + max_query_len=max_query_len, + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + else: + raise ValueError(f"Unsupported method: {self.method}") + if self.use_cuda_graph and \ num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) @@ -136,11 +162,15 @@ def propose( with set_forward_context(attn_metadata, self.vllm_config, num_tokens=num_input_tokens): - last_hidden_states, hidden_states = self.model( - input_ids=self.input_ids[:num_input_tokens], - positions=self.positions[:num_input_tokens], - hidden_states=self.hidden_states[:num_input_tokens], + ret_hidden_states = self.model( + self.input_ids[:num_input_tokens], + self.positions[:num_input_tokens], + self.hidden_states[:num_input_tokens], ) + if self.method == "deepseek_mtp": + last_hidden_states = ret_hidden_states + else: + last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) draft_token_ids = logits.argmax(dim=-1) @@ -150,6 +180,10 @@ def propose( # [batch_size, 1] return draft_token_ids.view(-1, 1) + # TODO: Currently, MTP module released by deepseek only has + # one layer. Adapt this code to support multiple layers once + # there's a multi-layer MTP module. + # Generate the remaining draft tokens. 
draft_token_ids_list = [draft_token_ids] @@ -215,9 +249,9 @@ def propose( self.vllm_config, num_tokens=input_batch_size): last_hidden_states, hidden_states = self.model( - input_ids=self.input_ids[:input_batch_size], - positions=self.positions[:input_batch_size], - hidden_states=self.hidden_states[:input_batch_size], + self.input_ids[:input_batch_size], + self.positions[:input_batch_size], + self.hidden_states[:input_batch_size], ) hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], @@ -268,7 +302,7 @@ def prepare_inputs( batch_size = num_rejected_tokens.shape[0] BLOCK_SIZE = 1024 - prepare_input_kernel[(batch_size, )]( + prepare_eagle_input_kernel[(batch_size, )]( token_indices, cu_target_query_lens, cu_num_tokens, @@ -320,9 +354,9 @@ def dummy_run( with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): self.model( - input_ids=self.input_ids[:num_tokens], - positions=self.positions[:num_tokens], - hidden_states=self.hidden_states[:num_tokens], + self.input_ids[:num_tokens], + self.positions[:num_tokens], + self.hidden_states[:num_tokens], ) @@ -367,29 +401,3 @@ def compute_probs_and_sample_next_token( next_token_ids, ) return next_token_ids, probs - - -@triton.jit -def prepare_input_kernel( - out_ptr, - cu_query_lens_ptr, - cu_num_tokens_ptr, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(0) - - # [start_pos, end_pos) - start_pos = tl.load(cu_num_tokens_ptr + pid) - end_pos = tl.load(cu_num_tokens_ptr + pid + 1) - num_tokens = end_pos - start_pos - - index_start = tl.load(cu_query_lens_ptr + pid) - - num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE) - for i in tl.range(num_blocks): - offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) - tl.store( - out_ptr + start_pos + offset, - index_start + offset, - mask=offset < num_tokens, - ) diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py index ce81a40ee3ae..334258e7f87a 100644 --- a/vllm/v1/spec_decode/utils.py +++ b/vllm/v1/spec_decode/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from vllm.triton_utils import tl, triton from vllm.v1.worker.gpu_input_batch import InputBatch @@ -16,3 +17,29 @@ def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool: return False return True + + +@triton.jit +def prepare_eagle_input_kernel( + out_ptr, + cu_query_lens_ptr, + cu_num_tokens_ptr, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(0) + + # [start_pos, end_pos) + start_pos = tl.load(cu_num_tokens_ptr + pid) + end_pos = tl.load(cu_num_tokens_ptr + pid + 1) + num_tokens = end_pos - start_pos + + index_start = tl.load(cu_query_lens_ptr + pid) + + num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE) + for i in tl.range(num_blocks): + offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + tl.store( + out_ptr + start_pos + offset, + index_start + offset, + mask=offset < num_tokens, + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c7072d0ca330..42847e2f8c36 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -151,12 +151,16 @@ def __init__( self.use_aux_hidden_state_outputs = False if self.speculative_config: self.use_spec_decode = True + + # NOTE(Jiayi): currently we put the entire draft model on + # the last PP rank. This is not ideal if there are many + # layers in the draft model. 
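The Triton `prepare_eagle_input_kernel` relocated into `vllm/v1/spec_decode/utils.py` above writes, for each request, the indices of the target tokens that survive rejection. Below is a plain-NumPy reference of the same computation (the example sizes are made up), which can serve as a mental model or a test oracle:

```python
import numpy as np


def prepare_eagle_input_reference(cu_target_query_lens, cu_num_tokens):
    """NumPy equivalent of what the Triton kernel stores into its output."""
    out = np.empty(int(cu_num_tokens[-1]), dtype=np.int64)
    for req in range(len(cu_num_tokens) - 1):
        start, end = cu_num_tokens[req], cu_num_tokens[req + 1]
        # Keep the first (end - start) target-token indices of this request,
        # i.e. drop the rejected tokens at the tail.
        out[start:end] = cu_target_query_lens[req] + np.arange(end - start)
    return out


# Two requests with 4 and 3 target tokens; 1 and 2 tokens rejected.
cu_target_query_lens = np.array([0, 4, 7])
cu_num_tokens = np.array([0, 3, 4])
print(prepare_eagle_input_reference(cu_target_query_lens, cu_num_tokens))
# -> [0 1 2 4]
```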
if get_pp_group().is_last_rank: if self.speculative_config.method == "ngram": self.drafter = NgramProposer(self.vllm_config) elif self.speculative_config.use_eagle(): - self.drafter = EagleProposer(self.vllm_config, - self.device) # type: ignore + self.drafter = EagleProposer(self.vllm_config, self.device, + self) # type: ignore if self.speculative_config.method == "eagle3": self.use_aux_hidden_state_outputs = True elif self.speculative_config.method == "medusa": @@ -1361,6 +1365,12 @@ def execute_model( device=self.device) eagle_attn_metadata = attn_metadata[self.drafter.attn_layer_name] + # NOTE: deepseek_mtp uses MLA which does not have `block_table` + if hasattr(eagle_attn_metadata, "block_table"): + block_table = eagle_attn_metadata.block_table + else: + block_table = None + if spec_decode_metadata is None: # input_ids can be None for multimodal models. target_token_ids = self.input_ids[:num_scheduled_tokens] @@ -1406,7 +1416,7 @@ def execute_model( target_slot_mapping=target_slot_mapping, next_token_ids=next_token_ids, cu_num_tokens=cu_num_tokens, - block_table=eagle_attn_metadata.block_table, + block_table=block_table, sampling_metadata=sampling_metadata, ) spec_token_ids = draft_token_ids.tolist() @@ -1723,8 +1733,7 @@ def _dummy_run( else: hidden_states = outputs - if self.use_spec_decode and \ - self.speculative_config.method in ('eagle', 'eagle3'): + if self.use_spec_decode and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) self.drafter.dummy_run(num_tokens) From 1645b601960a62e466b9fc6e98f50d465989c52e Mon Sep 17 00:00:00 2001 From: Huy Do <huydhn@gmail.com> Date: Fri, 23 May 2025 14:17:16 -0700 Subject: [PATCH 112/192] Use prebuilt FlashInfer x86_64 PyTorch 2.7 CUDA 12.8 wheel for CI (#18537) Signed-off-by: Huy Do <huydhn@gmail.com> --- docker/Dockerfile | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f28a1618298f..9b232d1fe24b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -257,18 +257,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - # uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ - # TESTING: install FlashInfer from source to test 2.7.0 final RC + # FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. 
This is enough for CI use if [[ "$CUDA_VERSION" == 12.8* ]]; then \ - export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \ + uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \ else \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \ - fi; \ - CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ - if [ "$CUDA_MAJOR" -lt 12 ]; then \ - export FLASHINFER_ENABLE_SM90=0; \ - fi; \ - uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + if [ "$CUDA_MAJOR" -lt 12 ]; then \ + export FLASHINFER_ENABLE_SM90=0; \ + fi; \ + uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \ + fi \ fi COPY examples examples COPY benchmarks benchmarks From 0ddf88e16e6ef4d985716f5bdec60fd053a260fa Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Fri, 23 May 2025 18:09:44 -0400 Subject: [PATCH 113/192] [CI] Enable test_initialization to run on V1 (#16736) Signed-off-by: mgoin <mgoin64@gmail.com> --- .buildkite/test-pipeline.yaml | 5 +---- tests/models/registry.py | 32 +++++++++++++++++++++-------- tests/models/test_initialization.py | 17 +++++++++++---- vllm/model_executor/models/grok1.py | 30 +++++++-------------------- vllm/utils.py | 15 ++++++++------ 5 files changed, 54 insertions(+), 45 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 181fbda57b3f..1f54b70f05dd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -472,10 +472,7 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s models/test_utils.py - pytest -v -s models/test_vision.py - # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' + - pytest -v -s models/test_initialization.py - label: Language Models Test (Standard) mirror_hardwares: [amdexperimental] diff --git a/tests/models/registry.py b/tests/models/registry.py index 911a58e99d4c..22d532aa71e0 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -55,9 +55,18 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" + v0_only: bool = False + """The model is only available with the vLLM V0 engine.""" + hf_overrides: dict[str, Any] = field(default_factory=dict) """The ``hf_overrides`` required to load the model.""" + max_model_len: Optional[int] = None + """ + The maximum model length to use for this model. Some models default to a + length that is too large to fit into memory in CI. 
+ """ + def check_transformers_version( self, *, @@ -215,10 +224,11 @@ def check_available_online( "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", trust_remote_code=True), "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), - "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), + "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True), "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct", - trust_remote_code=True), + trust_remote_code=True, + v0_only=True), "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True), "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b", @@ -234,7 +244,8 @@ def check_available_online( is_available_online=False), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 is_available_online=False), - "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), + "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t", + v0_only=True), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", @@ -303,7 +314,8 @@ def check_available_online( "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501 "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501 - extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501 + extras={"6b": "Salesforce/blip2-opt-6.7b"}, # noqa: E501 + v0_only=True), "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 extras={"fork": "Isotr0py/deepseek-vl2-tiny"}, # noqa: E501 @@ -328,9 +340,11 @@ def check_available_online( {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501 - trust_remote_code=True), + trust_remote_code=True, + v0_only=True), "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 - min_transformers_version="4.51"), + min_transformers_version="4.51", + max_model_len=10240), "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501 @@ -349,7 +363,8 @@ def check_available_online( extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 trust_remote_code=True), "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 - trust_remote_code=True), + trust_remote_code=True, + v0_only=True), "Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503", # noqa: E501 extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}), # noqa: E501 "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", @@ -372,7 +387,8 @@ def check_available_online( "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True), "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # 
noqa: E501 - tokenizer_mode="mistral"), + tokenizer_mode="mistral", + v0_only=True), "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL", extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501 trust_remote_code=True, diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 446c4efbf6af..d403cb392fe0 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -15,12 +15,12 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) -def test_can_initialize(model_arch): +def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - # Avoid OOM + # Avoid OOM and reduce initialization time by only using 1 layer def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update(model_info.hf_overrides) @@ -34,6 +34,12 @@ def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: "num_local_experts": 2, }) + if hasattr(hf_config, "vision_config"): + hf_config.vision_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + return hf_config # Avoid calling model.forward() @@ -46,7 +52,7 @@ def _initialize_kv_caches_v1(self, vllm_config): scheduler_kv_cache_config = get_kv_cache_config( vllm_config, kv_cache_specs[0], - 20 * GiB_bytes, + 10 * GiB_bytes, ) # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config @@ -55,7 +61,9 @@ def _initialize_kv_caches_v1(self, vllm_config): with (patch.object(V0LLMEngine, "_initialize_kv_caches", _initialize_kv_caches_v0), patch.object(V1EngineCore, "_initialize_kv_caches", - _initialize_kv_caches_v1)): + _initialize_kv_caches_v1), monkeypatch.context() as m): + if model_info.v0_only: + m.setenv("VLLM_USE_V1", "0") LLM( model_info.default, tokenizer=model_info.tokenizer, @@ -65,6 +73,7 @@ def _initialize_kv_caches_v1(self, vllm_config): "num_speculative_tokens": 1, } if model_info.speculative_model else None, trust_remote_code=model_info.trust_remote_code, + max_model_len=model_info.max_model_len, load_format="dummy", hf_overrides=hf_overrides, ) diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 578d31a851a9..bc9e9a3c0206 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -28,7 +28,7 @@ import torch.nn.functional as F from torch import nn -from vllm.attention import Attention, AttentionMetadata +from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -182,25 +182,20 @@ def __init__( quant_config=quant_config, logits_soft_cap=attn_logits_soft_cap, prefix=f"{prefix}.attn") + self.attn_multiplier = getattr(self.config, "attn_output_multiplier", + 1.0) if self.config else 1.0 def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) - - # Apply attention output multiplier if specified in config - attn_multiplier = getattr(self.config, 
"attn_output_multiplier", - None) if self.config else None - if attn_multiplier is not None: - output = output * attn_multiplier + output *= self.attn_multiplier return output @@ -261,8 +256,6 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention @@ -276,8 +269,6 @@ def forward( hidden_states = self.attn( positions=positions, hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, ) # Post attention normalization @@ -341,8 +332,6 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[torch.Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: @@ -359,9 +348,7 @@ def forward( for i in range(self.start_layer, self.end_layer): layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, residual) + hidden_states, residual = layer(positions, hidden_states, residual) if not get_pp_group().is_last_rank: return IntermediateTensors({ @@ -529,13 +516,10 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: list[torch.Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, + hidden_states = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) return hidden_states diff --git a/vllm/utils.py b/vllm/utils.py index fcc0ab3b237a..25694c121581 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2794,14 +2794,17 @@ def wrapper(*args, **kwargs): # Only relevant for models using ALiBi (e.g, MPT) def check_use_alibi(model_config: ModelConfig) -> bool: - return (getattr(model_config.hf_text_config, "alibi", False) # Falcon + cfg = model_config.hf_text_config + return (getattr(cfg, "alibi", False) # Falcon or ("BloomForCausalLM" in getattr(model_config.hf_config, "architectures", [])) # Bloom - or getattr(model_config.hf_text_config, "position_encoding_type", - "") == "alibi" # codellm_1b_alibi - or - (hasattr(model_config.hf_text_config, "attn_config") # MPT - and model_config.hf_text_config.attn_config.get("alibi", False))) + or getattr(cfg, "position_encoding_type", "") == + "alibi" # codellm_1b_alibi + or (hasattr(cfg, "attn_config") # MPT + and ((isinstance(cfg.attn_config, dict) + and cfg.attn_config.get("alibi", False)) or + (not isinstance(cfg.attn_config, dict) + and getattr(cfg.attn_config, "alibi", False))))) def sha256(input) -> int: From 7d9216495c389410c2901084336239bc626611d5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 06:49:21 +0800 Subject: [PATCH 114/192] [Doc] Update references to doc files (#18637) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .github/mergify.yml | 6 ++---- docker/Dockerfile | 4 ++-- docs/contributing/overview.md | 5 ++--- .../dockerfile-stages-dependency.png | Bin 121821 -> 0 bytes tools/update-dockerfile-graph.sh | 2 +- vllm/config.py | 4 ++-- vllm/engine/arg_utils.py | 2 +- vllm/engine/output_processor/multi_step.py | 2 +- vllm/platforms/cpu.py | 2 +- 
vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/utils.py | 2 +- vllm/worker/multi_step_model_runner.py | 2 +- vllm/worker/utils.py | 2 +- 13 files changed, 16 insertions(+), 19 deletions(-) delete mode 100644 docs/source/assets/contributing/dockerfile-stages-dependency.png diff --git a/.github/mergify.yml b/.github/mergify.yml index ccfd571625b5..e595060c325a 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -58,7 +58,7 @@ pull_request_rules: - files~=^benchmarks/structured_schemas/ - files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/run_structured_output_benchmark.sh - - files=docs/source/features/structured_outputs.md + - files=docs/features/structured_outputs.md - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -135,9 +135,7 @@ pull_request_rules: - files~=^tests/entrypoints/openai/tool_parsers/ - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files~=^vllm/entrypoints/openai/tool_parsers/ - - files=docs/source/features/tool_calling.md - - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md - - files=docs/source/getting_started/examples/chat_with_tools.md + - files=docs/features/tool_calling.md - files~=^examples/tool_chat_* - files=examples/offline_inference/chat_with_tools.py - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 9b232d1fe24b..24986a1b73b1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ # to run the OpenAI compatible server. # Please update any changes made here to -# docs/source/contributing/dockerfile/dockerfile.md and -# docs/source/assets/contributing/dockerfile-stages-dependency.png +# docs/contributing/dockerfile/dockerfile.md and +# docs/assets/contributing/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.8.1 #################### BASE BUILD IMAGE #################### diff --git a/docs/contributing/overview.md b/docs/contributing/overview.md index 48f0bab5e9b3..2517436afcc1 100644 --- a/docs/contributing/overview.md +++ b/docs/contributing/overview.md @@ -130,9 +130,8 @@ The PR needs to meet the following code quality standards: understand the code. - Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests. -- Please add documentation to `docs/source/` if the PR modifies the - user-facing behaviors of vLLM. It helps vLLM users understand and utilize the - new features or changes. +- Please add documentation to `docs/` if the PR modifies the user-facing behaviors of vLLM. + It helps vLLM users understand and utilize the new features or changes. 
### Adding or Changing Kernels diff --git a/docs/source/assets/contributing/dockerfile-stages-dependency.png b/docs/source/assets/contributing/dockerfile-stages-dependency.png deleted file mode 100644 index 0838bfa37fe62d60fba9adcbd18c81de0809f253..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 [literal 121821: binary PNG data omitted]
z&Ltk#QlfFqp9nv#^)aad0RhNKw_XnVhgnW@&4^=3`pcIuKi<jnHaVI1@0;|=DO&n; z>$YtR-O}RgYrMr5Z3}%l!hfyW{XO4nS53Cd5)1X{*cj4jFshySCK5L#6(vp{*ZJaZ z)mM$%LlO~DgMvw`KQc0cDthbu`SbKtbSH*$T$biqqPCE0e9}|12IJJzLF69{JCL8u zLbp4LgG6@m8S)}-@xpJeO;?UnGp<kWiA!xhf(_u{5S^&Jgv}$qyi&-4a>-X4ElNby zKi*pBcD27Y$z%8Jvz6u~Hx_D>^q#i})HRaFPz6$qYR)i``$?3=a*AwJPc?D+6QJkD za#ltrW9HAR@F%N=#nn;_C)$~p1gYJS0Alz$+zRn^y@8ZUPm?#^Q{n09d6#8}(CzCD zAB~B3xu7JxL>{nEJCD6;`N!Ay?{}&H{jL{m^0W*{e!6hkmBZWO)KY}!`aj%dnf~p` zrR6-+xxAoOGRs?c_18;2EdB<!P~?m>vy&xmDa)|PpS$h1yvyvR<a5L&xu#S4fW)d} zM%A%qn+7@8DYR_db0SSCMp=^n^K5U7h|S<*VOrvM6+DtniTlte-7E=a;=E6@YrFAg zSHlDQ_Xpnod03MER`X{eHa5292E$aIwHm)1y27?EE)`{_CCf=!S-J8GDOzQsYeiea zQQ{-7x<pWN)F~BtBm0VfwdB*FASMO|2CLn!y-ZHyWo>)8xVY-~6Td7GwEU4-F}+E3 z)7zs;vgVqG_)QuGOUYm5x)4NJ--#yN?pDrpt3Kiao6ep++g%<ZLr;Ygtb+@{%T@Tz z-|s~WGb@W$iqU^T>ere!GDJJXrg{kz-I1iALb>Irp+}ycd8h5|?Xhgu33g+D+QOou z4lQ3-QyJw}vtsI{-k*DMu<W3YHcINjL8JQ{cW$<dXZ+OvzA$Ug;xB=gMo$?pCXvZL zzgg7GD-F*co<Z-;`{v7`9wtjJT9r(cqS2dy+G4f|D?!|Xr}t8bd=vYTBmebVEZc>x zdN0ppKM@&Bi)wdNI(_<^l8UBJaBy>m&CvY(e8T5hME1QOEw=4mPM^yz$%}5&GAs@h znzKDj+#YG~Yw|LX=;vojT!PUd|NMGoVPWAi9?pE466&tAlz1Ys9OB|iQ1kQgIqgr* z9anofa-fk|1g9D;q`mhvy*c+HHZ~SZ&r1mbQl<98W&HI(OG@g?KZntG0ybOGlfSxG zgpzyYA4n3il>Yu-Kb4`<Oyzqsx5TI<pod1jd4mqzgIih{3t7^*a%HDga%QUiL~qE_ ze3vwn)n(7^9E<`cj=(NK*7XJij6gjIf3k-7*qr7H@-~seE?>D~*WxsF<HikH|6@Ae z9&JPGIc;FjQ=ekAG#)7)gyKV>m5vwkS);xYGJg3^zk+O<XMB>R_t<sx>4rGZ_Ne63 ziq8)xc%zvPw3k>dElf>ym$eUO3~8Y^qGF)qB;mTykhIKlSKnPjqxRwEFEK4<;zC@b zD20-wn2ti|JozDn$z^^d0niUUFv)3d^7H4<yG+tp@uGGk>hjLx9et}C?FaAgGMVm? zcTu}|aWj=9@<|(6Khak+RGYL<Tzm;JV2jmyBEC2c<P)V!4yQx`N&vfl|Gcy~m)dOK zmuBAnr#Og!d#ShTj`Q)Q=#>N$m9=GYGBwdP0~fu4Gc7bMEUZ^E(_tzeI~p6?`2I$r z38!<DRn0wJ>|xz6RcB{sL2H1j%Aa%-b1h4Y05cnF^n$FazuenUBG~?Td9w!UCw$dS zt^l%OO-w$I<g0yRB9?%}O({F1Q*RR!6OC%(=|sm%YqS~pZ`I=ZzD$Ij_OxyZjT8TP ztM1~qj2nf%*$V@vTefV;KK{bTXCh}6o#XsS0k=H47)x+a(w`J&xiC@V;n3*ZG(FPZ z{`>criGAKDHs>ignD$n_BepngkBIdx<Xa5WTrQn+DeA+HKjc*kt^T_s!UM?_Sv?U& zMMday_xq4S#Qq<tY<|UWtX5?&Ag+c$9&XFieNIi>hO64Y?n5)f8f~WTS5BEUvu27` zvc6&yTRj_~GeCKpu$WkuN`h9#&f=dxADufyyp?1aK&e*r0r<8iI?KVvPliE$ekuj# zlT~v=+1hGqjPZ2B-J$3SO&e29Tf`2&CciK02BjfLTz|=H@P6(N@VHW(fJvh&@kSKo z=<1%Ho($&2>Jai$yUw1Y<mrP8j`OLGrvhHSR7+`oe?x|;piTvqE=~<u)$c6G%KG{V zr6nt{Eyv8P5rC*tAGYp{|C@tZfr)S4?EgY8#iYMwisiUAXN__USdL*6HZ3!w5X5G3 z(90njo2Jx<FlwSRZTJ}X-yi+`R&Aor)GC7HR0*@Q_pvABfBU8sXEWG1Bg-&z`(CP? 
zRZsZ=yfg^T(X=p`+A{lrOfvp50n)RX?r?AG?A5nSOiZfmb6i(w+wmcsqg^vlo=wxg zyDW}{IDJoB%gAR~iE3!3si>IbI5XO`F?((>gL@s(oJbNJ%Qa!|B}GMUe7mXrL^ePe z+C|<3;H{R@t+>t1%puR7ZHljny)q_Sb0zU=Zq}N(Bu~!tD8mInC@X;%AM{G%=g72* zR4GO&-z?{eh!)7SL}gG>R=(fY7oQPdAAo>7u5ncBbDH6Fp_a?S1fAIQ)ezn4%F_&* zl~`soQM=I&)B0yN`Hy!J&8p}k88;|+{!QZj`}ZRp8qEusv|S7)`|IRSjGRi_bL?u4 zKeujl=9iwxkUg9L)gOSed)O09Y7^UjvL~GA4!vv0HJS>D@=DPBT3Rzw?0}!fMP=nl zpMa_u6>Y03B@hg5ndK#+bXn3~9<$aQo?^jvhV1EY0eZnRvwaB++qU`q^Ur52pg^Np z9==XXOKT5RF004xQP2`rv8-7-9+%;}%*hB>4Wc}GpBVkl7UK9GDUM1d|B62jVC3-O z!{T#wCD*3*t<D9Fo2m#|<kcv_EnpLQZWbXjPNEunFgptZ6E<f$m;$~{Hkf+fI6_VU zL(k>J*#OA$a87%ytAtJ-;RE<!{Pn>me1c$``<CO!j}uIGvVW%aF5UgA27vxaG~oWy z*Leb({Gy@@TrM**pnrE+YQgd*YP4IDbPIs*Q;ZuX!#p{AYm?f+vYx+q5%WjHvYY;9 zRQ>D8;XFFJl1BT!I3?Y4=gzhLAn2DQy@qw&jkdPZ*C%6c-lt_^4E{Wr-k0^f7brRC zn2wa8p&>n$jPFsQu_7Mz6hoiq&&xkqbd9tZAZ2a;6a|37Dvd47P0?{#{o<?X?;IoV z0*mtUmLEa|!Ho^z0-1SkUPv`*!j0joM6CO5LCluW4_=D9I3rPQT<)P}#cO;T{n%gv zvaSu4;Nr!L9<)qyLY@CGXxX-<)My!@{M~YXnlSxC-g$HgX!_49Q?b$S`yfdK2L?92 zKIzd8(4JJV#{oPVf15r_3Bkmn-$2jd#pDgh;ISX?jtp@YrgpPwaY{w+2f&>^<E%|Y zYtEgVW-@;y|0+r05~W)G*G~!}__(~&8X6jZacKB5AoBx}=fsYXp<6?`l~J00`I~e9 z{aZEI8#6@>jokz~0xnw5pp>0Pe243T_c9371X@dricVQkDc_dxZhG35f;V!UFHivT zY&@EL+i%cxZ~k>TCYO<=*>7o4aHnmVvx!oZ<6Kw(l?yb%=E{{Te}QPm2xzc}Lkpn5 zGetdee)c~7`;}eALAwM@c2nh!b{2*guN?MLj#b^nX+PP&(|9xSNuhESn4{1F)Vv$; z78o0S8QI|_Oe-+~^u@qAJKnS9|NiY#(#Bo<M>UFqMMZ!(Tdr5fULkYg5VJ4q{_7HG za?qUezG*Mb)&S66X6ND>3ilRU@79{u{=}$0*#H~>uYbYcRZo3<tiH2)P)Qy$ZT?Il zIIS_k061pm`8)May&^-sD5GTxLNYfu4?p9z-YxX=XGf5EV%wL1bac^RCF$S1eTyKw z_m}MWQWk|K55yc|m>bLVFH`3afHxp2^C9mpMViznAEtKO!1ejqzJ2?aSwqt@#Mgmj z)+E#wxP>YgZJHqU#JjxlIZ(2AG_>tHeah~W7ysI_aI&TyVJ~yx!bZ2&V28oOM~*zg ziqcc%LfRSm!4^U#X@(mbA8Zm?{-z*G<qys6u21Rv{%jxWK(fK5D!CV=O={WB3t;tI zs3aX7=b3C8H(Bi_R*hx*f8+pP3i|F~Qw9OPSXtWuKqHM0-qH8k!TlQz%ZZ-KPviCv zQ<u4hWm=YWb%?BuF+P_I5?=VtkWEcq#PRMjGQrwp*7}mqNLe-0xGK8JechI@C*)@L z^}+HY7SRYa63mDy_t7?X0AgmV-L0ev6yU|ywc^w@D6?l#%Rsb=3;lXQ1K755C-L{I zB}cWhKD#U}qL)h2&qIOBZgtxbCP%ef)q1c|2m%#AAv?8Og}k^Ex{e|n84z77pyV*` ziA?(mdtj}+36y8Bl<mfQS|uz{sNAa>PNKzus(Vt{A^mcq^PUYyLEtmIOiu-LK`13- zV`JdkyAPRUPE>;eKq-N60n&Q&`t?&mWEk~eSt*&kXl_1&x}RpzRUEbQLZF~I1W!qN z$B}OgdG~4NAP<F~U%6Xw4-$t618O%27hS;)oFXD3%Nj)NHzj4|Gh|J`b&WyiPoJg? 
z8#*_Zef!!=xqcKC-2!R`!F$DPaIt15ziIOcsF46F5FLhd*Jgw50>#Dky;`|87A<b* z?JJY@;wmm3&!posC+nAGhfNf;d6i4)^5tacJyB6n^(sJ{AP+Xkq}`7ABxXmmNV%Zs z?T|fFcIY;xTLCFIf@6XPH>E?z{opV)sOed=7BLVO_8WcGr9Y|QOP=;ZuL^d<DES$& zbAM?<;*^}&Ob2?#U{6N<N8`E9<0jJ~5dF@}$TVc^K)E>qg@J)fYd4kT`SU+fzQs_T zGc&g4)EJ}xqm|ir`lz_EJn-}+WK44GkO3&maY|PAXKK&?X0T~Amp0an1MSx~>AX)X zHr<N&cItD0RPBhw`3!BNtqqw@$e929^H2THHM<jik6yXmY}Xa1nZZx(_Vw#my<odP z*jH$C#P!o3pg^I?W^mywfU24vj&#eOQLwj(p_V0eHW?WiQ&58Fp?l)}iFwWoecH_6 zK~{`(wcY?3jRf_>?}ETVoWrs~Gs&LH96B9DX66t4DVgCg1YH{%rYw`wR|sOaD`W*0 zz~N-&<l<*C-Xhn)#0DBt*GkeeFbLc%@+a2V7=YB%J;(VLm~f4j!vKL4hs{buB|#Xa zpwEWOl6!FF+T{g$K?v3aRE^V~1St##h(e*C`kgs5GlO8)N;A{3_4oC)9!RNq!YrNz z_?hx@Q>{3MM%pk!XtqFnanqi+Z{9TCF60nsw$(*bUKV}r_li;M0^VB-vlB5*8*bjb z2^yB|gdLlVx8FsT3k;fdHP`Lo7B+t4dR}TbTzZq!R1=ekHQ^Mf#HuRB=-gS5K6ma1 z^e^$NIj)j-*C;8O0_)Uv7W#n|i>qXkZzg(K(Q_J|Zuj=yB@nO7>ErKjTcxDZ1T+b$ zD8^?)yms~?!rr;}T_l1lMvlm{NDyfhv;~OG0QqX4HkFz|T+T$QK-CT8GpYt`jV`w7 zs~&*IaH22v!+aRA`J^l<N|$vy4hYtk=P@*A_5Q;LHrbbzk@6u<*_A4>;0cfcq2M^D zA_ciZ<mKgu>OSZ}B*NkncqBMjOFd3Ilq}mFAzo@gKLk1lKL&6{51KFJL48UG-r`K} z4s&;RkGf4RUlYsJ%HbxG9VUv9kPuiHUI?jXF|9=GP->HP17D5CrO4B%9O@r00AT^* zUP-rnSMt?0K}KF)9Jk^;oySb5=YM<bvL2kJDJhu?o|~6H0G@!CQRjO!Mg~pGGE{Cf z*-PvUu@Ap|`EmuE9|}O-`x^vMt#i6#e+UEx)$#6Hx;<i!(|}p8{`qIC)m~!3LRU}< zODH2!T4VkIz-+WNGUp~bW)t}L?{A({<J&dUE$x0kWXnP#3|Q1=s%U7;W5>2{-Fgen zbpH4()N<-E5}yF5D4ZY!%=q~D9#Q+<Fllz~+}Sc4Us+iRl%aS1IxiYNa0e91_zKNb zqLsQXEIUz>^i=2{G0Jg2fBmAFO8Zq(lJL>v<<qC700@W!ybxtfR+WlSX9<l3_KU)p zL&V#+&5)S{nk+BfzI}UOfhuRk_4a%(4<>6>cq>%_0H=fBLdZRv)PL`56uDK=2e`Sp zH8wUTnrG;jg&|Q@M5@6c0s2+=OtP~wuA>zjI#&=#l@OCrRVRh{B_Q*x->{+U*DC?o zJbv2FcGzb0+jEt#iIsF+buW`FRMN6QTXmQlNX^SPmXj-nGIe=C71^h(T%B`g75>qC zk7(A!Pl($gPLp+mdRb#ZLygET#@iADM~)s1giwFOPqP|}ziid5r6QL!*M-L3su+vg zx4+~MJU<|VK6+(9_2$h#p^|sM<Qoq)XI{E^ak86}D2GtUvwEsf8)M?rd*d?(4a?vB z2#%nbl2Lk_!^y&`1P41Or`G%H=W5fOAeKS|Q5aA~N&E2rJ-&gvV0Q&DkkcmvtVPy4 zCkONjG=vw@*wicyvZ6!^4W<N-{xDQS2;M1XZC|jYw>y6tW)B?#wZ~3_Xy~+mL{G-9 zrTDzo&UV%oufqpJYgj6m;qvnIjEUQ|qVf?q`}nwJb`W_fGwD?ZmbMUEB_XP!pz!q7 zt6lsie#jV@4u%ECd3Zj2_<%fs#R%O^L)s34>o^6u}Dj<QEipGx8BM$ha)*G*i>2 ztJ;Asi!+2o2c5ptPrDh?$}&2lc}TA25_yTR08qF*yc$+-*dD8%8egCE=FJ;OP<K|X zz3)Xp4&IyVur6pjK`hiyz%HS;qMeA}OkGpE@AT=@*i0auwl-W590!}lnJ(#9WH(3u z(BIXb_*qy8Oah$~R}P>aW9_@}2pSB6`kq$<HXSxofuLa&RtU0VRvTouteHQchNqve z2tA11IR?~#etbV$g@Hqj_xg8wN}~(C-%sW+3f?e4I5g9FA%+VG*x=@o(Xr%Q7Im21 zxGNQG&lFbYV+&=Yk>B6eGFg_|AoQZ{k}A*Y=(M4qP0x>&aKVlMegPV?{p<Bgl1nPf z`3&m;d*})boEkg$O;qAFt_&3LJ$(2Owc$r`G3~BYU><MS3r=dT3m2W8vwF!+kixQO zk)%xCM)j&Rj>CuFSs&%$;SmwZh&F%q@+D(?AhGErlB4FeN^+u3y5K6VBwagQn-(PQ zvN$`@7sG^t*9X<0HjU|oEdg_`?b<?qldEYY<@V?PbA20Lszo*aodqz>h6tlL(DB-1 zk1)iAMDqsdT3Fa`f=5UWtZZyuAlSspR7w(`=t}+dUtsR4i-M|Z8l2)p^H(okT!QyB zpbCBmFAW)jWL8hs--N604}bsuJuQ>4BlaEaF7x*z_Tm#`MF^p&ljIvpqR`AlKiCaE zetv#0UTonvX#vzQEZDqdONM37asARzTo<HIw7VVBt3$W{3KSYku8@lnzSZz?AdgPm zvF#qinH}qhD|`^KT*i3!uU@@+0|O^ESH_Q%gM%aP|FN)@L0}8s%;JkS*>jpYIy&@J z0PIF6P|HQDHi_)m<qQ%w3OtWnjX)(!^T1QlOh%|0&!qS4<TpX%h`ql$u_Y!d$`)#8 z2oefeggy#dI`S91pCZRe>(V7<<&TC{(d6>_DkMZ(QE_A<30)Ee_NJa5axDCb@cE(j ztvQ#qw1S^LrMDs{3X3&GdO%_RUn?`u@#EJ5m$&`<fiq;Gk!9uoceVZ3n`Y0DuTfAT z7Ir!E<W?i+mG{@t_^Y1WxOtNnPUNk|Nsu+<a=y@t+96Sl5r6`S;E!-xa*yx#N<*!S z;vx2-@*x!O12VquUJV3(#xGVq;R9ZR-*>B#=-_ecsZY%1NVJEBK99)(VbD{d8WN84 zHwNuS=$6aT;XvV%wJz{kQA7>fzTDGkw?-CLIgda4SN}tJ;PXH|Y9m*-=2!ytiA~n( zDtQ<f7+{P+2!G%An9Q5|H<1fNs1Ey~L*LO#ccsFQYe>?SPSHh~BG+SUNM2sM1c-~H zCU6zZJlK3%NIH5d*_XVB4;|7cSo<g9E?r+z$T6cozMfM%>jV}}HSsFwMb#6m6IwRS zZoy+mn<l%<_F>VkPTjV!AQ~GKLokwKy=2ENXAh-QFuEd#yE;0sWT>}zsfhRn1PD>0 z9}o?1+R;V63)0nWza3WO3ou+RZ1`e~VW7EVnE`F!AfU5=V`9x7;ZDdeT3#}GmdI3~ 
zMB%*+3@G3}>?+E0UZVeFwOW3kD&>9G*RNlX8;D^{#l4ZE0x=6vaOu(|U4&k*H+*4q z!aW)A)<7#ZTaib51SP&HWq1<fFcO-zXxw1SC=l~AW6;C22UOM6)DpBZ@h`2pcSWpv zw>J2b6kJ4yb5;?!oE#J?8s^{bF=#;xsGtNz&YjcbP>ni?aR>l@ph%te$$`|CcrZWf z(bAK2OoC=tYtw*TOrecj+r5p%=&qG%AE%lW)|&|Y22x{=Qsqu2b*&OFC~JAu<<kFN z(9pcpKHl1if>uhOIg-DND|5UY-rWR5(6y3gMcf6j|A||CTt~jNS`~Bz6y3rfKi2I^ z1;*7%(WR{LPF{TlTpHjMl(xs=D|4K6e>^`xtQ;f}g4dl2BC`Y9qvS=;X+Cu5kSgd* zZ5o;h;@RMDq$+te06G4R2-kWa%3Xx&)PA<JfC!8?e6}6w3USdIP<?SwmT1h0<|4Mk ztx)pv1=%QDTS=k$q4xhqm%CE2qzHU%W*NA&;Lz6A*0pJ(y;0%6+uL<RmdCZOQHE@$ z#ZhI+>u-&wiuF#g28*yJY%0S>s0m;3MTs_xgvJ3;ZQq&4XXP<ZLZ*wnmx<XhMLps| z_I?11N^Nl#xz+fmqr(vcAOD#@s5_MWaaA*iNmddhB5>e9CmKKonh7Q-+=+=Uax;-w zMQJ8$1OJsus6qXWpFY$l{R>WD>EW*sSEi7Gh!(c&RzM51x;Z`E1_Jf?7+DtNJ+*9< z_G&-?0i;BPs+(q}=;YluEO^Op{L;r~8^4LS&>w&JY6Zk+Mb@X9E&=(I*&+rI#FpmA zfQJYhm~w4-x@5hnmJYZ5U!`p2)BmxQk*>wQdD8(^yb!YxQ$Qf87=EINWu-Y}e=Ns2 zEHsowKtKTb4_r|2go%=uO9>FyKY~<o7n9o}h6L3p0Kzph`TrfzftwI(Lw@zklN(Pc z>y=D<D>*qi_&O#ggV~D{3D{(%|Ft~1F8GkaW2F(`AG$BfEHGw$nz?R@E_iJpI3-e` zCEEprhUh<n^cOHgl+m(a0`w3@X(oz~Vq#+-3+JW6J1G%coNgDjmzf%DB6Om$8Biot zTSsq`GBnO7Od^A2>cj*Lig2|fyo{cx@F(@*>bA;!U013C^^`1vm44Zgf6&+%yp5(3 zsu{665b?J$FsNL<e2Qsplm*BbTsq+BqVL}y2p2XVVrPdrP(@*tKYiF^7MO?r=OWpf zlZ5b{9B2T4`ve_{&>5l4p&oCtIz+;x5NsXH{NTG_)(k+M>=l#<l981)ZvJeCGK5O8 zdCS0MYKegd26NUZ;o!#T+ZmXzn(Fof>2R2C^O)-9C^<290U!0N3x(4Kg1}%I$C)R) z?SMIk%G5ct=SM-pUzVgARNRCO<zTs?B>K;Zn3rR*hQ%MEg(yFG=6(o!P25<T88NU1 zDg2qAUutB^t*hD3(Ei|>p-d9i_sRPk&&kO>5iS!rLT;3ChK;`e{BxPu@c?L^Ks_6G zl)|(k$`9>zTv-t$<!IW9^74bSFD=o-r2)r6?-TomPvz43EC|^frnC(T7<qmYV*^9Y zqU`K3fb@~U_PxO-{8S|E?Vj0Pgx!pBNo|LL6nMR0&yZRA4T!302p;s?2h)XslG;ua zRrnEPrHuV3@d|`!3*#UJvu~M0PNp5-pMmV%1huRzbHk>_swVBd)mR)N?gCk->Ip2I ziQXze5n2q^T#A%S@>)k4b~*ESM;W>dA=M{jPO8yUVH^6&!lem3XstI;pF)gs7IJ0J zLs&tXexL=B2OSOJGznP@gEElrk@-&;`OecYxMnJ(4m4m-i1{EwRYRyo%R$h|N=w7k zXXc-pp61Ls40EMRr<!2mZ_h8nev&s$ifp+DQ8`g3Z?ap{VdQk}zk>JQk*p$JVhD2@ zcm=oCSaITOIW!-v;ydEf(f((C${?I!k$}l4AjA+v;b+%=g;v>sshJHMHbABt!Ni+N zP#`zvvUc)-HcERjT1`}`q#D(5`o8<W_`?6n5stHwas*(z2?4_q^Q<<_c6#t2&_Bq- zO<1+)sot0bIjF|a_0X)f#X|i3YoS?iYNjUx2cs8f-Q7b=Xd4)3-Nk=P6&Xi3?aIpA zgG<mp3kx$~)840Jc80(m2+yJH4^^nnLzISC&%SuIlwLGV4FvO2(_zkdRGKycUG$1P zah(;eYf!3b@;vC|&z~1YvU7dvcn<yabkKtYVi{n;2*b)m2~Y}xWZZc;GVr+L^e{N> ztiuh#oNe2;XH8VALoIp<f~Xhbq--U|!C~Bp*evia`TjksB7+NblG3g!J~pCqIL(fu z)vsE!ahQ#)05}-ux~(QAS^fIf!^=h3?SjDckS*9B^*=Chx6TO`HQ*>{VPlYgFm6kf zRW8$KW`I!5kUTJ<@(7c*G-n4=O(7nQ1J-0hU{q2uDGU3yQ!miG`ifqPX-k&%&}x!G z4Pg+0ci$sM15gkru^||2^yc5DL@2J#^V2B$_dLKzA~6|;91?=t58k2wVa0^(jT@a9 zvmB|sB%e7yTBy`gT8`R)zSSNb6CWMdGVnh_nKtbxfJYFhf^Xld&J*3jbyYRcLPFMT zFSb=Fs0ViKHWdwS!^)@~F6%azvTWVHU2I{z0@Erp5Ike#h)dqWCI|cXq<bz&pWg73 zlL(Yak3IHLhFl?N*9c9*WdkSPKD3fV&qI&{XsSz4Z_U9=+5!*-=*bgLPtrc*-$5wV zP{X^50=Dcqc<><X_6W?tmMo~Oa*ToUw|0;1Rk)T|@wNc$vRuIN2na6@Ho4$b6d(-N z=IiULyj<fntI}j;Wt$2q^Fd46nB#(PWj^*xz_cly??eS44lxfM`=R*#rB#Uz2yS9B z*ctz?Z2YzqRm1^9ld$@e_tB9u6paQN2%Xq@ZXh9h(cacSICwFSd2tx;F3(s^np_3q z`Rat-C^7Lf+#=`2$Ttf&_;#^S*QubsoC!h;!psB;O<No69fulC8)6ot{<1_K@K8u2 zK9*<>(;1>dLNf#_#ZV1ROBUoi#1@AZVBqg>WWnPtAzsdo1GVTvI6xoCMsclFv&SHQ z6o%H>*z_Q|TQ9678P-6C;)IY06u$2S;m~W^H1S>gN}Z`5`Ul0qWh$ek3F0{huMBMe zxNnj37jsH4v05>GFr^}EfzS+4@Q~;CJusME4yUXXl+G#pI7wn53F(Gf@u4?HPY#2h zuS~OquqpMI*S>^Mpl<T1b8&8nZi(p?sE_&Yuju_Ey&4iH3Hc0_xT~wn!(*THlfV;p zx7F0@Q?tzBb%LHP{`~Q87VXfu2TJJ2GbB>fD4ZO^lLcS^t{Eq^LnJ@p8_nGXG7Ln; z($Qq)<TSw$CPuC#knmx}t9j2t>?ZP`O)d~h=_aaNLJa~rH3v7u)OV;@bkO7gF>1iP zG+lsn2B?|)#^uZn@6IXvAcVQgYHGr`dUU+0HXUf>gf)#eapqNTP7?OyL$Pp7@7#9I ztm)tI5oWh;tChwYK&P!hU1D6M>%bqdsOwtlR}-_|=s2S+)JU!xOwB~+8)J;%sMIPF zuLLndfH3kfYeM^lU$}tslIXpal*knf83OAyRG$JT@h+_}!C9ek!w4#E&4hD=H9UIg 
z5Zo@s@H0eL!??><fR)~$dV`iVlEd&+z$d%06{s*yaT@fb&}MtkYUha<ZR3_KQEE3x zIwvrizqoGeDumW<lr|#USg0`wF$7o3C?S)-S0QJS=q|*7In;Q1DopWyZ~SC|PW}1S z4D!xpzMZ*Xw?z@`x!p03fl(XKTu&I3zkvnB(ReBvAd{eKL=*pngB6Cqzdpo&F`kz9 zYzMBUbD2Jg(iBF!x}S}JbHq$KM(8E!zkd7H<g&DYd6<*v+)>(waZc($_|TZKN@|}N zu|HE72?NQGPfR>U?FMxg#J-0pBf^!7v7w=@s88A{hD!8QK$MFOEsNF26rlLIS0Kwo zn?;LV=N1YZfEa$J!`LAK@%v4Q@@7!+mT*|HzQjxp#)wJ0mkDPB^mI>rf!D4n)NxE2 zoZ7Xnwgj4nCPt1>1UJ%|W(<7PUL5r+JF_xW>s<=)bIzTcXc{#d7QY~E!*GMvg)JUP z=<PcTO7fKTq%W8D9x6$=V+0u5_e5|nvEc6#6Nmq~B=@7RaD!W`NXtwy0Iad`29g2; zv>7;W1i*o=Nuntyd`XkWG_wT^`-}rVd-2mHG82X*G5>h44-@wD9c<zc`RPa#9>nD| zLlQ@EkAYBomj&ayg@gwO%_ZvOfGLv7nlM3-Gm`X}o14LK%n%AYaju{as;|6o&l`wd zEYv6~AK{*1$tffGu1h0;PMVlYLK-1RhyfQ}m#y(+F!H4V{}sH2={-bh*nE`1LY zsXr4A3ev_JBMul924KFF8E1vJ6LAK+z9nk`L8fGSC%qa)2aVNopq`J~jo3PjkpgN^ z4kKW^zZKI*yD`F(R#=Pj7Gb~PYmT?`csRw##&RKnmM}Gr*?iK38)0BfDnTYhC?TW6 zXx}rxJf`yngN=7tzJLGjkCC5}U{SKI4!=YMvJJ)VAxZ=>R7Oa~w3g@-#L1CJ5@ueZ z`UV9pfEmE-28tI)i-%rF%YyRWMAQK)NjMn5mq>gDI8u9w!A1$u6f7Uk4ZMzxJt`)) zXo`7;cag!2IJ%HCn&7fv1bZIf7Ar|86vVg&-s!rko+Gsp;2{B0$~J`N1sNG)Mu6T3 zs{v8=7gP$jRzytzpONoS_^NB~Fie7(F94GbZddV%Xj<vnbX0;l#H@@0Gf@FBAqeFa zqa8TO&}p37w4FFSfpO(GuvLywyP?=+O*K2f^m>TZ%UKZ}8{3bmFHWN7uwc{+WOk{{ z+u8E#Ch1eKj~@gP-ZJ6tyM@BXfTBPMSHzi!I<Q7->q(4axUpa~_zQ-lULX`GR(5YV zd3D9-&FHB>S!OWzYC^Y)<p@-dMTqu0sB#}{+6P3m9kUrrNj6<~6ylebrnydTR@|g^ zZ&LQayF(HCWirAm-U?S4XeS5u>Yr2-j5{Bc%vE6UTB-h2^`$HI2i)~p=W0_&=GteS zhDsK0{yBL6-3exw%*H9cnuV9#gFAT==B!qbB=*6(6~vJQC~w^Av2k%dbG*Pr`Wvt5 z>dsB{si>+R;^1J~y!p88@YQpAR3xv<`ub6@W$f%e!8C=bk(`opAZQbbbavyT9R!+c z&DgSM&tPxwd1_EJap!r#^QJ2#h^l)^<ejapEh1dHyodNj-it3^zI^uVSzB8h<Zc8N zi#x^{Rm;yoU6;ta0_jLchn4j@=*}Vd&g+gJJEod+Z8htg&`{En58i&eo9q#gT7nSn zb2BrWV(V9tT0HPswY9bMQkZ=j`~7W$23Zwh0j0PCZ3**zei;0*<iz)v(tpm(EbwNQ zr!Ffm$E@<U%&lul5*MMYSXx>(XS>K#6Py^xjB<NxqRB>J>)h|YK9QS6JEYFSDGUz_ zyMF!pe&YVQd3iU|&>V0Fs@K6R-gCLN#7*KAngn%khoD(2SSmFuuuF)bdDt7Cl_VC} z($&?~?oSCr>fXJ!pAsb3{C8()95!#>jBeB?vW@(HsqctM{nqa82j%LdX*uAAyKBJj zFpuST>p1y0@IPK2k&$d1nN>A4A;)yw-c+n0vAENNrQ)MEOYPme7aZ<jk0loNxZ!0d zhxuu9Bcnbz|HCu$vvTCQ8O9G#YQ>!A{prKP!+CHL;9TS?((uwQq1$iXy*td>0i&t8 zx%ofu%FN0t;touYbeWYoHd5!$KQgGx34BZ3@4Kv7exw<;1!)Rn6gOk!O7??7%Y^O6 z57FE*Hn#iBsCi@!sq~m~f|l%Eq|P~hvI!7%50~Zc;UPm7+ot(Zv~qKE-JgQrd+p>o zLxj$J2*xWXdn!EWH`CJ(|M|6xxKz?+1t-Zov__oop=O0{_#9%@(m%xdgg*OOT$}}6 zm-P*1QnD9kV~F*cW`6tjZGC;c`%~;P8rxZ6vh^^%rX)nX1v~-{EXs9bQxjMxIRZ$P z810hQPqgg0_~Rz3Nfpdaxx1iMGQ^cXeb5_Yx-dUa&5EuMU>CpSNdE4Tbc-(9g*BD= z`DYM+<jbeq<DGgnccs(=36?bHg>>R#KtfD&92xd(+LQ$F3*sZ{Jg@rxFp&u<m{!2< zogvaGH7V(tr7(%)^D9{NRs3FO<i?{%>&drI6Zf*W0o<dzTVLud^ri-YE*9bgkl|Ky zlg5k}0*;@4A3!2ef79^!8JIv^T&qVSP!hs&a;PO_QT6iW$`}={A8!e%zKTF`T_qvw zSV2j!q{PODzDFQJBX~srJTP#6v{UBaUku0Rj)Vj?a^Tie@_jq=gBPQoKQI2hRzXY4 z8JdaIS!ih+H*VY?L`!m|2dM3C{KUh$|JbpoC0#O<#gBT2npYewItT3qyN;uV+KE?J z9I?d(Wx0rniSZs25b(Qo5IqR;dpU_?7Y>TPHjFk6VCTWOtlH8K;;!Z|uUxeXzA`_) z0Y&YcuphI;g$oz#?S)ydV@3{$iM)ni%D5abnv$27cK~%9FGrq}Cr{GRc>^7tL+J+y z*V5NNM1wWm|7sn{^-$5eM?yDHFxwSnCyX&1$m(9uZUu*4a+Fx*X}jxsdg6AYZ*Z1} zlT)tz(>w|MMe;5Z@H`P8wCU;TyjP3Wh;JfZ3e4RFT-aLFz~{xQNfLs_#>Nn@zx0|S z9lh47OvFjBJn;5rbB96v8tjM0hbUw@@?o&wc^`Or`P@1v?v8T+WuXwFp8ujkue5k_ z?cblzVhte$s?oveuRBOt4j@mg?l7Y<t#MYE0rg!$qbe1@aqr$ecY1h?_kR-Qs<rEN zT%7D(6oPa7FPK+m4tmm;f6$XB!ae7XkaVWYk`@s{h^f1WQGP55lh2EGtZsuC_ZrtB z4frAY5QJIkE+Fadm@BpX`SYhd6gm^lbHYZ+k&gh#v|O=5EQG5^!<mGV%!_HPl<Pbz zNUf!@S9l=Fe-Sn!Rx?L_2u>^%j1y=(@SWdS;9vER4nO23%^&iw*eb@mhNOQhE!uQ! 
zbX30-*}CQ&aWJP<FFY#htSSTF3XlH&TT;~C1$nRJNz=t;?3a74kT==c9*aCUI9OIz z_VMFKC}VJY6)#`@=gkYaAk#CQ9UY~ir#-im7j&1!%P$`JxI?dv1zB!xD;d8R77Cm_ zF8&6j4CN@LJRbo%HPE24cV_+{P<l{_oaVjk?7J8lkGykim9C)<QJkEdMA`J=CytA> z=G@u*!(&$L!%BLQS+VtT=7w%svk{@8Iql|}tZT%de*Jn1J8^!O^y3{P@D0wrS;fLX z{&W2}AF(z1`!RwJUH7!ESjf=e;2s8sso}PhKW@Tv8Ll|_;TUOJ@zCMJWuyTl3*)Z% zUUlT-Dku<GnBE%9yXorcu4Bb%=Kh+5kT(@GLf;As8tUqhq?@DygMu(fq&e2`r?d0k z{rezk`Cq>}F3txy`z1t0)uQ;odDyjUSB><!YtY42RsCTAqgeQHX^)PM+MWo8o{6_X z0x$aY3&iHg<lT!_cCn<^S1`+|R<Fif)e(MvepuODT=EctQPSbONlO<L6~#irY!U7P zYeg?cv<BluiJDq;baeeE3q6bz?AsR(UG~8OXGdpeQ?^S>P0bVvE<~$Gk-5T{Ag4SR z;L)Sr-d@1Ll;q^f($c8HB0o+|@aJs(6NZ(CAYPky%!%u>8k|0TuU2nyrc`>}(1{hT za~>8VSXYT~LV^SgkUnz;o74~1fiNueLv&xne-;=2hn<w;?ItA4%?8k*zaKHT>n@tO zh_LY6h=>JzjVA+F{I`uToA7dfT3T8Pz<!%<J2e$2Cnt`9l$Dm&W;tcNeS4GRiK+<h z@ohvZAiB7Qwic{;#@(=SFva;?_Ws?-$Sxcf$FxOuk8<+j1jb2F1JOZ!CSge7^%)=q zqg$}k>fn@Nxxf0DA#^drfbteGfW5@R08l|<f~gMIA{@mfjrM%iVA5tk=FXi5&LA4` za)0{Z!2<;OIpLtCmZ^ZM5^Pfkz8T{mp!l*8n)ptv0VYmy_+1DR9Iz;kMn*+PtGzJ5 zs0(qjrZsofWIo=9V>~V{F0!(}(6u3qYu>WC`!pv<LRwmSQi@}|amTi8Rsdq)GDp>t z&z^b9Z)zasve^IU&%vw<D?*P$sCs+C9Fq5n6)T?qq5!7QYD+6CZzhqC7~F*xpf^7~ z@^92Z+8>*w(jqJ@jESHvQqX6)Q66Upsr!b8hI)F=QQOT;#(8=cqEGiunEBAk-o@Xp z@#!VO&NR2QjMbF<ft^Nl{G1oOnnN-&GGbz4f{F+M2jdj8v$H7KN${nhtH9!b%*KRC zq$Z0+bb3%D^@|s!mb>syQF}|AhQK~z)FU%96AwdJy;Ab()vK14<1H>rngxT;AE%#C zCJoCvIywdh2I?P$sSXQDN2e^)!REk$2si}({{D#j=1&%pVPSh&SXyyB4{1119k**L z5CS<k3=Y-rbtj$^0E=?fT3YP1r<WIs;w=RQ1w4!aP?~PfiKv7GE?)<L=p-DJv1b4w zWdQC2-oAe02GL}{KHLId2H@_=lP6+kt**(*<Ee6k9UTPV_B}xqTf9QKVI{u%Nb?u7 zl6co>DEXt#z4g-cpkr>nx{mLdb<GsgeO7Gu96gPhAiwp5@yR`@+|(O3V7!o#gG~t^ zP<VO6_M>JtHUfNn#uab9P!qw&z&U+6HSPZt?G*kX!D41<xsi@8LAMx&4Bi+PkqW?# z(Vtm(ASo#+J(3O+y(!KM_Ii4H>0)1yXaoKIU*Df{+riB4%KRPO6rc>RuS!p6<8Wy0 zc5BWGR1(&c_aV!lN9};BFE3wGR%Q+iH2(ZC)*k&QX|7ONNlh)?dO%Ihz5#Q#DBKuQ zibk(KbZC4CJGQuhN)(ZYi7Xs?@JQF&vtx%XKAI5+&Ws_)Bj=t0#6rnHDU}Zt#3^5P zsqfHgx_<v2jeDVus@GEyX(dl48<Zu*8SUbR2KC;hb+oj;mco3`{rn0#iju!!#a++1 z#mp*eYC6GlfU4XBv~%}BWe$eO8{b`GTfO=o{-Lk8w|c4#iH-WVpNp#na1Kg%)l}OK zEF711)_!+f`bI|HsN5%W6&Owtfe8T0jT877ZOud|APCe--bO~I+KyaP8&-Sx@#7@K z4*jF_)Ts9Xrvr_;P+%OzLh@0+1~GhE#hss@zkU05Uc^a!d$z!DC!&HEy%H!XEmi1_ z2beE;$>*7mrj01=^zVYR2z!jh9TI||u<((?hm{x;Z~kILN=uzJF^T{1!GfuR^T-ht zayE_y0Un;1`bl-nw*s>APL7V+FV5TjVq1cki$@i#V?}hw>IQpxgoH5n{^w&?$V*9W zGhwV-n(g9(Q^T&VuJ^C7KPqU42Pt&y*f6#ZP26bJ?q};aZhTWEbN~K*06c@l_Qi3n zY-c9~-5rUmIy&Yvqc_k?(u??=W-Xs&u358&h;4HzDJhJ9X+GJudGnXA;2_)h9%r)i z@+NaiZynNIP2yvNbz*G%i~DMI?3LsN4%QAdLKH_S52ztQC+wc*LnSP$u8tQ*&%B9> z!Rw4-hX!R7BESk^Pac@l3!`<1EFa&ycN!kIL4R+r+Sm`9{#t<f$mr;FSV`&o>!KPn zY)qk6pj^XP2J#M|a01-@w9cthrz(>|$a6}>YB-uQI!qYvYa1GpUxl9j<L2m?qLLDN zV|MADr!9uic^g;N(l8p#{ZNBb8zG4@0)^=`O<(EjRa-DVP}%w5RIK+lQe_}40|@g% zYhbGy85ooiMkh#~lm|XboQ=&RAK)$QCJ#TqNp<W_FGQTitH<kD-LYP__V(pzz=({D zjA3Vv|M=`Ub6i+B{oo4-uCKJ*6V2Ptz;jSzs}ppZ9?q@8Sh$J}E(?00%I(|l3F1qB zO~)TNc#v>0QD%#ai&63Zqs)I@Rh2594W$)2r_=+~ktm;n*Veqeyzt+65$Wj{%E1AL z$uC2ri&d4C{{B5{uSoL^rLi11pyapeVQ11xSJUH=b?`fgaAlGA2O}A@wY3>>bnye% zny5E~qcArafzxx|8`iJSV%7Bpo<U15Q`OcE(cSNm0FaZyqj&QrtRg@9Yleoxct}ls z#9H^Kphb{XwJu$%`2M}SyDa>-z0Tiz`T;#T?epgoXdsaHu%AHd#5g_~UXpLyHU0TB z%a!i*&oMC%ClqxVISRYbNMC=)&YiIJWC@!1ZowN1+>~a%Pf$=0?E-@$$bPU@Yzop8 zA--v@4FNZXYK6-_c;LXu%sg<+N3cg?D0~gg`{d-<nY&wd@75r?7KV4|3*x@y<uQxe zZ{4{wz}GjwYi!%rtq^iEtoq)9@!#(F4ol@hmzGi&8WAunFAXejC8a71vEqRxS}H12 zul~$n`UztE-hKNJdKlNWI}r>u1<(fdlkI{(OMFIN8Q({-MD`muuJ?`?#6(2+1O%L; z{!vm=QdG3nO1hSbCZO6sr6R|B+ewX*LsDnqL&J}9yL-1K)4?ww01{0zDk}|*7Ye$6 zKtPfwL!Y|6{Vb+X!OdPge-67B`A3X5#HkCcT6Gse@bkwHKGlO0#AFQ6AvO!M4!}Xg z+<1Ko;auWb0+m%&2mEWjN8KjkY9kq5C+LIrY}&Mm7aL#!i&^MqA0D!Up0JJ;ZU&G5 
zaps7FP=c=gM#ft_20)8UPPWAKKfzbcddedjdBFT$m2~0Y`zWN}sMFUJ9!juK<1Opg zW7@9@V-#<(gNQ$Pq%y)>)7+;{pAN!WqTB(wVH%!jka)NPJnVz=!5RZM#E}8m`mC%h zT-o?9JLD$D%6NfqF!z63W#AHs3vqk5JI|}Cs^a!AXc-oE4OPWs<IZd30heN@_;Ab} za010ZTnG2X9KElfUrcl~MpTg_mhd$CZ%`)jLdzI&x)pjDvzX&q>K{LT2sut46C>*- z^cgq;q=5<Gni>H>2h>e~MIda{RwPR*0&9||i};l^Ggh1;2DE2miv+V9-)DrdKrh0{ z-us_GGD*X_2BEzTb5ld08cOwo7&C!a*^GmjfT$1$Ecb<!5MvE6^Z=kB6xE~sP0FI< zjgPcTzxIHs?&IY}GuDo=LQM^gXj%-_Uc_64MmJe8;4shBREtZ+aphj@9M%*o55J)X zOLV&FDG=Q(_)&xxAnY?=-}q!j7Wzhn8U{4rCHD^?is1GGjZc!Nx7D+qGw<D7Jtg@L z*;AHNTdUgb3Wor32U_r6U~~qJ5)%Nnh;!ht?%rOA6b<N$=g;>dC5TB{-XDY^HZ@ez zOgy&7)s-~q3k;@_?W}n@7|{lRFgh}#V_H;Li0L#Unc>I-RIC2dJm5~hdFM_$m;#_@ zm(d_DUmnMj;GT%nL68y=2{F1WjPi6-xAmjfX}&!!AfUItXRsw(9D}K^^Y9mWP`$&L zpikeO)b_A%=V{#AKvM=_m`^4*LK;(NfG2<}Q#b{(olC2zeL?%Axp`1Rtvc1pT%4c8 z?Wn7&e!m6J8U~DvoSZX;%Fyy5Fo?o?dD)U;dc(+Q*Y4d#&0pF_^xqw5fEEVsU8{aj zE-(!U>*emfLPB;xO5EJsMZV9UZx#z-W@2)})GOGKn!0+8w(@WL+`aC#4Gj(s4j)pf zsa9f?<a{1X{&b){sk7jd$TIc;&G*-TU`gxLPn<Y`$s`p6e{XMj&GR{Rk$~;#mzn69 zndJ-B&(Ro;;DmmFAV2>{>#Ed#Hqz84ph*l!sLyu!K7Fd7ruNhPToG!OzDpyVnQ#by zDQ9E1CnqOk2Cg2fa9=3!4W5iMi(Q_c{{G`ffWN=0y86;+A(yzlj&NlNbTtMBKh$IH zuI*Cafq_v;T01*oB|kvYpo1%R8{_d}HUsrmi}N$Te_xxNL{o5HnB_Zqw5Y18>dTkk zfQq}>K&8K8?IDEwXL2Jqpo=1!p+In`C1X4idW$h)<CN_0&dwD33B$>%^X^Z{yy^|U z(*)=f7A+8&Qp+&;kc&k4aqI`m9zHki(<gwC@}EDWbJI16<}y!@X_Ya!`%3kyR!UVU zUAUn1-d|vY@U1%A+H&LRF~^9P%h(^UjEp&GEMQs(4Gjt~Xh7J?A~sK)#(*KwZ}39j zB&OAXQjwv00=DNbT!@`rT@4j5+L;-TZL3Bmw@vNFH#?hcju+dADO+=MJfY(msAF`r zA#?-)3WH~aUymmqSisDm)n}iJqunhCQ+zces+j|N7}y+A7vD27F(HbnXHZ<UG)`Y# zg6UwsfcGt50y;GT_(Dp&A3aJ(1vE1=L#x09Bbtz2k$<kJI-1qlv($tp2USanX&8%- zB&xWb3^Darh`2Ux`pT70;C8~ov#2=9i#Yp^*C-My>?cbvBzk?E7Pzo_7>i3l6}1)H znVDVCLcr1fbakOUUlt)|>I_Puhu}a79&Ds~`ZOBzH15%6sCi;^^avN1xXYp=Kn_Ng zGL%Q>rX~l}8F`KC2?@_Czz|Fp8s_*4aqE76l=+Vzts}Va+_?if{dxrECLBg(zkU42 zxW&S$S;2^eZ+B38VYrD8+MP*L`iSjk1fi{igZ8J}5g_b%(iqG#`0Bn8x+6vK^t)Sn zdeWZtH?CjD*+d-!gE~YBl$q}CZo*RrTM!j>LbF57i=U<I_1+6H5S0a$f?YZG7+@a$ z8y>}S{dxxq8O$HZ-a$b@p!+x*Rg?2@;SiT<iUQ;YfB&<(V9l?CqA>?q^0mr$=Kb14 z(`ngSfy+utagz^{XFmb}L3z4ZC%jKY<dt&zuAMupjGoya0Lr7WjvbOzRD707yryuB z6=z6I0kKU?7V|VmR!;n?tDC`~47|-EY|O;?ICLKzh()bW1xEw@1pWu5$FbD2-6ADW zE6P;z&|r~yi#WRlA7?W)6>Lht7YyWp2u_WR?1aDqJ>K2<ZNRf<+?XChuh=A&|AYzb zKL)R`u#2jy)A$hR;1Gb0;^+um!$;oUQ{&@X?HO<)>ytsnK}Pw2<KL>QKO)7E9_anV z#1F0n?H0c=gT}+I5?}eoi!hduh>h4xYan{q@b4312v+SXeE#g&V>~?JnwnTAaN_ed zUw(c6Zadx$Vo4lAId$qQYAPgDVQgPC59Inzv@-Xn_#Zs9!T?XBfG7D5@+R5`++#!` zFo3>_rnLxEa1yEfYE>!>4ZL`(Yiby%sobA_`631J3p2LZuJjm`GTfm_RkY9DyQ^3` zFkptj3pl2;l80^}u(!QEuQC$ji6PMEl)MFT77kruv(#=TCNymW%tH@uJ;ckbh>I!s zr<rlrE&~1ZXrC@r@_PJuJEsOZ;!o~}etrxza=|C;#=8}tJlhD|fQEaVodwMNCuFhG z(qVL8RJDWdC;$?er_MNd90QQz;vw|q*um)4N`P)icpPzHaR;^S#$^44woo8UWD8zE zO9+Iz)b~kAuZ>0Sy1BtR;^-}feG`$3n&iJ%P7xiWVC#9{<YI8ux+*d*n8n4_H#F2Z zewzUS4#pux^s8}clnJa06BFScCSbrGDTTkG@@%gfn-KXYAmE3_4*`tA2y^yAxpaaU zuy(Bnt{w;+SAn(rgb{r_;){tx>l4Zoiu&r6cOb5yi{asyd!LAunSGspj5yeS$v(rZ zP3rCESM=qO9x%?cap&>ts4O6nNPi%=BIbK(#VU_6HNqW{x|VkLj^o#<n>g9oWtP;M z8XM(8#AK<>EiC#l4PNRtmo8$~O4@R)D?i#lz5#q45g9i#k43;$irNhBt~ie^w1&}v z9Y(3RVQVYM3WaSSw4h_hjzxLLew}bTO`Sx}Ks~(!$@{p%(dEZ!aq!b@eipGO$(2di zQZHV-h|=L>*t{8M9M7}^gRW=FGwIM6=={`w?TR8(=qJ!h?zp$OiI@S#LOFu=hDDnK zrU81r6fj+sjZ1i4jdQxN(_BFIfnL%3pd#0w(k{2V)A8-=R}4PIYJ7U087KwjxCA8? zBRyU?^o;T$CgzNomz0zYi}!(%muWM!XD%faWfl<?46h#sG=w!gQUMhiDWf*=k(QP= zdAz#3d=yQgsNy|J1DgLH*atx3;EzW*-sD(A4R||ljh;GwOTGtywzn4yP3|S{J9zwf z8}!?vUAjavk@PzfZ*t>L38nD{L#YR-RT!7($qx+;#c-y1OJZo~Rg6?7j}vDXA<|x$ zia=;#qnt3Q2jYiG6HvMjA3wsOQ;K*?hXsy^NX3@JdF8%zRbsKlCKtjFH}*r#Cg$C{ zbK)n93ky*N-t59DHsaK`ni%1&46k~Ay}bmd6TElw+<$ld8eFgAH+Yc>N~ZePu6>E! 
z?$r5n+mn+Wuvx0(Fl!|<@EpTY(}eVmqevSNu+cIFfTLDuZ2*WVBJK*<9W0d;`(;G2 zj0!J3H5E{2M{q$)OUN1wSi~lyAbu$>e&XAV0cyZa8U2!Ch*XuAXqX8YPI%rU$jiEW z&o|=q9w6sSaXcPrhHWm0TNefg`uFP*;2oj$2^crLVPGHx2@=x0d8HQb4)oxcaTtYm zTfL!I>@_BgKYSR&!!*Ghi77jLJEl0fpv?eAen?9CWZ4613H-|g@!>Ou@0erGCGH4O z#Eo32_z9%7fGIDf0!&&yKHNSzJbYDc>!shq+GuEY4h|Kz@Gmd)a5Ufq1Z4Bmx}YfM z`^=XP!9`h`Zl`-Y?#s%~o}pVn3x+e>j*Jd)D_f(`#}Lc>1sp{`6qFd|!E0T8Ps`|f zHPt^gr>I}6xH_^=#zl*2(0K3IvE%I0SNPa5#w{Cu`(TP;a1aR53<eIMCXma8@|}0w zon}TOkeO0keB1&8NX7$Rpt0bWoQ9j#)zz(gIKojmFsm0j0(sA!^W>R?r>yRHdhMm| zxdAK;ghs{6*Py+qZfA5)p3K82rV^8`fx(Z)XFwg&r%pY1NkfDy<z=Xcg@d`mD`q!( zz3QI)D?TG@WL|yU5!`&QTYgc&H^k-^wW~OMt+KTenuY73=dn@kV>6gSP)Z%M{k&q; zno@ten6C%f*$c-R9(#JWN40^g5RMJhZo*>C*pK)GE|4X79!7v)=N-J@pR=|0;l1GC zJr;K^E8(#NywlLjF>866W(ze{@}+~Dl%3|_XW4Js!FnB7QmW<r#bkE9g%NDoPxXNE z+Gs&*C7cEcg^+E?HP#IPbNR?W*MXbyhz(Jod~{?KRX``c6PBWA>U#Bd@mD^a=}sR` zVWFokuc(-Dbgnx73#}M%2ZHb8#~YJ&$FM#e*=Ys-0dbuXCvQ|P7v|&Hs--JC@p}Na z?`<CTIey!^`Nz+n<ynzRBeo}Qs$gmf^+j;7-bO}KQ?vUempeVm7Q&nn>caHQ44<lU zigWJlBFvGpGi7L~I85&u#U|u%?$m2P!^EVurA8;Rw#_N?U5O78J)=JXSi?EcE{2IJ zOgLBVJwWdX1ecbcj>3Zji4D!oxF;>d$yc0QOz%BlT#2E<+(1wN>OV@daO*2Mb#>dp zR!UX7Gty8-D_Gdrq$~GdU>=Xg+Y1BVf<GSWEiy1ND*P<ks&#-&5GbLLrF8!M`ifi~ zWMo*D&MiCu2YA@P5zTdKsO7^{xmEdDV{uMcrmkn?&bSp+S61RgYFw%2h14Y+$wvXU zKfL*sk53h(AOrSNK$~OV<ZoduFtK837Uw6Fg`tKm7Rj!iBesY+NJz>9PrmS*wMwLq z3L;X3OR=3SLPA&I^K~bp`r#nvW-0W-YMk6Z?-1}O)SVtu!IM0i9mgcjoOv5ue*vBB zxKS=zf9c08DGUimpc`IDaX(ZgX5a6z7y2TKzfu82l%$5=D3JHx&AfO3QT9rovD zoR=`jNh54AOG{o>{MHT7Cg>G_OfagKhTHBYBn%<1;WFX{%+fzFfCqzoz1=u_FkkPo zo!tx`0ASDv5BkYoXDh1-)PsWj{LNBmAc;VK))P8SB`UxuHw`<jtgM6#JL@z_NQ7To z7Z)Wk<X>!_zlZypJMfQ}lcOVJze`j4=c`U7(~gS(Oa`BI^46d^EA9(8ZYgyZqbX^T zH^i6f1Y(6I<E~$ic1idmcdmOAjAXyyU`{qR;t>{V9s9O}NyE%|E}X6k!BbM>60BDt zK|$n1Q)#IglpG|js@;dAHe2xu&|W8}1r&06DNOHRqyfmec9lPaJuM+~GY$5vUgLuY zC%R>w6T>+=NKWoqKX67C=l5iyO}JnlIL-2UK@J2eW<TOVjN$iGbfMb2KZOyRk&#i; zL3tPk$@RR{$&>g+FetX3Q$(y~aY1Lg@$v$E0rSVa0s@g%>4+1E-dDNw^BqU*p+7?} zMbu^Y{sLs45k2Zl81XHv=agFvjf_0&za8z6B^1I0wW3OlA41e?ZM`b>AUoRyrKHLv z6jNYWhW`8)Ezxh@xK|(CEM<E_3;0=Z4Y+rTwe-8y);c(y>>Jgq4kwcFvazGOmKG?v z1tSU?mfvD`JhUkk-FV~oRSzCLy4GtC8R?7WjVi>0VT@_afqnb(plKYwjt?p2hFN_r zzm&PpwlrPP;?D&I1$x#~($ab$N7n*)084N-0$LCm$pK(}*dUCXf?XqzPKbR&oTGo? 
zC~Yh94XpN>kbUOncAj3G=)9mu*ytAWRVcn{PBJCCz<Z10(?b$!3zaNfisM20<Ky9f zpQlFqm&JHpRIytN#~=^!WC{OiYzCG!B`fXstIE#ReHjPnsquUSsSpmF{OfRzyRP!s z$Jh6ENzoqE2o=x9T923?Da`%cO5W3ONXiO{=PeNr!yw{{uy4A5*9oqpazmElh0g{q zYpAVF<Z`+bq6!@a=Q9P@05+HRs!N^K*Vm8eZFqTAZZ#nYPvNmNi@0O=ry!QdW;_^> z7<Yr35%r@8$|WWs2?ElU#X>w50%Q;#+upryPS1!55CAHgy83zyyX|CT#3>j<v;*<R z0w?0XOt?3_%PlQ^s=I$~LpA1!CgUN^YiHW~(1V`|g113P;?zh}_&<cb2{@K(+xCA~ ztCgmeRWxXjwIXRkniW|MNF^CclqRA<BV}GIDvCx$14%N4CLxrf!B9dfDl{i08JqC^ zPU>0D`~BbT|J~cOJ*%y_@9Vy<^E}S|*pL0#592mM>7lKt*>S)^6BCpBBOHn<%4anG z)B^ZMT-uuRM;YuQtV7&{GRjn*(opqsZvKl*kY?%Wb@YV_O?+5v-OH-bQcur2I5;>p zwUJFRojZ*K!=yu`_o{!6FYcsnKQ`L56mnf{g%_HH?yECsg}aquFlX`HaY3W}HA(hS z&O!%4w2f9(z4vXg!TLQBGvcUda1%iqn0Jt@?6Z2Ykjevzn2s*&C3|7<hkrhfR*;rz zqDDkP5}OwiwUAU}n4rsBYpmsN-MWPj)x*YnUA1@TDxdxwBz-a@<k}?J<W<Ya6vI+C zYc4Jvae&iKe!Yof<;F-r9WmSYi_4XJN20zO-s*McV?od)=eKV<ldYh0d9p2V$8qWF zDe6AgEo>~xw~xLRTW6E2)al#4xJ}2kv7lkgODV7*5y!0>zW?uEk!&?l9>h@9F4(ea zqjce{V|igSdnuhRj5fJ(^X7zc<6!4)kw>YhAOIP1h`cC&NzTMYmx$Wrj@~!fvXBe& z%*+zbo<%6Q;_B;8H`bnB_@U5|!_UCt!`Ig3ATjYG3-I#Fvm)UH3#}MwCz9oR0fpZS z%>DM*ty#SqO^NNX;NYwDUO^PNy1F_$f1@QgE+xg{%n**ewA|_YQ|-YzN_M@Hy$?nV zXaa5uEj7uXlmcfDA#lW=2KO5otjDg%&(54oFo*D4^4#ump9isNx)B^zqS|b}Hh$`) z$#5$2T5f|pFH*U&8<~lWNa@h(@agBYZo@0kNlMK#A~SEi>CS+0B!}KTR;lYXcI?!t zn4<@M37i;7<SNS<i&$2r5Z|_%4)5pt&E89akQ#DG-5HKCOBAB<b!X}ZYiMl35a49J zAlb+#r=H1Gc3AZiO>KVkfK?P(Lh}zz2#R|Yh#^np>*0H$@vcM;Z@^e_<w)%ivW=%C z3rBKIC-xiJ)tERNXcR?zR%NC4b0e_=?or4FIU7g;dv<Tz_E-FH&m#9(g+2Rhynp?H z^f&tpN~7<*SMzU*?bdG`Hfji$aL4UgUuv0->-qgXSKfXnTMckH^_<jL12a)j{ZYuD znKRMadONy(o^0~1Z46bH3<}cz^{-#Pd{kV_qPwJIV;QmxCidIQ%mcy_^ztU}MF*j$ zTv+?$&C)t3hHdT~iRbtM+!&`0|8~8bqj)vgZ9HN3xuj)CgT!MEHN#Egk8{6%*D0a0 z$k|waN(72QaVJmy(|COKonBfwSy>dGmmFE7H*;ob;`;Q=Oj}Y!GJs95S-R8?$$(%! z7c5*ja^a=jsZ~S7@Bg*GFk$nsf1Z8XLZTcS8|%3j6EeaeZ5XP~BAJ9)gef6dKm02F ze$j%$`hK!zv-XfMhlc(c(~b(;4dzQAWTbC8H@1;V_W0z+7{jKfCJkt`*vp5ntUonq zVV4ZIz0r406?Yh~q+|{Ac`-T~@|m2OyG7-d$5j-+0>w#bbMXUs`BL0=QIqf^C{C$w zDaHo2Yuh#;u~>F5cuGIozWi$$WsY8Ea&=P)2_a9b?f$WRdbeN1h&Myktg9QN>vNAa zu8OJqvnFQH!uTN$5X^XLDR>4PIWl4V_`;VZble4bN3R*5Gr%x;e<LN9h{#CS#;^X; z`>At6{aYTo(^Ty3nz3`IRB<<JPC2v`@wcb$hM<GH-jey+a0YFT(q?tkt!CZY-Deq_ zam5Og<nC)ehnP4QFKNzv^ZNC9$4_5e0IfiRJb54wM99hTaG`Jcv&Ox-7{!a~2YW9g zYKc`<P`KmVn^F)xk>3VdQk8+`Mlwb4r0%&atfGg?t9ctg|0C#ST4<*rBIPzBA$6iO z(ON~Du<F4uy}`nDVBl6TdHh-fR>|Gi2W#?Ad2id64!uYbfz8!VPA)6w7*+6O)tn=v zE2x=1e|JdynpRg($b+YBV<L+EW!+DlF#&d$s;f6#zT|8vSGvFX!=s0)GZldU?;*T; zZpUdfc%tL%mANrT%3`lHKA4tT(JfhTN&iv5ONpM_(69uK7`A=frHi%6vI7QOfw+ag zA_eDM^gdU6Ncy_Q6_oiaSFY=3-R;DR#M1H5y^q|L=-)aYA0Lywdh__j=SCG>bqBMi zSM1N+<JY)Kc3Huvo1(S&;=It1kdV;OWnDY(VkhNfTvR^P&ROzpd+8}!NOLqfpPvjK zYJG}13Kx!ODhyIq-gzlilgchcWz?hy4my%F=gy`+y+#h$GZy;*MBorRm)1Tt&o57F zh(3FEuQw>8D484}ApbF9>dNBy=n);>@&BRXwamie39R}gP0fJJT|Pei7&>|P=NTRA zXFYNi&RNMk`OAaE14H53gL{E`m=_mT7~4<W@q4`yQD)!1P2o9D9&))N@nzik^VOU+ zFmFMm(H}mz@|dA+R97T|BDX-G!0Vg)h<EI3R`-ETK)bQ76xU6$W22l(m_6h{XAHgN z<>hs5lQg3?N5`it^{Jr&!okeQNDl2~|2H<odyB&5w(sqprffYBXhXn_ACcOKj9F-T zr-EPK{vq|=?k==MasROG0SPHO@i<U<N4xGV3hQk$$-PN)%$P$r3FKO9%Rg}FbF{#A zojG}u8~i%};M3jQCSS9KDYu)N|9+~T^sG<s-uF*+naysk57v9?eCEL`7sV2TaWr%< z#V7vx@dMww7@EY7E_gq9UCc<;#s_Dv%!yCzHDJJi7K#+~bvZHd?6;{K3LZS5z4NrZ z+8`yR4XherGMb>IM=~{BL*roJ-q5d?s6uf6G(;K)0rkO8sxFGFBZvi!vY8!I`Zz7u z^IP3vY3@vV22sYq!q8BAp<@pnnCSlf`epbVQCoumW9MBaXi}j2>NHDgJTafL1QMix zeV`u$i;BrDwzv{We0tO2Uw9rT)ZiwJ=FZhaXrlEX_3NRKlP7hCF6Uz??Nq#eJ^ku_ zXC5ZYf~-7agABj&dHlM~DNqNH4la`ypHH2er1ta=Om{UWO|o@?%+k3HxgBNc7<82E zjZPt7I`b1Bm-?*mYTz5GeL|IQH>q2}=LffM>#kh<SJ$pxPe-Xkx;R{2T0bEL)OT|G zdb0+H@_a9E!q#oqsSZP?n{Wy)e|}lhX@4Dg3tOCAlVgJ&S-3XOVeNZ&&8Xq8mYY2> 
zwVT}F3R;f>AsbMGd^165`nz>kx;rZ^y>_h4FKyd=cr^Te!h?p5!SG(A0td(N<qL~z zowB0-_I~x=ypVT>f1V+Qo=XThua+ws5wu9bHT+;w+zPi)9PElJkH!6cmYzBh-*mH1 z2nk!@&^@H2kCLI2(xW%h^kw~>hCh`LF_zV7Zz*ZREiPTUlv3FRGRcD_ZK0d4P4alU zHrQ36i02;Ys6-OJtR!Mr9=B(0=@sTG$ysNU-aKlDmNGE0I9kZrf4oiv-ERnwCw10D z3`oX$AuRva4e@WI&F#NBKS;1&e0l^T#D&O7K|COOcCnxq`wzBfZ3&4t5;6|QTWX^6 zwOO;@mr`?WzaYPUiYx8|7*<-XUOjA+WY4X>s#cOw3rWXstp*Iz%g+l`ywo0}tYu66 zVk7w5eg_)f749mZ*Be;<G;80tDX(k3800LHN$h^P)0c*F%J0;ss9fzmeAp2X*=xT# ztt>+RS68>nRmR~{0b3=p!-WnJ(a{gTp6EJ#cWwj-MBP6N6cAniaV@u{cH}hULi?Cp zGd|wnI|aE)jbGaSO@R_BBP>j@;r4{6%svAtlz+SI6_w~d<bCqUi{~6Cttd>RP@E-G z_TcM#^GwJAwz`x8XCkL@_shflNQx;(Ae-~LtLjv{Qtsg4!-9^RU`Wr>M#W4R(p9NV zvnju##+&s5D7s;{OZCf+J+QHha@f)Clkfy5HKB;XV_jx%zqq33?Wzluh|1PR<W9~! zYc}e3PR{bB4s)JF1)TgI1&>b64-N3Ro0!5JNSew(hrd%-v9@xOYuV-|-|Ewfi_K}& z;5Dpy*w2T}e{VFI?tBqF^c~-!Hq@L%uU?F#nyII!RwrvBhE{MqXvEg87j$=8x5chA zrItiJfVqGPTluFRc-zk;YSrEA*Z=Wu*RI|8>DxU$?>DY{e`Ky-tQAabzXL32V&gGs zhO6Y}wRIq)GsvmUlJ!Y?L4IswZ(np?vB$1=x4zIzR%v>inw5d~x+zoI%m}@PUu9)f zdj}g+hxOAH#S!s<d2pOC+*M8n6dDchi-fepX@c~(1)s=c*LOpVtk&aU*vo`#J3sZg z@!}KYqxK8veWHQiHJ8a+1Vrh4q(OduH{vHQMr*ih$YWx}tIGM`Ys=?-{T}yGeQJH5 zGB%CQt#w)}b@F!1m8rja<$cXYR~wTVI?j7H?y;G@vS0W;?rGt=>m0J<#*Y{7ysIAG zy>rL6QMvFK-B2}uY<_dM|J?NNjg5h$Uj@45#;3*VBV^f3={I7ZfjWuI2RMP4JyM&) zPoKWTqyV}ip#J)4ZrS@Zt+&T4v?y2I7vC?T6sb0{l*)|N;(ziHJ}FuEl<DE~;?xxU z<lvutibZOOqEgQ6vX8d8xf5$eWA@CMou#D8#!=?!IcbK_&LEV2($eXJi=3skwNgX) z8u9!~WpTbg(v1x<rr_`nw~DN1!R}2VfBXBJ6sA6TngyXYyIsgEn=3<X_~Fl)-@6nh zz_=Lr=ik`X3l>ZFsfybplCQv<0OCa_i#N6Z<rm}Ro367|V;stlo!tKA>(>ni-J!vw zXL5(f_k7uZvdg%6xsWoW=SHWJ&la>7^0J|_HMVwk>NUH}dw!=%T3CPN`1p-V1+P9K zy*jyl>y|C+<|rPTv1dJ}8AM)>9-;yH{&JW8hy#Vz55|-44YJrAxv{I1RDkAI66#OT zI$jQLoVja+Ht{1x$+?ev_3nLpSR)OL!!HgTr=?YT_wHJ1rhrwBplLA$J^x%kiMl#C z3n6a!x#Z-Dw**+o_en<xxV}qt-4|iera?mcJaJyzzsu0|c}jBw$$HEtuCY9Obb)&6 z?4e225)x*!iH1hy*`n@r&5%-nWBlEGk;R+^g}>LtC29|(UvkBzaD;?znXH58O$Rtt z^U8QoI>8Z;S%nV{&830^T7(^^0f_n^zW(t^-8}hh_4YOtt?2uSn{b+<6h&B#2L%Po z9M7D|c>2_pUVo0RepmefKWLZleMYOGa|`{7>YjAm$?Vo?X@4h|fsWb80nFm+KVj4D zv^0<L9oy|Rs!O@GZvFZtcb(w_-dl=X+dTi*Uw%1j=4%CowP2Rq9b{v3@5cgH`OUmU z>h1pZ9NmY?{p{`SbJt5@t@rg~I%WCjA4+}g_f4Q?$`WLxL|3=qNiF%+qeqRGR7MYa zhcg5h6B|3?!Za*{;2Jf4>(T`(GbF-R%k?WYtLakZKp6^a(Z7{_P7IK~?lQ!*?!{6a zT^<sl&hvE&Wz*tyX7|I3SfN#+-}*$^)1X6F_KX*#5-=WcvG~28GZ41WLhTWd`-ZAh z{5!dLN^!+Ru!KuvWbCNa<%G=@dR)kpLNzq^FDtKS*KS`v!J`w>sQ8U<W}7e2?W`O< z;>HMRDdaU%zRe(ONHH27w|jRE&ESFCOxKM1f<!=|zJ=w5sC7?Ydq!yTGVS?m`}QRE z-ruGSz`Bj+ZTtOw(I^z?QOh{8r~0omk-NmbhEM}6BQaNN?_M_ur4+Y=M3AGEb<kIB z>C_lulPm@UQg$WyA%_EGNBL2B`9cH!N-Wn>kV%EaKZKuvh0u8YX$w3-&##}wpfch) zIWPYLXR*R*k^bJqpt+vX_ybd-$Q!4wzJdCxPQ^fOO&yh)?mEcS?5(YDsx9GI1kV~= z6$-k}jUUm-QSqzbr10EJ{K8Quw%E4fBt*ACHy!bv!b(8te%{_ovJS7>i3S^%h4|xt z>F~jWkBW*Q#|q?;>jyM1>pL)d0!Q4RJx0B<@Ra_VO&bH0e*UZgn`wMt;i_Lb2S|!c z4opBQ=fP!m{G~0A0*cH*DP9!@31+do4p0^7bLi;Ng|=zpFM9OosKCXQo?WsnVk;dK zq{{fWUZGeGIW%{3zHqbP7M7O^<iJ&1+G;amhr45oMalB|4kPYK@FHYz6lS*mJQCrW z)_<bu+r^67Ca5|n<O(CW%h|VX?Qr{5UyAbVXEO65XCy$A>4=ph!^K+i)VssC&}!y< z>$jGT)3I~s9(;U@H?-$oB)F#L+jmRnUvYJQedJhLS{kN?Qm)SH*pq17rF8&(V-SrW zIVJEjdFyn(b*<JJx>6_ziu$%vu&=1&=t>s^Z9i+bsG|Z0V{KpMuhV3g!Ji<O4E@An z_q=rWPEO7-4cqR)qM>=i=FPIjQDX)U)Ha?o0v7!B(IS2M1)~D&zp9Lm3z46KdI|bp z=WSP-56WgVj503mu0y0OvV6eJ_~VfI{H%<O<)l*@ze!1b7V9k<Y`dS#KS;~H<mS!o zxo@Do5-V@z-@Auhz+_*vLF8s{D=V@3G-MGYpuv=?Lx&FZ{rKHLPTdI=h>jHMSMQLJ zCGk7Bt@5cJ?}?=dd#XNtg03M4$i92GzM%nmz~teY{Xy$EUPs-nLcW%_<1%1}Ey9_Z zlbJbW`0z&Pbb=l(@k=Qw<M(R!F_B1r;mq0SXyF{hsH_Dv#mt}IyMO;i##Q1Ye+e3p zTFdi*sc7>#1`Nt`IW|6g_yDJDf`Zn_NJy~JWC8TCH#0d(bdX67_{-R4@j2aDv+y8X zZ*DH9a0-nrWizK^J%^iTs4WECCi{FPTh-aoh=pR#NR0^-oRBJzwt;7Pn4&hAmj^CI 
zXtI{WSYfKZz8$`r24R~O4ZaLi7=%Oa_B-P$j?NF7d!P6nn6-?aQz-vQTdMFENIi3A zckUbF<0h~xh5ELC3tItj$eH?=^D1fzsi9H&`Adqb7KGOeViJ+=2nr(9z(GN7=u_#n zvPgFDm0pTyr&06wT~Iwt`#5^@+_`hq@mvV?$SUyRKshFqOWZ6DFz@o;=KsZHnN0j3 z)Rt}EvEvrFi-?50j-F;Qqpwe&%-z2i8pj8nj|xSct!g$B`Jt$9!|yXi#&iX`QL~sN z(S05jrQ(aQh5!Zyo}HD2X#3-nB<g(yq?4E(Z;)?*+Q)q5R4-o|(5cR>4_=hmSF?u> zSdxS5amZgCbZ)a|Sn$q#BRX*D1R(NI^?eOf;3<uE2gX*^)N0P=_Jy+O_vR5C2NeaM zo7^;nX4IQT(uS+2mxPB9K`#jQM=I~(LvGrij@8|3g~$wHSojyWT_^_S<cQwXD~qzT zE6|ZGd9b-{gaQW}N;nSh&E!2O<Jj|*;3xu$>Jw}A?E(Nx1(cM@;y>8AcP!S3=qiVl z*V&o20c3futS(4UthKm^X|wFEp$j3=qsd6RXHOB<7A=}Ra?58dM*{sW5lbSS{{BT_ zIV~hY(o~MmU};)f8DAKBm_2YOE33M$F8<P`CU)pAZKsg4H#Iga9yUqTL-0k}bL+(i zp~||~vsbSfTACujd|peU(^XDm=S_G{Is*|R6Jm~n2rU<FASAv2L>%VZoE)0SP=PM} z#@)Mnx01+DYRpn*#gM8pe|C=05D7=!ey#^8B*EHV<R3Vq$$H+<GrD7k{iCzjND)kb zLqogE{1^Z7@Ct|=NKV)kD%bW;62&$%*(fT`!QL{ru2(>0u^b0^4zozyyP!73f#4o! zdeCH6@Z{qoZaRUbPd~Ia;i~(snvY@G!4ak-{`_+zDp2bfyATA=_BJ+y?cy$78WSVG z6m^xbSPHzfYc)uU=MN1~`)uja37b{c9LFi7?pb<=xidC7I3T^!sCj{~jdF)+eCCKk z$SYKUr{xDP<5CG0+}|)+T`G8d@JYYh2#L?~D4;|`oVRM0fLKwz4~x>S0tB&rdLH=z zKttL3qO5EkcSk=N#Co5`)%rt}W~hXitXZR5{`M^?1WJkHP~-Wr`g<0wTX&Wk^gZiX zPigQA(hSt~ChrbfuU&h0Z5oCBWB{p}dw;$Mw2#hUs!ZRZJH0BS1dj(}Y?AQp^XJvT z9Hmibku7}njC;u;%+)XLw2n=3cW$sFG7qAkx6>JApgEPK)BEF9K;*AKPSevXhoj)2 z>@ned<(|T^zds!1gUcJwyxWX5pZEaIZ&3TH{g20@4hNo6a$8sZQm=S#oC>?HZ{MS2 z2WC;#=*JrB>wAtKzOj3gY1_ZkGBeAGMJA5ODT<4VU=GO`Pa6ICn5cPg=u|K3&4`u} zNkc7}f49^F;9=o-O=I$zJ!jsPx356Ob@9Qa?gRFaFWO^Nv-u%ME^LVHW?M!2=DF8E z$1_LLp>G0j*?_X30Hys;DRVbY##V-rt@1>t4lgaBe}EfG%CjM-3i-c-%@Z&ek6_*I zhK2^m4I8>&R_?av7R90mB$-zuE25BJLqx}?ixqL8vu@OAum|&<=)nV*T+UUR0?U|r z=MKA0=wm1KKgmu9l#yl6-&P*J57BshPj~VRDlv;tbKg^H^xR9`WpU!VDM)?ZmVi@P zZ&$W_(s{dxMS!FntY9)>7Ib_QiaCw?y1LB{4yf%FZe7)ZWn*6*!;vhajVD6)V5CIQ zFy3;ioTOKqn-*GH9$(h|^OR>Uf=ZQW?WIy~V<Y%;Etk)Goo{HeyFO+Sv{pvv@#c&I z*3K{2Zbt*+07^jj{LFce<a?&-E|Z0@nF!z0WN98J3QD<7mZ8HYOep5l($oK1IYg?E zt{E6c5T2!RuosLAj`n$#DgMxsh;Mg}nYt~w_gyo3;~(T+&^{X<KRU*ms*&-XnMHQ= zi!DINCuIHW)>W}0ac-od5f&1nw&A5N(1GqSS|`@`UnQuITYe~~J9X>U{Q1)|8vThE zD96F8WWC%AgSdU&vwlj`o;{;H!h>QCb>+v8@-1ylMA7IIC&1_(I6Kj)-5ayxq(P8o z64?v!(P!t*6ES^I<FjN41UJr98}=aLv9&1ukO%DExs!U<i%(lXnm|UHqb_=T>vjFM z?YF*TK*Fo|kuDuOdc)q;e4)mQXXX36&|5I`fG@fXjXn(Gx0->RLaAk8p}GF~)DW-r zrGv>#S%+q3jJQ}$6}Yl<Ic*XvaiIP9v6%yb#X{2;krT+kbSW_ICCE*HF%QCetX+&9 zSQ$PVjz@U!3ACI4c>Ca38<`a=SB}G~4RrBqhYpZS6HpyM+BXH(Ry@ol{w6;-yWc!G z4k^Lv)vNi)Eu5z4ZpjdHat!{~K7q<TCMG64ob8jieb^^TU~s-%Y$}3cVh9CBz7_rO zeObEzw9{jsC1AUg2wX<MKs^d(7eqh~MtM&(oT5=PbM8<vBzl6FI7J9dO-Uj2t(+3$ z<D<{X%@SjJtbNE>0C@u67)lcT2&6Fxkz8zoW2GjH4zf?Zv%V)moah0`YFe!Tnn>vK z^NlF+Q)%LRit?PV+cY|_Ytav~PB?#F=LY9mI?a?)8#9J9IoZAWN}OG}McPj-K%6}f z8%~4Z4v{j7JID9*j>>I!F8b0_b?rM?qGRWzMbGkQI*GPz+eW>fE2CQs{fsqun5@As zK^_Fqxm&lZB54?EMh{(i+qSG1gd*x{YJpnrZotE&Z{uko;dQU+4v!7A{P}C$Dli$> zdHM1L*A)quF2&Z@7_$0V5A?B$J-fAf5cgh7Yq6eQBpeRM4y{y6Bg<Nwup>g300CKt zWIQJ)Zx<bgpWTG5WNSW0$2^6(VZ(<r*Fa2d6r<+ReRfLqW-1`?3X$U;WNy@Fw78&2 zlPp<Yvu4GxFQ5<zWj*S)+M{=%&0sSuB4F&ElQ)2$GA=8CYDf|!08|Vm7He9}#%~Zf z2#Gq$6p2+M{~`Y(;xno?JuAzOg2sFuhVhmSexRLdAF_lag!_SbpRNU>_k*)t2Xp=R zgM&{utZAn`QV+L^veqERUmKR*o7r9BCNqwqe;Vec_rCsq#T}w#)1<E?eJ5*w7<co7 z|Gu=}8@hg*M};T+8fTW`kroT1fB(7VPieV&+m{7n{=4l}o25&@i$I2Q!HkwQv-Wf+ z5JRO$9eXS~;7_Pgl_DG^ow`5>z1jIyCrMTMHSf-N89jUFIusIW8x1l)Jx1nHJrO<t zhd2G9B?0g2=(@R0&MKCaP2}%pNC`WUO@FjpLftFk1DJ$L3|aF&VLNqCS1mXiH^>&# zptS%kjkLm5#L=#1b;D~+{%Evrq3W;5HoEtfH6NYFa8(^$`VJ@0Y>&Z;gTt6F?lvtm zI6Q*)@7gt!86V&(Fv=v7O2CQkgg+54Zwkm=#L5@8HIlKfHN3j{wrTYQd3~hI<lxeQ zKWuW4XqL;hO}rnkgyZv+ZB$!Yj9xHk+r20x>sWltS8$sQ2-xH+JTBdv8y)Fnsr^HP zoAhhCOZ`6&2V$<G7CyvfWf#f4N9#;VVWos$!TpH2gtE0~J;htN$?54e%R4&`=w*`n 
z+5P*dQSWb`Jn>Ikl_APXb}xFC{T4gwRAk+(S;HS@X^a{^I$>d??SdgkvI_OH&ubj+ zbGRbtpVJyAl~vY;C?C-|;W02L?c<(}BOl72>?J*}U5!)IX6w6ej%}UTt^ec!B{kz8 zvVA?JNyawVM7AZ4g!!@uvKT!WR0dGbw{d<JE|!yFBd}ATIC<CFx$FQ>X>60_Dz>zd zs03m@dvQJONm~y{(hkVF8LZFiEg9xp*rOg50gV`u`6qb;<56DCMFi1vDSA<!_+8k( z012`~4vg(0(VYEkxMOE6ar->%B9R}6IcG$DfFe*@KIw%-GR%*a0{&-dSXdl;vOrR% zcWT#7m_TCQ;fX9eqe-?c^By#PuaCB{Ej^Tx{$ZiGpdKv~7d)l`zuXz`gSYA+M`?+N zq7!DDUeX*V;Qb9YZ9JYorR^U5<+>YOe0T9y9zGl1KT=ch)CrN2lG;0QrYx>G8_C!e zGN|Q*C0}mm(Wm%=%^qo;5W|Q<yCn93cp2Mp0{M>tl@7J<F4@wr+~?euw|{n)SdDLS zaW~#cW_nD^eUVfi(n%y7NiU4sz%R1ud+9*CZF^<kCLkOvM<=6>RETLbwAQQEafA}X z<8|c)Gm0PKjgq1#DO+FblzK6b>Jwj6q9WgN$(`1Ju3BQ-<{>aA>4I?|uZKqX$SZLn z%7<o$ADH|&X|E8;^EmTp<cA{jlonO-?Cb**f0M}gA3d53Eu=6NuuGZcB_A8Hy2TY< z#q;=aOTxFOw6JxUIMY9*(9T4~UwSD&sdPZEa-78QK2&v>FD|81N(S)(9z}@Zaf)?v zaY=bHMItc?!m8rp^1m38M?NIpdYW*8fvVBjNz5`|uppC>=Y~Z;ZW3+3$k$hJejpNI zWvah>7{?g$kqQ~;W5PamYUlA~9K}|mwwJUp*azD`SueiWsCe)Z8$1E>Cc2RH#h8op zh+4z`%D)NY`;tNLdsM!!lmobM6D8+Q2unP}xd-*b%wrBsoF-dyv#-5wUlRL$V>?P@ zK4DO)*Vq2am(${hC6bNxq@JA>*fX_GuFxm2SY&Dnj>_~s2$`a-*Dgpaj)3WWY}H~2 zMp6CJ(ZEF%SiRv^8_8{t7Qg%)UFw1^uC89v1bbxgXSw{cQf%z5G&ec7JOQR`%8HdQ zk-Qc&Rqz`QLr_y&1q?^}erRlW$%%JH$};)ALST<DuU2&IC;1!)<wIMF)B^%!=hi7N z)>Ffbn4&YUI0%%SvP5#?9>Trd(?g|~A|OMyw3kQEq{=HaE}^U6@}>F}zM$|1*0fuY z&+eNjBRSEjFKCzzD7_S5z~LsAi<CkJw}115<I*STEa2cf$X{v?=v3}3ObtB(RRax> zIF4hrOD`q6BBkFu@ps8gN3<xYntI8|V2S{L4)rs)aw@-}UAQ+m*j_ylmhBaD0V0Y` zV=cz7eKJmAs;=&B&g#>~=d*fC#=m;=hCPV*<fG0`j5I((nT)X<*v&eO_fQ?F;%^5= z&WC8adi>BKCHn!Ao(P0#cvQ<DYlenuTAhey%HzX~^!QU%?IQQa&01Po3RCH(70bh~ zU0VLMsZ(_eACt`hpl-7&2lg#a5N|#aF439!L!Q#0To+6%#ZRAfWk4r--CjEQHVMBy z9(>EI=dI#mCl+ugFC8g0@_&4Pl6tf`YXNSh@ngHRPNXs8g6T!o7hk{VV5+N_6680Y zU-sDHib*Ibmy8a~r8SucE5|B?E>PxQw^i`;^}T~qCox8$<%2wiF@Z&$lBz-m{Jk*4 ze)7aZeAHCZC#*7=yeXrDM{{3L&4$5kCATAoT^*N`n~UvjAu%{?qP;*$ukIEUh<?-j zdVIEJ<>g0;TlbdqvAMy$*+xCkQ<_E<TMqmu-r@@1;lwnj5)d&;*foQ?Iw`mz$|9fO zjuBlMV2t|3OPab3C4j?XX;`nYEp=l)Cx#sp&{TtevnBz6Bm?`XEzJOkjqs#^D~1Ar zDPPY1CG8)$E!6I1WZx|@f}QbATt8OxS2=kXx6F*jvc$3yh;o6R$Mg7Tkh&{mAkt<i z(?oF(hOzU!ckYzdp^ciInwOcWCvJD3f0%3E0Ru)nU9%A&b`=fh)*0fRo7NC@?BdO( zIM2KYLLlbb$E7@`0e1Roti0C}Z(xT$GyLOJb9+HMM;JoNP-*ze6Jg|BAP^KHv}H-x zoHUIlrBYe5r{DPfm)m;ihZd9zz4#jH-WO32m5_aIG(5Ceo(i!rjF6gZXvhQpho^YR zQHA%`mQtVw0(k^1RP@~G?cGP!N}E{?PeLTd8<h@-4oAYbjL!Wt3+9CvfQ&Nxi|~F{ zii#MnK{7g&jAEHqgjEuJ!G;}eB>{)oFtbR#q?V`$(w*Y;_MQT_DWVBq@dS9hqLmt* z6tJYXEWhh3ksMtADX0}v8%XU74u)f`6^Gj@uFXS*Q6yBGFKh^pXp+-dc@~&FVe>mF z1+?7-WF$njtp1&}<2h5rAR12T&I)u8z?>DMJxXdBlX7S&G%yW}U_Ii^Y3V<hKara| zr_zweS;K57Vezc9Q%I(mLQY!EgNquKx~P;Ki7Zhe17Bpwl0lBR-X<B!V9oL2Hms+Z zR7ifIBju&@i7+tJ)q-0xlX(rJIWVYSFp*bNRn@R^T%njbPOsH^oE8<P@+)*GE=X8w z$;Cyig;YWfeU)G?r2wr$fAd3@!|m4lI~BJKWFAyMpsf5sA8Gw)>JyzA;RPKP2w?2- zMy4T%J21NTsbeWE$@=DXV>C5g$RkLkTOP%(^9lA|(x73??!7K5?Ufc~BmSLyacKBz zF53DXOeV($2XB-#S(KpMXkpl}BdQ4>uRLxsIZ<Z?jz*DCcWB?9$FE(t9wq<Z?K)uh z)med<&OCwp+~C2x8D&5xI}}vx84uo^*XNO;dE;nb<anFzB>n(x7hBj)dF1raPZ<6} zV}%%>k)t7yaXr~Px=6~FL_w7w{9?tOB@0}*U_m3(xOH{{7#XOU$h|;}_^Ky+IxPpy z#m4HFndlvKcCynk*mwj@Qfb)pT0ub#BcAS+ZxRgf?c1M;<m}NR8Zx|e6dW}qCM5)& zc=Hc1VxH2RbUb`Vdb@*MZX>ZF<1~vg-%Ot{<rFq2@#|Je$rYn&v3Vtu`4hs7A`9(M zK%u!G;i=lY3&-y}tVpO4KFO09Z^gC4Jm%86@z<ez;J)?4j4~NeXtg%Tjvzx)&?p8D zG&i{_0C=8KYk*M6nS;xz;T;$P5b!!u^!&fv-J4+`K_YWXj1P(4R-!C?;GWvEJS|9L zsPF32Z4EaAZ&z{1va}8iU>2Mm50`k@T$2V1AQIkZHpVLcSFc_*k1>rdBs!xfGga}I z?Jn2yWhBk#Fgp^XlE#iTlX~{<-B%_~`wf_M#H6p};4CEQ@K9oU#Qpom3Z5}BAH+;} z=#(UkA9Ln@xyagLZ;25MERL`pWWQoI6^3g`#}J5WFPa{@9=*Bv-|gFbgF3<|VcHA{ z^0KRS;O^-!010Aq&02xn4RkSj$@O0j9s9*lg)>DAwq*}eJsB}cxZtpCDADZi*M!IR zcNPW!0iJ)Tg9qsr`uyVN8Ju9*!*O91uK}Ryb8utO<$B#%!%YmMWF*!FamQZ4<T;@c 
zr=xf80E{M9lP?f)_$Ns~%!XB^ofMGuz>i%aDt-U%BCt8l6o2WNcH#M}E~KO@3iC`% ze6}JPR7gvh;;>-`C036JsXR?2k6}9qMZqj%4xF`X)(l@YLSmxr=GMds#<vus7Rm#; zuM{m`GOtA?y$AwF=r~WF?DA{6m^*<^Y3UAT`RK2D4x_{fp3|QF7mM$u2wi;ZvZ}=D z95w!rw69T7Gj)nER$&ckI#yc9b(ZKqhYx|t{mQoKJai6fA4=@Gs|uqOS@#}aRC4QM zUQ)IbA49ziFB{{RBaZRbCc5V4M`njVCjVIS(KN(xrN$B7fBeLW*+;wXN6Uj;9nY%( z1Eumn_6~w6JDQnAVl=j|q-GN&6Y$A_hE}iC$JMDmN06}f>NR-u`mVpGvubzsx5m%! z%^Uur&~CG~wq)#Izwnts`RqNxZBc<@xpwP6!8p=5OF*q{?hK0t(G9hTzXp1VF7)d= zb_6ma`L2=yCW)Q)EQ%W|=bL|kmCnhC!LvVCk@nr+{hO&cbT&<}M3F2`$_-jz9qnn! zRC=&sW5WiK-+f-riV<V7rKJZhQ<WJ?{}5Fd=3NarRz#A(1DY0^R|~fWFI9cBoz-t} zV3+JmV;@#P+<mJmzDSjXFd3cLcL$=?8V-ZKM-ri*iP$%O+qizcElqp(GpG|<j~0yZ zi7p(-$##V*w%;&kKd!zai$jG?nG%U*@JN6oa2WhD2iDQ$Jc|y~Ksn#j>~c~zb@B0{ zUmC1gUiX!CN{SR;^@SuECaBSOY)RQROpm+E=3g0pbocJv5LW*0;Hc%PUK;i9PGNw# zf7PFW;L2+<U-MfWjw*!;q%#P1hJ4vl61{)Vn^cwm<qx0rx5P1Bs3dN&mhHkwW4^x| z+d%f=9i60cu!E3&+~sFx2NX%`>6eFutwo?s>4ew1b$f>n^2owM;Q$+$7Yz_B2Zm!M z`Xo(_G^W#N*6|7Pu$=yfvX-kKr<`r2N<Y^O%$P`9FMAb*lD32?#hS+m-DG_H<}fd@ zUZO^%F4D5O+i%;ofs-z_wjz_Ri2Jx9iAWD5qkh3d!?7+QJ|1h5tyWeih<!(o9^I4| zklejZ;le3X{@!$B90VVskdp+<`)4cUJf+c6@oPsZlXA@#<J(9~{QAV(&pAF}G_l91 ze>GT4n4%f)sAUBZj1}ceSx-dg!|xPv?h)9`sQ-EHKDYR=i{?j@%v&{!{Qs9fxV%a+ z!v_qe@|_^5n;PL}^*0Y&zKM?)Q_87Bq;7cS1Nji}Cxj$5-Q5Ewnb$9j=O8-9NO-&F z!VY!4)`?a3BLoc<>HV16?XO+sj5o{{Yt62RaEB-zR;YpMTdxsboh1?7DdQ1h!5HWz zvnq~F51kucfaIg{al*%EN1wLw$hdh^iMAIq7&+hVkCh}6{cQb8-cIs*CD0WN|NL8} z$!t;+1bB2&zvqkUB?Rp8^&M_f-cP%39`S~Z+kq-3@PfsqejwL@iVylaUXZwprS=^= zHcUi=ZP%>m^4YhYmb~da@EJ2^c7uH&DCp|$yf`kF65MKc@xf`s@Vw<k-Sd{fYr`ZS z`F+S?5jCA!L%oW_fFmDE37Ve9O`Q0cKhQExt)!?ZDKSyEorR?(1U2>U<HwHiz3GkS z&-@J?#MIJ2i7{u+Ah^LSrI~HRqY(Tl;Tr*k@`lc%B|UHOh-iPLU<V>C&kOCa>uFm_ zR?hC?1Zq)S$(G39hI~jpd&B*~xuIXSAj8SOG8G}MzVots`6W#O4N)(*#OeF}kN?DO zC>c6-Y(oBkwioh6&UNRA=qHv!WETSkMK^<PgY0ES9X=_S7-M)cxcs!Rp@3+P>z;4~ zyjOybm?u5RlS~0W+M%nT^Dr*0d&E!FIi)U&>(!hz-QrOOf;`m&ONDTBU^2S#B)CWY z;hLH$XI+nS8+b9(?BJVC>(`&7wE?IOaO930J1*LQTtGdb5W2i;o2?dDoN%J!e|_T~ z!FXeX`QU*mz}e%SHc7-lN2<oW3{%*~*hc8idP3{rt;XU^9L9-LMm;dM*%`O5<%c|? zUXdI?v&eq!F6aB2AwBCI9pmnq!SZ@|wXh>1H@8v`WTM*QM;X2(pSigs9Zto?#of;^ zi7w=9=RzW1Mzp+}B&?7FgAO&HaTk7I5|Be>e?o*ub?a1matDmJYT);bdNfdn65c`Q z(t)a?iD&eBw$LNA-1<MJ85(vc+FD{cofP~aH}@kTM`%Nmg8T4shjzDcDR{3>K}UK& z!`H`0ARt^R&ZQ%d4^vu;^$VjnXysC5dYwd)a}NT3w45(o6r@zzNd%Z;B-|_KyJm}B zHxuhcFfJ_+V6t@#uOwpa?&-^PbpE-YF}imbPST<ESDRJ*DX=ZJ6X*ZfRSygzjZNKz zNxAkS0A$zBoohaQ+DiJ%02rH9tA>)DoHbNBkeQzTC$5ARcPSTKMBd6?Ig~9#dLDnY zP+kIJK23EVtE4oYFhbQGE!v|c$THwiIKhx5Mu4wdzs2%A5*I`yw#l4Gf*@5}`b$Mw zTaRJ%RbK&bg=_6Gk7)v9FhQ}pHe7rgpVl#|5=k~S8QDnhRjR!a+cDvFSO3@L<wIJI zespyUBwhI7!a@qLV#v9e4RCdI^umNn$~gV{^?Ql!`XMoSmZk1=WR$Xu59rhxt6i!v zulz;oxkaXhgoUp)Bh6X8`po@|Nq9W)S?AKpi_k;2o*ab#S2x8IQL$LkK@90eSb7MO zV17hm;<$M8H5zi+4qI4RSD9Y~J$T5Tlv7eR_s2s?yQ(;N@XVG)&AMiTv>>vm=1{ml zbO`<}!;vEf(hkn{vw5~q5@7%|gv^?7e0JQ!zFoGf2Zs~P(MP+ITV<&UJD{p@Ki*th z?dKC0%L(9i?xA!mOb0h~>}U(sXcT$o`$c|7#B|sdT=Eh?XIv-SH0ZT4-yG3a!1#@) z^}|%+a2JOb&$}YvhD6dVhFdc)`IJ(n(Adxq@m`f^3?bfnT98Axo)%DYcf|n<A)C7B zDqjqox5IIWe->E~UTUAB<F2S|J#g*TEgM2VnW(I^Ahx!&j*{iV?gL!8*VnA5YH5K| z4BxBL+Y{-HBi$3qmBi?uJ1l1k9POgGhvl1Q`1Ko-Z$x$fElO0==&s$m(RD`mp<SUc zH&PtGhBJ}W){^kpsiUyb&>ZmNaaS4RG}ZF8_{4Fzkny<BoT1=#N&p2ka)!z>{tV|8 zFXJQhwy=DU&rU$ZMMV3<8SykS;jp}wq~l9gt@>6a_(xY}!_JwUL?Q;Dp)@=HZit9U z`$JlXm`JkbvFVM|eKBbU2+&!<TE@4CFW2%dE#R42zoKR;5F8R=VQ5}ep}Iu*;E^Mj z-J9L`eCqY=S=pk#($Xgi?MBPZqWu11vL-@QutJ09;guw&jP{Y~-FyDpmJc^N`QyV? 
zd@`3$pc$_<=+M?o&UfxlwEzE&j1*^xhn{5xr-Zn`o;FBMe{vh$4!t}Sv8D8x9bQmT z$Hi&L`F8tAYSX!6$GnvqVz3~#(ZS(R=vDRgSD>}dIg7zhvO`TdQ4h{p%e$U1_z^ce z^!M-ExBlJzA(7tBWLAO@ns|nT1Jq~Sl2y{UQd?CO@ls?G+Iq-2H7LWJ?I)VsfhYCv z3TKuWOAG@IlnMYPbxw^G*FjNc%j1%hok0!kMyP5Ynf~V2#2vEcsM`nNCOMk>QAM5S zCC-|xc0U%vz&$TtE^mo1U4H!w+3k-&9NU5)TwCMU02|(xxV47@-+-vXcL4#tP1D*N z3WrG)T#Mr#3fV1=Vk+k!K742vnzu5(7@5%g1q=K?C{pt<jduMUSXNd>ktl>icSRSm z5)Te-;b{4WZe*zN?q0K=KzsC@NW-hZ3TM4}#@ZKHK$7#1fnm(#wo>g4axQ5tFlkTI z!wsW#BSSbKJV;MARxyD-xM;ox-2jaX!A<czqNh)v3Z<N<p^880{875a0+Un3olR<; zN6aB}mpq<`cS>n3V{vGJIhRhQh5G-{(d#LzFdIu{yr}ICq?rB)9|UU3FCn~NVe6$e zW{hcg0m%e&!eoZ@hSwXVp<x`F$8?wE%a;#*oY_$E&>l#_x@`pZ;(NT{%3x9a4r-}b z@dv91JV)vuosT9G?}&Og{Am60Kayw{f0+Y>K>cGIdc0GBE1fXGJiLHemuln3C*3ZY z?mK#DIZ(BAMt~gJH(QM(;26^<%~#m>sT?;Xg-%<x7~h-8XGvDHRv6gQLYj@4#=#?r z+%Yjoz^DJ~w{IWOpg~w;)vz8-T0k&~f*9WlJv1-wAsR>8!fKvb>W7`@E6Mpzk1k|A z=t}|gm*F*MsI2m=r6`#HbK6CoZCPFj7+0wFreMp*v>ac5aYX;~4@Xq%t%W-yb0nr! zY`RQ`RnMenyHitL$q?fO)q+&v;7|o|v)2CHLw46!YA6jFBr|5cFur-exf@yz9!G1q z<SB$1E%;(97LR1_J~qBnZN#EDG4h6Sl0NTafDmABHVVp8aLfulQCtrg^<zbzZ~{24 zH89xCSxBJb)UkZ8iU<jM1z?8t3nm`P`!{k-r>Q@^;yMxCBJiSepk!D!KP!yMiBzCA zR&CG!j#OuLLiQ?(uVJXzI#pgG>G=$T!KlYWD^*Y;<S)y=pkSWeb8afT8bof|@(||l zg#Ma7b!r#bbxsEiT`A~z*j4h&Ehl6%yeYfrze3W;E1yr2&RKNi9(Y7~{eWF($Dz4l znStX__`o*Z_zedyglDV+)*I2?zi$sNu<!xra4;#qplv;kB1PQb=E;p+JIrx}q)>!X z#=y)D{{|H#d{vjn!32;EYsk%F>w$h-@2Iimk0aV648PSwM-r(W@iOZ0d@gzHyXLo# z&V@!tuWAW0+VT4s+ub@G2!gCUPyjpt#T<9*(7Y`$1e`nEdS(Bta4iK7714c&;)!Ss z3KHf#`%>XAS{5Xok>nfthdlWr+M~5wX1;?(uKwRBq1T-Mi4sQN5rOfbY3pD&F)xs# zhEAT-P)3THjO7`N$1HG^84}E2cH8oTBZuTq>Kq7;i;e%|MLX4z)E5~`bQp9&Fhd|P zpag;w6)vY0o;AY4!AyrwIWghqp;$GaRZaqO;7H<CDv7A>&ZQH(nePJ{2ESHtkY|!Z zOJFC7a~$X#oFl7Un7hW-1~VWvf?9wTaN%wta-k;uf>_LHREG*(SNH6N3(*1hB{|dn z8#Giuc4wpj%h)cym=bUZ(w7a20^UU_{@4T(MF!!vWSN3Ro4Z^sM2BZaA~J4VBFzjM z!ot8%1xD;)MV(MhMEV&;8WR)5haySF*i3pUE%sn0w;RiU85sT(LNpL-r~#$kE+PBE zNpK$Q5ETe!N3}k%_!&PQ*ok335eiYc7Ch_ofAL-(Iw27)Nlb(}6{lWua^~+x(CrN_ zVJq+|j9Xm#`0?ZH>}V$0U_%AB0?^ahiBydI`iOxN;x(A?%gM=RYh5I=(3>kDLY-3G zc!uaer`6Lsghiunbm8QNCmX-t5*s68<&PCD$r2gNH7lS#APg;@8;6FaB^s<hIW)Fy zgCjI)eQb)5Uyi)7&Q7MoiheCF#i;y2dV1)fu*;0mMZ4cgfhNByD%U1kEF`k!KX<jf z=9YUaH72KI<DxZg-2BiyF$;n3()y%^N&h&(JvLak<a#4VuZQ&{>7xmL@q1nDScrz* zyNjkfe|?k8xvU|pAb<j0WGgk~$lVbg6LL(W(n;6xBY1(S8269cgPz{laJED?W4U^N z3i6!qbT}#~gxoF>C=2{Sj~n$C&WQ7a#G)D2@8_lX(6{@)LmMItDwHUqM9X&fSkMQR zn^yCeISLufbQIO2gY{L=yg7t(DK*+3ME(4GO`8Y_k^?qkVfm&B$48)myj|kqA>mNS zTgd@}3SgKBaT47h2ZO{o>i>ynJW?2aQiS@Gq9mjqUzmQY!|$pP5Nc^%Mat1h0firq z?L(E6JT_D@un^*8MtA|zbC~^xXO=4dQBWDwC8(hRBU_(`;V(aKf~1J*A1YH>&L(2x zd)1bs>FRZ*wc6VMu2~o2Llead1<x(zVTaMn8y2V2=!NaXpPnd?($V?{{MFW4@n4ML zwC$SVa(f6N+8Sf?Rx)0;JuKjY?jl{>()Q=Jef^w#gG7p*irEcq<Byg=&2bt5Dd>uT z1Mxprvz{oypiE1yENL6RZrsDG;-QL+58_E&LX7&aRS$%X-;9~0LWXEPp~&mw-SLDB zaEfRZ{jdTR!Nm!6=@5$~=cdsg@a-f;Z+#R8|8pZYF8_a%5MC^{i`+*-*5!+-qP(nm zmkmHNKG<^$Zmm%HGn3suu*3Nz>FfC4Q>aqEE8C{CBk>=E1o#3lQ+l41GBeBwdQD&D zX%zCFmiwYC`}4k-XcO1r#lw^;N#&b>JWF>#aZ%-g1{8kNkT?E))RmRRxPwk!OHrbW z0m|HNm6}3USd`*{%=^%kD5n)NI0eMu$l_7>`d~tCCUuV9FKW$)-KR?S^r99&e4{gO z@W<67j{h?^+uH(TJkI}&{)Tl7EF&={yV`bbFFh_llRktcGBPjA$_&@<Bk6}}>~tV8 zIe8=nEh0#mWsrc-)81~njLC?CCq6vLNc5dYeI+XgI#6lBY6iay*gInb14;8CW*ExK zh82`UJRL(WxI7NWaquDMHDDG>ul2kxEB~4GMijT4mO4v%h}(T*QPnvdEI=Jc63JXC zF3nT%f7Sa1eT@+NBdeDq_@MGB=8XA`Bp_J*jRY36x51wCaqxnmT2P!WU?w$6DFxOO zSCLYk+d+FL4QiZbU_JmiD%!aJ+S}W<eewH)FKO8ICsv9utt?6H+oJ~omil}(OlYr( zIWUw|_KbQYupv1Q-Lf7ccq*rLWq@l*aq-vVjwON6dZD2&=3U{!JZ|{-`U-a!b}gr+ z2!QY3TSTQ(IIev2=J4_3Yiw)|@7-JW;>Elv<J)w`x=8l;?C^&b6od$w@+B*5Y;cP` znYZ%SF1>L|S8O@UC9;42fLGd^9sd8wn(TGW1#r-3aB^PnTu~T3mbM2Fx_`Z(CiddC 
z{rQ%>yNnLR@e4rSh!4+NiL`yvOOfD|x_w)dbgSJkfxTMMEC7*o(C7VTM@QO&6WE1? z(HIXLXx)Xz8b4S5A#9I^lBYmZ(Y~yvcJ9#<iqe!BjCz>G2#I*gpuwVhg$h=%=Je3K zVf6p88Nz1$#tj3O^1!gvOXCnyP`LK^akB9a{BFP*5dS8T#sjq6EMiQ}3}>H^qYtW> zAuRxH_6ZAP*ZaD<y26zj1WMK-XK(9iVQBj^R?m-UO%7m_;g(UXwXFU+_f3pKMxdOh zhSZOTIwI3c2}dkAuGvS&#B|B7AmYtWWP<HS3Z=btgzA8T^pIzwW+d<rBao2sqN4Yd zMq>*Kk4-zFT}LZiBCsB4&S~;TLXWwT#*lRP?BbzgY8o02x9-fWpYTJ{nqUC<35rnk zvd)FNVMbrRePg%6!xLmVRTVr*S?uB!-IIu7!xmn`>iNu*CZ49G=RiU))f1pmf_|7F zIVt2mizSm{!w*(ev;-$}>Hw9v;=TF_;PO2|c0yi`b|B-uwtp?qI(AX&R|tq{L2<Av zq~IV(Mn}DR{``-bW5kD4|ID&HPI4S`7*G-vwF-KY32x)ZxBvB5q5+stxNBXEQ^%zf zc{-Qg5Lq)Bcy~--b~sFm28e@7R3$15J3Rmx#WQ@zjvdrZjN42PJ-05ZD$%*U@oCsa z1W-m&&nhY&L-?X)p!J`oDwzd)YTWILFP2j?lIb+<M#e8u!eGVB%#2H-b|#AFtf_To z(Bo)~2|ApM*q$x#(h;IlHDT*S51;y~l&1o;cln90ss<snz?}o+BnW1hj@)D8mq9aD zyo50XC-l*BOA&P4#zPsb3@|teCWIEm1E-FN*#RACMWH>Fv<<<on0f&Tau7+|#`Wxd z-nAcA=&fZdnvW1F4Qxk(ZEydP#Y%tM)z8m&5mX?|h4@3#d?Tg5hcvdqh;s(2s-C6e znOX*xPLUT6i{a1W<zXaR%1PC_h|<hXKff!Gw*|p9NpFI_ao~^`G}MMWsc*Y>mN5va zi-d4HFRuLwJAx|$MMOFGAPKV?L)+}yjIRR)F>u!_>dVMotsKY*ICI95kY;_~JXtWe zH4c(FhJiH>KR3E^G7S#scgu--TeE6NmDtA6(YFbb7XM5d#`L2d@^165xNODnW}hWv z^2qYsUkBR2cFeEuepZ?=#}=t%>oWF%+WW5en>TI3(V>cX8su8PV0?a?y|=D5)x|Mf z59YMzg5OIaYf4K?8GZxJQdG1c)_K9?cZiJexq?gLCFC!++C<5c!+{)#_f#5Cg2*J* z?a;?^G2tU;Nc#2bFZHDe&QTlqO0u{5z5ei@RbYGtLYW56REycK%FB}wTZxF7UF0bZ zRl?T|Mj%As>KuGAA>kt)PSZon!KieK!3Awi?MI@>&Y(Ph3*cMl<cf&)S&e>JwZGY- zK=$IOT+Kh1X-fRdsA>p)3wJJ;>3|3TG5;fY5G7j+#{=WoE{TbW6ztyCeT5uE-`=ZI zSe%d-^RCNxcaV6zvMTwltmDfg2?amM<Ft8w*xtaI>`$IR6-#bChfi>|COL!nmvqMy z^P&sw(fTzvmNo-NCcnyTV@y@{*n*@DuaGrx;tidI4_V`bh4zA8rQET1Y+nzbb*zts z5~o!_Ag^2iEaDGq00$|K`mgP*g21;)<Uz<SVa;VH_u=n<{~h0{;mcG(GGkwVkm!f0 zzxl7bjD?FU>YwUK;jp(*ZB-~D7--}F3{adyPcz<fXR0Pvaz_Y-uE4VF(Mle6jMW-S zs|K|Jlvw+MY#*H!bcS>7TLpVn$tz_oEq91i?kkg#4hD6^+Jb&z64(VY@@WSG4NkNA z2n63)F8QNaDwOTrU-*s%(x*^oiLeexXs3K3EAT%zQ@aShG3oMUIp3Q;<F@jM$>M1A zFpUajlmjH~7S7)CCC~ws9Ul?rKo}>!X6593f4*yE&tdaX0=bq40@ASZegmym{cF3b zT7d)27HxgSl#u+t*!puOe_2k|?t9Y!?M$wa?6nEdLPh#=tpV#7h1ufp+5NX}Ya=mS zqiWxws#e9RWcH7QRkA{~4hSap?aUNxzR4|PA>s6X6=s=a<wgtdmXx?5w?o!5px3K5 z2k&53GM?DRn$qo7XjjL1fd8EGDrLzWtU;{rotVO}%j!_IA^MGM*~beiEhJH=*rN!9 zNB!}K&gD1LIlja<!r@T}i40$<*>^DXQ(608mz99v6SQ=--<$|TWy)-P2<n4$@^2!J zaC-;Pi5gh--|Q*wtNHom-nD0meIlHo!ya$l&gyRh6n2dWjsp%H_|r3jc{)IHk?ukW ztq>Flu?9_DM+Mq}Xa?9p*NsrO`8#qu0X>|0ofQx(G0$KUu53b2Xy_#_^>>w}|8=Qj zt<pReyEn#n=@4Zc#R?wH1@gvT!yj|CtaJPq57n<mNr_&j)TPF^b{%Mmc>U_vPm-)` z4jQoz7=)+b3FITCC=Q*`jn7g(PQ)TE>sLJ4L_;_=$?B@Ay%S|T?8Mv+Xc_@NMPGpE znMiFYIe>;*V$xkiqwpvH?h``?qz!D_E!QQid<3<Ti-*dt2|E-6(}u1hY5DCtclxVZ zc~Cvpp-Y&#O25fZPO-S#MdF3`g2<xys7@{NM4lMXU0hn^kEFLaNOj3|(Q1vDWgp!a z$h~OFMzxrqE`EYcvP73mZG6t-HpUmLkN>+Z`oLU#`TSY;F^jsSq{J)wkv<RB@T!|D z^AUZr5ov2}sVbxB+#-x*2{*GLOrK=8IlG*I-y3p@h36&BZ#$(<Mtwz9x+#JEGdn<a zO0m!g_Zl{TAmZY}N8Gos*E-1SJ`Q%RFQOd$>P|a}w(Rwk{@P(f;<irwng1Tl<LJWI zV3_P0P;R)f%22Iz=mq5ALp)Krl2F)ist?s_C&@-%w8;5GJiJjy1!zNTI;);+%w^%E z9dJS5$`f=#WTF!?A5f3XS&`ywp~aGox*>+*pHbN#;5?U@NO)5IXDt3bHw;a<LL0Pn zzw11D^hg+gBevj(H*D<VdlRuGe3yyTv-0w-82feW*0MxO00EptEkb)D8gW{@_pYkU z!ND%9XB{cH5f1%NvQuF1cW<32S)kdNa{LGKXHx>uF{``3QiEzUQZD?&viU%(A&}nC z{#()QDojNd$*!?ku|kmsglW`7qYK{=9GE~(63h@wA%GqT8zQRPil1Wv;LV-+YU)l| z|K45WNkYtCne^sVMYWJUFzf?`_xwXxO~sqLVojuw!K2ao_I?!a_PM#Yub894xWbMK z?5B9wZw_HbL-_QbkUMouCn5st>}0cwLJs0ep_jdo4%A5r+E8-0gD6q*iVEVenHg1r zD8-f5S0Qd+k28rogIE_XT&R%IYq5($<7sJ+FQ4CE$UWsA0L4Yo<AvF0nvL|sgbY%S zz~(uQDM4tKhwv07IluG8s>K&uRSLY~b;9Qhm=`&`qJk%ll$<(D1`)b)9asK=xI|mV zn&Mg^L+I1X<GoO*l4rnB`|Hd0?++30EN-76M{D{0$OD<nH{ibM`t>5R2cdvh{gaIo z;H{qf_{+_fkK<eIr|Fl1_`>2vmp2V5TqsdL!WJZ_80m>^=XVZv`UT2LF{3ze{P=Ls 
z0IW!^Uhgn4lu%o)H4?px{Mfa}L>G(d$e-`M;OALjk_puM%-WGs`1QAbpR-3^6*BuU z2BV1?Oi04<W2ViV2s_g8kF@;qI;R-Zp`I(zA@RM*hF<Z#Pl_>>3;-py*4YVssDmvP zTKItW9N;<nMXx`c`3XE##CQjKU|N<tK@7FDWFRFlV&p6Z^cD4Z{n3>)|1z8zxrE+h zB+Tu_mzjutEMCyMk12p#twERn3y<Y>`09UAWrR8{sKNX8DR}l~QZNu+^a8bXNZ?#G zSYayDl-Ta#%`cK5FuAdl0tgI~#}V|mNbBrm-~!!N3ZB=j9^vjMZr5IUKEpbq%YS`_ z90nRRkNqj)!!g=kTAFKEVWlYB6{b?$Yi;$IuVL%!qL4wFe8y1uW!H9+?kJnN$-0ld z<mZtTs*4+A*yr_!PL*o}D+nZ4S;GCM8ZWCOFcR$@EMAXheLcNTOlYkt`<oGA>{i4~ z-0}G*A>|+^ETn#ymPYtDQgrAvV=MEVEKFx7P*Ydll#fvoAG09xzwUoj$l3KnH?M1x z<^oG2G6$WVBw<u^dFp2k@nSnXri4eamHxI+Jh+Fif%O$Vdy=i3fmV0*N88){jGo4A zsLLQD>aKHf&EtCDvAE!qw36Jz^i3fH*^(P0qiHni8O~hw?Cha<dR)8q3`z>|2@%Jm znVGDY!^3S@WH=b`Y?89aP^l6h^!k5(5E<OzF{cU@+oU}qL&|*m)KcnGoO>g>DznOQ z9yh#weP_*{-BZVN`}XRgE)wh83_AUKayDi-qN7hd$PVH`#v1?EgKP_)QCNepZy{Y} zhQs{z`Fxtd&`?0BvU2A4JKaf=vzYGpFyHGRd$yHW*@E*_-4%oXJ<w#=F<nVvDw!}j z&#Sv_B{znCe;QJ|ul)&ix^$-&?{%t@SZ(j5b)u8jd739Q#OAxYKlicik3f?41YPIo z;lpQ%c0AcmUR>&lExD5#j$OR=;nd$*j+EXNJSkXle6+lK4F&}pH})_<?a)&M_8PP5 zf5UA+^7UO7!(2M29UE<%pK<Nl6vN0iDwcFLLjOyj#1CKeA`agAKbsRJC(a9OV{&la zzu4w=C59{q#~`#5?R{I?`s)h~uV<pmm-8gr*+!rD_(isrjOvAsc8yGvJVoN{Yn-em zbFcsR#Q;bb7BhbOFwJh7fwgM5=rL4uh-1&Re@=yk>GkDH1SDPG6M4Pc2zYt%d`zZw z{1?GkRf>#a4T;Qc(J{&tKa*mb40t!IxdAGY2$fD+r}?jBnDqbmH#;5PduDFv_7puU zQ;yHDEmI*75g6@4AT?+Wib~$`tg?Y@&Q$KJd}E%)@*HC8x3|b@Uf<rAa9_W=!!JSb zbqk8EqIhcBg|;P&uG<dpp?iC%n%bwltADq2Dw@d}{*Ukb&p&N#w`CVda*i2&`50XZ zK&@74Gfe;4)p24SSv<QXGq1hGqy5s7-%dm+Kl}GQ_E15<6<Vf+fwkDhl3|uhm!7QM zxE1)lr|&2x9A1|%^$(1(IQX)gsQu0~Y`v6d(6E>LTvEefj>qhF{yg1VLW%f4L!+OW z>3{ypWB*SU=0ATWIVkgUH~;7FPMrG5dj03GB#J*pUjP1;@{pg|yd=l6Rpu^~l_#da zO(df=q76xSF$ra5dJ$>f7v`Dm*uH(t%6n-g&NAHte`*0rN(u|((VQdvGG`;fvSv@x zq>pX}v=9vs2zJ3-@%RkZ*Sk@Fpq%rqDggP9&}hJv;H?cF<VP@hd_x`z_d8U%5Mn?J z0X)5`VBduc9|I`(5z&FA7-&w>qkPk4qytdu%znK?@%ql47)r|Y9TyXwW+~0nYm4eu z3`C$10k3xnIpl^7cKN3=hsLFxr*#5)pnDU>-cLkU#IGbLCF!~QkXt|*-?36XSS7@6 z%8O}b7%<xIAX?<q)KrFWlk#bOeH)5%#*SMU2-9kFj*=~+Lb7Ec2okM@xR@;JgQ-q4 z%{}z}oPe5ZWheLoh{OW9wNzY4a+==sLEx)J^%r@Vle04wL_`-(Tel*i<&16MY*XAl z_ht17A}pLXgW={^msh4|HCC7L&oFefomnkgHV;)iwiGZ<0I(fms7q>f)==O%x0M<N zL)D&wD-i<C-CqNp%2xUcC-I3%tqu;CkdlH5c?aQdfx(jcFK%)?H+Itp!vCqO-EsFI zd6Vke6`%cQ9ADY%uH(p!bb1^eS(KhQg&#gdbaoCQ{i~pq@2xs66v%5J3G(C5aFX3V z-d>$w*5OlpQd4Z97%0KflXVf77pK#p<vHX0Gnap*LiOzJ+s}y=VN`zwTG?<P(xfKw zlW-~D@{-=xUHu)<^lorpXchE1rS0<H`}7&oSg>pN?t^q48&_(T<E&+wSkW<V;W%qH z-!unbMz7?V(Z`U>(SW{i*B#ZR8B(En^9>Ex!EvHq={fN<OAtnMPP3ovmv*>V7qF5Z znUS?V6CMzWHK)SE-C;vmr8fo)#Uh~@zC1|!HJShl(q_ssdnRs+kFNuCtyb+Fv~E9D z5r?<;o^@0Miixqo>Q|R3XS|V>r0ow(&%MS9n6&o#i??s*FJE_VqZkk+)5kQiBF}bB z$l#p4(Bog;M*q^G0}Ze)WNrE@)L@$<=e?m!l>dxkp~Jd$TQ=>6y`CNJoRxr6ixrTX zJaLb6fu}U*IN5aOi_=A1_d;qdL8=QN%|&Ampyt3b&1cmo;9D2Pjok9Kx-6ZqI`5_N zieyc0Mit*_@`Np*&!VE{J*nUPc0IT2JK60}iHx+f@1sBD=ISkNlRWpn<xa1R5?@P$ zrE3?~K3KWb+uP@6iSNx4hYzU(mCik!|IeeLZB<;Clxn@+`ueHO-?Fj+?cGYXs>kPy zcYa-LdBOQZ&S;zG5Uc@}#-fXbJB19)&h8x+Y5DzYNcI@>n4Pn+8UBD9x60l=cy5}` zPAKG=6V*e>%=62r0tapqeK?_kKyO?1Eo4qBvJPghOu?U?w>cGTeKY6##H%h}HlWun zIR1f+{W{pVKJ(=p*`Lr~a@K;RMl=sy^xW3k$Q5Ui)LmY_8Qcof_<*Dp-N0`hJ7(q> z!?_BjgO-~Ma#bYDj-{@SxXHi{z;_YoqCuM(h=_#s3K#O;706chA(eBMO;1#yAOJn- z$JAw7UFIk-7;z~bOf<eBsirTOVGd%@UyttHMYk{A`nEYlg6V$!0^{cr11K$0Tf?<T znAuTh#jSbJps7)X5DZlf4PzG^F&jg+&L`m*Y!PNeiY10bu)p_P&u3I2yq%XcIe0@| z9m?ZuWe=iT44-4AY8+{_0;md;hOo_m37Gr~+7IgVaPs^{Df8oYTR%#$r@UVZ<LxO; zG-3V0C@sJOQc$_1NjhGgIx2_;)QSpcstFA)NrcS3dV(*f-p*5GEbbkOI)n=`j*$DH zvy*sw$+qA%TXMqjSW;X_PpSxx*c3xy!4(aZ%|>cPQRdU6ngnhQJyc?=QP6?T_^os2 zU4DKKD(dh+D@FNoa0!zvoZ**fRGaCzoz=mqkxTF%Nwezt6?B%&(%1L#_7>LbwC+F& zfhQ5)(nh>EAZgCD={q_EPjWt8Xh%<K2_-Ir*aJuU@9037Vkp9r@(#L7mp&(Obm-E1 
z;O66#T_-3j>q@EF-H!m=i<VrcvDCsE$l+$TkB2*<!lSxC@!@;|3xgUc0lj_xyjrwM z0uV`3$Be0O?b!5>eFZyj#z~1CWKd>6sZf#Lr%(*l)=}g$BADWR!*ie)&6=-A9H%<L zIWdpp2d;e7sAsG>(H94*w%J%#q_o55?;kmc;|=ruUrf318S<`!QoljG!=tJ8*?Z(j zF=EDYzx;gFQL9Sp-{$=#hl-B%S#q(R$P(noJw3N<hhE&L=gN?yW97H<HN2$de3drN zk=fcI*v#XPVN1HkXb$XrwY<dr!@?rS%~wm%oYvOV;P|gGfE1?r*$#RAc4*uX7R)=P zyN`=Inu9~3jqPsw0{Zsya{BYd;Q3f&a+asMH5M;7pfQcE@-Q`mD;$o-s6Cl->HA^r zg$0N<hL!Yf=ld0r%4rV7Hsyl{<t5!WFK|N4cO8K=YG1FJtB{8y@5}UH6(z8&^F6qk z8XRCxx9-(*S=2^SJu3Ul00XRgj3f8U7(^O{<=Bi9L|5C3eV!T&9A&#kY1B)Y4ZUUh zhev#P`SPH{r(!t%>>8nend;r0r0{rV0KIBZJG#qging<Mv>#OJxRi6K{5L5~<$fh! z8~n75kv%MWe(V&)^AK_G%gP4&QVn*vJSyc=l@&@Wgg%tML_1&C>?`Tp)G^BYGDH8e zmiEj3P%_wAEn9Y={;37io;8KRQlAeqFPC8-Nbj7+nvX;pemP7Jo#@fLLBC#DRjMy? zAAd*2O^<on=4SKv{Kki|1?4Z@okbQtec$coyU2}MW`jN7-j6H*DdCIH3yEXOr4j!L zg=rJNLW5Ao8WfLZ>c@(TRqEOpLeRK^#(+`pYFdRvQ10}Q_^aP4lQ$i9KZ^A$De<s) zGFik2_D}lC$QYl}{XKG$eTn9%Z$5s0*f}1TUJ9Z7I<&1MZNvPgv6fr5e5Q+p_MP2h z>G~0)p6wXP)LdII2d3pKouvi?zJZnWx*}#LCJfvC4q}38!Yt<(#Z<M1!!g*=V&+<N zXKbH4Rie?gx7r^c9H`UssL;N4QlQ!#P`P0#+v9*7GP65(hJWzT_g8U+QS9@f*h-a@ za|r&GQqP#*#Ok{^_98k@oK~*fZz9&yM0g-iG*u`4AHv=|tjD!`<Gy1fGK9>Mj1`GO zX31DGB~g)3k)cQ^qV`s1c4$zBvK5j<Buz4=0c|C75haxdB{aXEEBtoPd%VZ-_Q&(= zV{hu)eP7pF=Q`K9&gHHjbY$Vc?bPa(HS3X-(G+HCC>TQI(3F90@}T2(*u8i!AMs37 znN#=c7Xdn1!Nr>1mzJ1dEM$@>@VfJrXJcitxN`jR&r?=@tchN6c8>p$r5)@NNHIY( z?-(Gm{xPw>^6#goG_5!r`mcuBjT^Vh$|{0wIA{;)d-%5dP}{EES@1%rgNtj|l|Slh zOI*e}eDe5~`I!d1!-Lj`_ikBDuO37;Cnk=QFI-7p0p&#UsdUl|Uw&Vp+5Y|dai0Qf zr}uZG-FB%aB?Z~^jey0^=(JOR=oKO6I5m1>Le1=Y&^;sqfBGVBuV|-fee$(BV$7K! zPj&eW;c_9q1|K#L^M?q03K)UVT@`(o84HF>>JnbUEcP40h!%>#LWc)vvXho@hz=L* zvm^|qjd9W41AX32<NnI!WM^d9oz#q0(RJ+6w5fDUR~!^#B9wlbt=+!?f4;dM8966X z^A;)qoj;oxiq?%u*9F+ByxLfF#MM=DT9O3^Bn-JSm!++qlW)IeiyhmTpJowoUdg~V zdvqsdcAIs?cEO*P91Q!ZQ&HgN=(p{Na?`x$kbeE{J)9+?ru}|&Zh9%st*$;lCFYLZ zT}Qj@jd0$gxPq*c(Ek_{MOsm#X7yQ_NQ`p+(7&+T;rX);Z@6xQ0an;~zrze5z~hWN zV_rF+6P91I%`W;T-Q7vHvww1~TZxO~(9i=rT#draGk+A^aCkfGr)*HKNV{&vu=0@8 z)J<jUw0TojJ5D)Z*BM722A>kzB;Dc?A$IpWygVhpoX&`vms;jBAg9l>eIvWJrl|V( zy+^lhFDaO=ZDUQzUgw={<)&At|9nZ(Ob5;e#&-He5bi0Rv}~}OKjvQmEfBjt`>&_A ze8cGd>t;aP<$lAQ!eFaNa*7i=ck@3stZQp5+&r4Ebm+>gd5UHBTZdVG!}YdoBf#Z+ z6P3i~Zl=m^+>|wzSp0dq&$AicT`NwAJeXqYU*ucevg3fRt#?u5`1zg&BpCM1EdNs0 zt(G=8S8hUPw+qIhs{zYozw0SK%p*zK+8uW-{-bC6_RG#{GH=s664s#v!S2%uu(+J? zbj9nS9IylOa$y28y6efyn3dj4czA5Xx8w76EvDHSRr~m5*oH5k(Aohjqwj*_?1pyT z)w<ge+b|mBd-pOm_h^*{rG@?2bqU8S@9wmJfAa9fG2>SCIiuA3&Zm=_<+jNsp1zU! zSfDlEc^aPD`p(e$klU?y-Y&%=_E68!X)p>Q8JmeEnaj{_a)=FST)up`=@{5An5Cmn zvluw1_^*`GmMpl<Xd^~?pskCGhRvTdd9=|l&PcjEg+XNd*TlewiNrTOZL7!p%;`Cs zUsb1f8)S5TuCJz5;5T|xLC*^hZkzbyUDK$qmSCLxON#cixX6M`dVDR&NO^cjup6E0 zW-a91I7IgF-<i7Vfc%~N_ouFo-CCh0cc$#yw~^^<oVUFPY~Q?FLoH|6S5s4S-Tj@5 zt9vLZmE`4VH&{FDF?f9Oc_Fro$`OQV#=AOd?L6pzcEb^5${yR=5V37nIuyMme7VaL z`Mj@&pO}XO4-wcQM)PQGa&VaA-=8DV@!79qpUxU`52lQ^_7^|b5*$s<%w`r*<@tWK zt}C?&()W58=Iys&cZ*raDrNe_&$rHcbg}AipZaQp>h6B0pBK{8{X$UYRU-%Z*I<?i zQ8}^#{b=Prz4Iv&Ilu0!iUSAv&Nnv{S^m$Vr!<IC4}0rzxI=&aWBDV@D<<8vv!4c` zJ75HQ4KyyR(+0g=IaF2E26SPgG~{GxB|DM<6c0<FX5=iB-a3Fq@@;)mpB+aW=~0q& zoEdWWdA@UW4?JHTP1_`T1+B@vN5`K7)M-GoxP-1m$bkbrv}boe`@9f^v_Y@eu%}TL zynL~~g9Aq^OUT5~qdc6L4vfCZ9+TBBxZMEktedB_JR<)d<%RV$dJ{RFfr@3#W`H=@ z7p&E7J@^QBPcQuubSrxvdSqaF-1qas+1u}Lvy3M^iJ+e{hE~!9xDxUaaYV>D^35;! 
z9A->}QW7isP8#NKFjA^;NotRJMAOA_ZjhnZkR`7kx(+7m4Av?!vfdFGc=oJcJ7GG@ zU;Fy~J5Bhte>Q2xoYsM*cY)r?GROU=<AjW$_XdIz<$Jv4M`5AWwqd6n)9rSHY<tw3 zJvw;)ml#2^*vn(ey<Qo#?tcA}w7%*e7AR38UDQnZ5l{-OpLMnj8C1z0oVfZN?bwxQ zjX3Jgd54yg>zn&W;`24HX1y32re}<1=zx9G@{N9TbX#>EtP;62)^S-H5R2?Zi!X{c znvkvVsge3yKaM%Kr25&bS6itb`58WTTlkZ8s0--E5)I;K9PR^<N<CljeAJ}q4Ig$@ z{-GCYoU${00KHBr%JamtgnRcG?@b6?7+v~s)a1v*d=1vTRj<y(yu<gN9WbCQk)O+K zbNA;~G`T{tr2(?{*Y`Y6`2fJ7jC=UGkOza}Pf#Xc_BL&hI_&lRVw0X3Nu-HYTW@OO z==krV>vb~@i&<TK(SRu(2N=8niF2_~tDpQ9am4akCZ1Db2_gGG`qz%ON$Xd>;8j9c z7O(hW{8suW9xaZ$R4^^1*)g>LnW|r{GuQq{x~%BvJ8H(d*w52VOPoutQ)3=qi{z)^ zZG8OGn*e~>=d_9;a!#9vegE>M5(M^k-QVzB`|YA5Q9Kkoi^XJbW*_$uLUM8h^BcN* z_>dH?eOUg6D9^%@PVvX4c+{R`V<Yp|c@KTlp$tC+vTtKGXq-xsjJbdB-eFD_oqs-B zBU-R>oz(h8@Nw+kRIno~xR<v7(&fOaB7}D2g^Morb+o8_BALJB-w4)>#F5r%s|_Fa zHY3GMI>|0uu|k-RLUkdNV{YTaf#(mG#fj%yXWcexcs2IvRsc$@1eCops4}OWOCP_Z z2D|8&KSr#jmH06ayLR7({d~uqTfXH~2;BWq`Q_IF5ldcQ^WIs2wO_-RccNoltMZUl zv4_8eZ!`^k=3xUhFng?-fDm_U)B31%h%?v7JU!Om-%Ccy+3cKdbsDv9)VlD_&p9Kg zMFvD*;Ojt@z`QQ?C{LwMaWkR)Gy(<0V4u+su#8ER@k5r2wFvy_eqNQy2RimsQu2Q| zaE568@0JCg3>Fj0)bxShP+$^5axPJlqUfvI*>1ur=1p~<n0O#q7|fE~?7O>B&4@l0 z97XOW;cJWcwrNCijxs=&cjZ%DA7B`K&9n#;I-G$@K7KrF7_1q!bpQBq$8fytwh=ls zcsygyCufUa0|@D>F=Av12LM!px{{JqWB%C#*BBpmVG-VpY3+vXsJMx9KZjC~ba|sT zHoIW=vl5jS^%31}`itv!T-F=0cW<Am`x1+`m|wU(>M~u;;5c46a8<7e{t*x4#LUb% z-h4?E@-_v}iSha5U#}-ESCpTtJXp9UUDJGnvAOZAGWgY`BD#ukW>40cA~k|&c)`5^ zLNExCd$kO*`k1EGzAuRvXL;}<?SpB+C*Kc9cz#N6X7;y|w{L6W+S>PZadFYgJwksm z`O}{W>}fFwMb@4@G<o;a<hYh;F|>6z&3<ZX$&RHIx{#eJ5%~3rCa1EIcvxCw6LHQW z_A96^-{JHsG7XLq^z5m&U^RqMn$<QQILG_Zs^C{|eiY;$i2auT>tyw|L(jPDO&$Em zqjrZ592lN%7PEO>lw$k#Uv_M0x$N3Ue<nRjW~!V#osT!sm+@z@1w8oUY|6NDy*-O# z2G`ly_H`Szslf9o7)fX~y;~SlQu7bkW|(A!V-x*B#!GF%5ueLw_suCTydUtBEh^QF z&Wp`{!I%?I3$7}KQlzfCa`kFq`nkXVd19ghE`<US#sr+LG;dxwzetMXfiJpupv1RF zD#m4GPqI0gdBCzV`*u(A($uKH7l`qwQD>fI6367GbCko$QxHa^y1*g<z<vH2dAP^Y zxD%89gQajqgCd2x(mDsihu3Fm?fOM|NBR7g%4|2--)RsB5);VdPmTd>lUtlzcyUpC zB_-jnjfvoSqYJ!FKYt})=>zR0AJMqIGzi=9UfE}WtTJyTr~jr|&+d&_c2ng#B|X}R zhw96DZDN#6$n4g=v0Aq-lUNn39(Uh*b3Zq+n}fM5+#RDGf-4X6n4<PO7nuOyJPT%Y zIfW>+n?|MYTO=K`kSiovno`DqLyp#*)4Oi7SZ>XL4_=+$JUXvkzPkm}Fw~%{taZGV zdOei!b#8P2_>K|B*bp?VZX>dxvi;b*@tM57DU;tRs|>4=rf^0+ZfUQe(64s0<BCTP zwvFKXQ$#Ef-ZP8%wPT$tHP0NTPc(dDhN!)L<e3f;on-6gN~6}lqgK}4L*n=CHsE=$ zI3Nn)H_gOX6z&&{lAKZD<+LI`r6Epy^p&acC9Ba3(U{cBDyEwjwwPAGme$HJ!DS_M zg>_caid%5O=x}#hyaSdnS?u*D>&AvNm2srTX|fDpGb>w~&~UlhN0=@F1qR0<l_b~* zp-a~VxgS^h8jP6gw>F&~udQ%pKGUt&I7f>C-5%FuPb#;GN<4LU=S%gYVW$({gzY*& zl~_xPwm)471cuFCqnYsP6(={#uo{w32x+yJ;Q;(~vm+|s^K-LLHcjc&2xUOZ?rzg# zKm@E9xG+u8n+2`-oM=D(_O`xm;p%($?^9EaKeG(V6>EZyLZJgFI_`U!8k_kPL;mbb zf?MYkB8)4G$3!(0_`2gcp%W5xQ;`@_3DWQOeFwr$!w(P+-PFy2&h-f)2i)W-&$ZHp zSKx@%&o0`eF{GZ;!HqaQVcWOCXY{U_hB}nI<FVxCb$Z8u;3+0Y`Y-iTXy5(`q$+xJ z?D;=z>yl$8sFscAnvKp-unC_vI<wQX=Yc_1b=@x;^y)RdYwP?=FIM+SY1hs=^6v>% ziCJ0Zt0}vN4;U~Ya~ZVjw$%RCGd)$V;8WzK-`{P9zCKFP6|V|%XkQqc58raft);a! 
zw2eIs_E(SZd({>O{2yL$?~ccX^Sd3$F<}X3jHMPBh9L9`wbjH0Ob|KuqOegF?kHmi zFtUcTHGUjE^6z6xZVqCjlW$8*TA0dLj$l|szGFu;P~8qrD?f{!hEDM<?Ii=K=Pv+} zP@WHX%{*`4Fs0AM#S_>Ae^drR^WoQ6qYb+mauMHd;o2rv<F07UEBER=aJE=0%SfZ= zD>R2%`Sda97)%F|y2%rbSK;}>TT7d5PZxvgy`?hX1oq);f-qFDKPj+z<bYAM8s6Q} z>Pug8-+==LG2>Yr0sC&qaD6fowc*9gHeaD3_H@FInje%l>5maFHY?wBqR+<NE#fhh zB&K&(GfJ#6GTGrcG=>2eQF|Izi(XQTwgxe?5s+N-42d5|I*D5JjSFW}U+Y5#00kLV z^Adz=@V3Q_v8fawf%>Z$_@ZqW?_$yU(Es6#&dJZ7ZKjKnrn><t233DGmm!QbVXSOd zR1`Uq{>Pl-Ba3J-4L6_Nw=ac%n#`Iw^LjXJNV;4s$LO(cjjKbq$!QEhEQzpBFt^ht zI9Vi!9^MG$e!?K2$9`8*$y8Fw0YV1!ZZ%yW0B0?=Aw>?&`mtCvVL_Gu@iK*?Xuz4r z-biXHt_V}k!RWmr0Jq5N#fB_?B7hANpT-QDlgKMqk_@_1C=?@8=07uDhEpfsU6}At z+lr;QV<sypC_JSZd#&BY#ZhIb(}Fe~;-#B2_{5#birzGc-wvJ}Mpb|qP7g%{M@&qt z(b@X$BpM8Z=ks`>mNn}0F?3*ydb1B#RA5)jzkQ=;6Gi@`KD4(l1L|w@_xSyw6k95z zJEvl>DO+)y2abOyQtLE-P6h^md0a+hy@U|Kfzg$|{rtK1kRgw2vK}e*8n^zP4i|t5 z?i{p+(FLRYgQ@)+E+c;{`TM`ZK?A|r`Sg7JYN=kC|BvUrPzr-wm5;`Ee^?D+$%OCx zV1AzhnmN9$1tqHyU$dNO;D%(unj_9f6)7152Cyxs3%^+ExIq}b0K&x4zxktf6~Dds z8hku<=nDdo*P&*uZZM842&yr6L>NPBL325<2oa!fk`d{J7W!4Do~K;};NlJ_td@>m zNt&v~Z%>0Tx!~NPH@i~@>9kKvnx}k^bK`ZWf0$E<T{JhuceH90-<$ygx@lS~VsZZt zPFvC~#qxr~LL0AUEn3W+j}p{!c*5O~uctzPYXR<l2sJ<G;2&_?;@HS_ezAsrd-+x< zY5JXM$)c$KVOCz=Xz}rD=6a-W#)WOZUp9w|M{~q(`tc#EaZd^>tWWW8guUEy>mx+G zPt?YC752(Ye$x4J^}X0~aN|TxYlh=%-Wi;Y@J;e)(|j^(fU`z<#SXEFM*YK3gmK6S zEwOaKuHr8m^>Excm0Ym?h#~6vd3O!|;f1y8GUUXQ6)B#o8)>*)8{vm2WSH-RhuM>W zLeO$SXU`_@Zb2Pv^U*1;Jo`3;w6Bw*W_i<oM@v4USv~N58+&ySavx|ucJ$~+XtC?E z2Mk(p!Sm~S22l?5GK2Q9c?kXX=w3N2d-n%d$vH#<zi#4~v7l>I(IC5zZ=>Y@;jma| zMV#<SN(~u&ek+uV_;UR`AM%W12lN%XB!)|Pl{5iss>(q__7dZe|DHLr%3ve4j6Lo9 zZv9>y2M9ply%;Wt3r@y^%Ah$6d4~<wbT}o=iP?mUpqSCDq8n1=w;5z0DwhP|UxuSG zXOjWL?-bPn@KUSLBIKe8Yi%3}4)mg*1-z`HQ#EAi%^Et5(BX3o3>-FYBpEpKkqGrm z5FAk29DiAXCh-O~-2#{rt%>F^K*X~hcpb1MhF^`H$^eTGyqVZxah0Wa+hHR}<L?SE zx+h!sC}vO;Ae?|Os>(S!2kr2`U_$3B_9zSzAD|!MNRoGYyLBD4J|Hby1tfY2+(0!z z_WcZ&6@q(S8h&ZyO13|vk^9DNwS9i+Bzg|tDOY&1w1O8iH;8PN&!K>G1iNEztY1bH zj4;LYJ@3Gf6<W*e9ZPwj%~~m_5c^j8Z+}K&peB-27B)J!VS264CYJ$fLOJV@l;XGI zwF{p=pq26zepgs+KYkSd5uN}ClMxTz;}x`tnn}zjaxR3{h37`SjY9I`+|#<m1gr+E z&(@79nb)m#^Pm+!Plfa_pFtLop}Ra*mzRt~{nD}}on5FQ6t1x1jn~ss8DG0M^pb<_ z%DA@v;?zt^O#RVo%JmD2uDmCy4RZUGelC^iDbKboqRa+N47+GQwBN7q{p<f+2Om9l z%#oF>!As~OTkV`w_aA`&^+yuDn7YKf?5QoxbNKDv_5X({B-e|}hIS3yg?sAA4!xIV z*N>8-+A1xUT%GrXO^iL>Zek7W6SeeX;eqsV#zC&SOIgyu(H**!H7i$I{F@`am4<`m z%!4mpEVAk=BbAd#)JG7)JrDkoxbVb@6M@aqugNhUJ}7t+3a`Z7iLbv*5PfsT<!qk= zt%b9_+vV_Zwho&<PH>M^J?b>}cMIdPgFt?LanpJ^L%J3F>C>uUWta6bi86t$Bzw8a z75j{2B#(B7BR_2a%+4^xEu>|x!E9RPcrn!cxPtqUtSr5c^TVwvnrW9U9639ujqk(P z>+Jgpe{;UxWimaY-zMr}8)@*-ufyP0?*Fw}XLg?jl80HM()_X(T_mZph11xqbWkod zh@b637)f|V2)1K)m6gMiEwg6-E}I4~5ri;DN}JzbdWdsv+5;m>{LqUxDtdZsq+S{{ zVn!!v(WuH5`gQC()Tx`4b!bb+?%k`HCkN5{bMAJabA(dji#9koYF}sdD%7mk*1jy; zuW=LOhWTR*a}@a<f`3tXu0bKxsZVdv#0ozwxpw1fd38>}a>#g0=iV-=#M9lor(0U~ z6E9*5xJelwn(6dcJr5z0J28z9@F6=}tV0683{sRWJ8bZ=Xt8s<pO0teLVQ?-r1u_n zG~_W!v!9|*7hPwBHp$bP<V*%)vGE{gi-xAAVg4ei_jW3SZR|Q78Xe1ihMHZ7#&c7g z8$F?u%tU}s+~wg-ZfwjuByL3&^}fakL1uG?%9JbXV@RwjJz7eZZoFZsY_sI8QU2et zNNh$N?mvPc8nm__RxTXUR>GDVBaO2TgLQ89ed}gaHL*Q}4BHaAW7<A%WCuJvbhFyt zDzlnkx3F95U7!CJemi_QYCuGs7rxbZ-oABVnn!<kQ|T-+@=pv#xzXjjPn_~<+ahrb zAp{Nq8Q6RjQWQ~0+0+!<OEpS>W&qaT`}Wb;a;f?njJIA|dl@M}eD2Q0p=Fz&{d!S4 zL*~L{z`)8W^9#FhZ@Kx>d|GW@-D%2*=Q&599Zgdqo1lZtvvCb3$|4j)Z*a|Ka=yCM z`)waIyV+T4cISRq2T`tMYRCckK0tqHXz?r5Pj_|}dxzcSiF$Y}7>?=L+n>?8u&0YI zTl~p6(J?KXJ|D~_4yt5zII`(wOFOT~5IQm=9|%WuzAXQw*X+aIJ8Xr}!{C}qkk_C7 zhg81;WCv{PqAN7L=6`h2X{*(>HCYL<K(v04VBf(+UWa6*x(7TOF^9X|0F<~Oe?4l+ z4ybAOS6u&3<KKIW3Yk(&t;v%(#okv!o>Vs-0Smb+=o@obUTvBurjkEF$!_w^cP>N! 
z*6>@3U66qbt_1$A$}!Or54><)`F{`m(gvb4Poc14EBYK}Yw&9c>f1#60!Tm5b(pSf zdsy;yuSfD8$|7i}Lh&N6Un+}RMKJhGh4;IM9%%@bA&n+_x1<K#pp3>atDrEA^VB46 za@OPD;>^Sm7mR|5_+7*v0?*jAev-;3(ow^}3DvXc&E>Cmt$B<vySgOa=T+Zn2Uxhd zU}P_e|HI7I+xs4~22Fx!;L3_KpO;y-`8P@DaXfMQO?Zh-RbOrRv>tEUC`}m0os)e6 z0t0)^;L+~ks6rK)$Znu`B0H%lwsozsrSM|Hq=T-blarVg5U=2}LQau1I5{S_NG*6V z@lAXss-K0(6Z;3;5Qx*#Vyyj~!xX73nCpeUNqGub9Cp(b4FQj^x0}2Wb}>SH+d>6K zQ%Es9RnIj11qcwjsI~AOG(M;jE6U5~izhZ%mNzf=<#D4o@59yrH#X16zbD-a2?s|$ z@f?y&fn|cnuX#3D-%cjw32(U}VY2SDl5(nF|Ng3q?OkhBIGEfXItHC}S>ouC1}SXd z!BPIB<Y?s8-#T4;e{?d>`F_gElVk-U3U-;7zemc^;N}vl5i51n{o{{6*21{(uWOX( zMhj;&v2TFAQ)>Zq9QxjGy8&!SSnl#Pir~jJ<_5Tmdv38I@wJRf@%Bddyja+^ilkY@ zr)K}5@k>W3N1Gb@u)ZDg5U8@#4x=Ok5|8bxYe~q*W!rb?5N@BV&x7@dXJp)9jC;wa zPd9AR2k`4lGrI8Htm0Pw7e(!ZD4f5sMn?7vR?(2yb%>k9hm2<&<4r?ZNnyxmhk9Jd zqt9pe?BTEs4r3~R*z;fuojP{E2z6)##DrOhvAWd<8I+HNfw{@mc}2gId+$gliEIIk z?($-rCAMDNdhWGOmNv)@7$m1m;|YE>`@;wE?lqs1eo+F8<*!0WgX-n1I1)EeLG3_@ zwl&UN_6z>vB5d%erOOT+M0<1ZoJK#92_0HYoa)ZHsE&SARx%a8BtJmPv7-7G6VpN8 zgp(0KU9CquS8YGaGLEI2JPJU{(^pX&&+%3gQ}9kEp)s=TF1FnWMYM=Vu5bQCK0iD8 zLhQ-km3_`~uF4y9UU0E)@+j?W{tdXzlk#fYN~aelgWY@f90$>kBeEM4UrK@QPULSd zeHLcu9-P+KZO{JwYhGV3BFKopkaZ1Y38GE7f4`-2lO|1gG|Q~+=R<aAy7Pk%CEbZV z=>RJ#zM{;igwUgy&{$Ev2T~Oa7%Y=I1nq3%agI?C8Y3v|rpTT)3`UhQ0dEEIUrfjl zlsbov)AVQ0h6)giV$lM69g@?@qN<&KusbPPBR?U6Ynye==BUU<8-Pj2xfhAHCZ{4p z0#m;JB>r6&7jbRLJly2Nbv?23qMpaaSfNcT9rIFXZ<Y}Nt*48A&Q&S|qM)jxf@`Fp z?pT5+4!Fz6)lWZR&#f}=AGh#%GKo#$Rw)=C(AAsig0m_F@_a_idQb^ODKOH%_K0Gm z+pi%a2Y3PV+X1y=jGL#ZG{#}Z%kxssYMOGi+e+~l(YGV<hGWy9Z};b7X3N$3^|K@x zq1x2l!@)v^%{=>_O{=9?Es9`@N37FloGWGJF{@&@$s&eGjw?B8;fufJ{x=^rP6#it zY2+1l)P=I<ver`F;Kw|F%n&K+Av`R-3Shr#ZGK)}B>x#SuGXWI>-X=#H5Q=LsF_oB zmBNYZegRd1cmkV5dD62d&u3YFDx+xJ@cg55Cd?bj07Y4A!n#Qxvql-}R2p3sl-ED1 zs#+;yPEX$u{(mT<G0<+%<fa~D8MG0jxxh2S{7}@hq(thkDf#Li+9*Nupu<GJsE&dB zfXK)#^kLaC;nk+Kj&1v1y|P4kK4OjjxzYt{3qWSr;!OubNG5w?6>n`z{xkMu#IGih zM+>TGY@%G4q2)RqxvroxH-OkGTr5G7BS%W!s>;d;k=h_Rk(HA>s?BtP1z-aPdfw2X z(o0JUI`J&`?D43>IsB&q<|;<cy*d>>mxg*ZxOLWX)jm0lp(D4BDLZ^?meD!&I@m^X z9e03FWR{fF=w6zLx(rH&F>Q}y$5iZu+6ikloVO-SfI+=oDlPg8<5<eb0*7c$JDuL7 z*J3RVB2unk*(8~qWgH@Y!f_no$pN_snPHy<TZ-xVi_52(6Zk7z=JFrVc_b;Uq-gdD zL>e1M|5-HIMy>Cy#K^CuHgC-?aK?Y(x9kQf&H%2DJL2>Fw%3j5=p8@j@Xs&0^3QQr zbB3%zljhZ`gU;iB3o9D@?VbQ_sP~h>LzI<XM~*BXcb0sPPDnTu;FDyyp-G}1OA%PS zX~O;(7`g+I%o!}?Hxj;!7mU7uUp9)bGkug2UqhUDsu3jwFG#>A?+?4_Jx6vPLS@A2 z<gJT{vGnh^lyoLc$!e=(t=tGdcY%qG;D<A)gj&{b#d0YPUwQ1H))7nuH|a1~QV4i1 z_IL2DX%HRr^b90p4L*b&3(qsYs$fur>7)Mag0RGZ(2e-R!{nDXexwH|iaqgPaiUXK z23yA4Ayf%v7>Z7Sn~ol+6w+c1nprqI0-ps&kz@)_D#Z{|8{Sc5wE9?@|K?#LgYARr z=IatTlmQEDxM))c@)x9>0m$jcKp;Qe3^K#@cFPt9uUUDkw&-#Ic9B$2C$J{(*%KqS zAQ|dOT0=BU;0P{9MzSe3fJH5QcwQQJou2x*(Ii`6hTND6<Wan)F2XA$=fy0w-^~o2 zx=tNCZbVZ=5x|+#+j2o46gNgidYQ<ZX2=EC3q}*blBjMii^fRS{rdG2+G14URW7*& zxe;s~ds-jPKnEw9WYzfrAE?M>J3130dnFJh=n_qJB9}p^af{~7t+Si#;1^vcriUSr z!Z**xMFlSvL>_!GtBJ79_=1rQsE-0BMT<DAt;tnb5MqrqzkJ!;Ujv~c>Ex(#++oa1 zD29n_PKD3Z^W*tjM<2K5TjVoY*bx2<Zlwt$0PgpIHTfu~7@AS(-CO6(6VE{lX9vF^ zb2|ex&eLzXUaT8x%&|qScIRbu7r@Hz>YbmaW9!8Pa<5*thO3~QG7lJkr7QiAMy+bM zKQ!3@TL83gSet@V{qe5^as+C$svx5orJvg84<N~Qk}los)I+|+^K%eEYhMiY()Us< zZ+ff8yvDy+6Sl&DK5h~y0d+a1uMukCLt|;bi<pX~|KON~y_=Q=GJ%O@JBB};GD_|T zfsx#HME)#<pn$>oW#1HFLbIw=xHgX9%F1iF3?h|X0=Niwww4md{f6Y?ai;w9ys&Sd zK7k!xix!mob>IWMXvw&9$B(~;7o+m=b>GmWa=xSE4VJ?+cry>Hj8Eg7;E5Y*EW9D8 zD)guFBi&j0An3xYn;(;Cc|CupERj_HYd~NuYUs=YBd(HI-#dHu<2yDCn9xxdB7?yv zI^H70p+dpX3&dIu<8suT{RVXK4^Ct7Xr1l8Hd3NLy83-xG*~WFKG-*u2*GU@LKyFd z&83pPzabI7*eyeHPBvxI!&jqu@I(UWH1CM?CLRb?Pq5W6B*PX#4-JF?aom(OFT>fq z$G-{UBlT(DL<;bvWqi>zqM$YpU{(p6r?%k2*Sh&KI^O>=P@)o{uA-0!Pp4jJiTw}q 
zpSN&)7yaa2orq~q@*PY=zmf6=ocq9c#tI#uEe1eE|Jlfm;~dC|p#2+{+8Qk;PZN4h ztIHMXvdV7=E8bG)Dd*Dhz#YukxuH6(5)zAvW)Ig$d(I?0EY8Hh+JOTGbl<|R%jcnH zxD`_m7=@ZH2u9eED8qtLOnt<Q{Z@)Ng56WG{8$rEP3*+~ISq!qiOZKSfA`O0q8*PK zyDXPpQc2$-LyDnu$DFi9UPBvYWImc^T%9>`^kj%?;>hVdBZ`b?&tAhhp||Ih=LEA} zCMw0jCYo|9nz~-p{mnD+J6V|Ef%FW8!uJzG=p$Uvz3(;cJ@c5xRBS9(QJCQ#&t4kR z_4ye*so1}{MXZ{d6OutiSPq$0qUD9!U!XNS?pfflAX`S~*+0KY1%AZkQJg|y6ESw8 zGl3d}`uL8cWEn~x3U18;+%w#`ge6GhP+qre)r#>oL4SGfgZMGnN;e5)=P)Ldw@J_D zU<`!<&`ebD{OpxC^+Q3FkiNKqJ=$d<(>tYclv;EG0yKttv}E&;Fz9F`mGgs*OnP;4 z9sHe)L~cYojv5JN*0KK!8%2eMDfLe!X+K$3IJ9M~Cgs{3iJ+G<V%;nJmwBfO8s>{- z=K<Hm;wfuX`mcd&Pct*eoBh??_2T5l&v!5I9Rbek2^WxuP)fb)9H(;$)nJFtozJG{ z(<a;cA)(DtA@qWlHazo$u0=9>&cmD-_7xzxax)HrIW7dB7;NPiC@d)nGY-KHZyC+x zHzS}YX<hjGzqx&T8EgX%dL})GOP1{#pC|lZ{l+k*!v@3Hj592&<z2NsYT81IIuz?? zv1rkv)vGUT@Hum41;0>WvkhnrgSiy0&iyqt%P=o84c$y8B{mPwfAmnl$OVnPSiE4} zg+RB2h(_WJAG}IRwEmyFB6_?;1&Hw&5U<(z%lt3jh7rWaXKzV4vAhmc4Q36_mqOc+ zu!Jg~W`dzO(CjOlF7AxeqDB#qi<7W~dK9%Ce!|3EdNMmc7;*Crd^JHAGa@6=7jaP6 z+uJAm-H_=${eRBFAHl_&>5Zdv@zTCv<U8ifLVjIv6N8{uD_tdpj`RS!cZZ@Np`Ezy z5X@c%tVn$SCmfaS`oFl(*uQ8HePu}V1`mSI{Pt};k<f0$M$k_ebyLgmCz+YsNquOQ zT*3lmbXqokKFMRTA^oX<7-aXQqT(dWmB1`;{I+#hKxs^=E`lAqI(phqA7}Ey(|iXW z+KAB~N<M#ngX$Tff~%{P{6$9c?b4&iHcUW%rX#GkIn6Ak3JE2-zDP*;n7R3MdOo)D zUpliV5j;TlEmqvwT$<WA^Qq*MMa7sAz80}TK)Z=GrIUGs?xL85=her>d|JEjU%%3g zbA-Fz*vDA%&Y{6gO3qjv!MmNQudi7g4!Sanyfp1h%hs)HD1*GX#}?UO7GX|O7m`CV z3^NQ$t}elo`zFbr2A8fZW-@fF<{TTEeR7SHjLwGtc~abgdxRHAfB|;&U?20(vy7Yd z(6khbK)s7w_n+x48idTuerw+9R(S5MzuYMV`i>Y;0nK6>Ix^FWo0$ntxnvx2bw#%d z0HHIV7Jy<#yMrrrb+f&cxUBKd99?v8Z%2yHiR@yH%{}w?1SCa^0gQ-;lHP&x7hGI4 z&vf9!MU;=URSR-+y^6A%B=#pOBG@BX5mSj*)@f#NylgNAj<~qgH)+;NrGv7hqe>=2 z56-oMi4bE@`RcUS$ctyW*K{ATiJUcTFl+;)BdA>sQgdWc6Un}HV?^{=%(wxwIQJ@c zY)O9QQeXYnq0rU}{g5!Z_~H$kLxv+JJs&45lF6}#cTJ1lMUDRdb&rhIbS0g(j%0kQ z)lFkE`QN4SI}lJ=S#h}4ZMHk0QtT-^06}P0R+d4UoD?P7_~Xn*q4tylo-S*SIz31} z#pkr3{nR6s57?Z}X@qh!tV)0KMEKSs#(0>pUM8m#N(P>4+Uf|}jy@BoS@Jj0F@zdd zS5+CdZ6-M-0GhDorT-!)H*?mk5ykcn4k$Z)agtm$93k#pK6YZ!Z$wS5k;0!-8F2$* zVLd9}ECyZ+DuaHA_Or}nF}}?LnudLTbSa1yCucork{I%JYb^(BB%K6MAIF(n4HdxA zm^5h;{{X^Y*iTowRXsybuce$^dcFgH6ItCZy|AyN!=9|d;0zJ{Mbe{(Pvekd>4Q+B zo;#4RDH3qgL(AHL>K1g8wfX{CRH&Qih5+vJInpElkn62<`1us(tpfF4x?<PL>0k+k zj*bp7MBE||J=(ORsLx3VS5{1?ud1i9=0sDSk*BLhb{Pwi#;Nl>B>Zl4%!iV0ffy=^ z4U0x{REc`9_XQO3IcBwRci%8hWq;_cu^MWlsZlvAJ1R03nsgc9T2l;Z#Ww>oiSg}& z2S*gMyBv{}H>y}1qbE!dFmuMjUxE2;{YM-Se(nfx;AUXzF|ru$4Z;QYIXX0h(2p9n zZ=#bmtzzAf>pxqTstq5@HTE4%49d6pV_Jz<m)IAv+IY?paUH}x!;A^N@wh1f4#$of z6`T67`OZH(rAU@XnZ~8DGehpZkdX#H$JAc!E<2qMhlYj<%*ZW-fe0gzBdJA!anzjl zJnB~LDn%^_KO}%sWs8i+sIEWHaCobq|L@VCIEA<}eC<n3K`y8o1}5S&F_v0^1z1*K zCxc(;>e{LIOeq*_rF?vfRwlw!v1`{)1qFWFm(kGvK!Hl$!~Op8%{WQtl6;pgTT0(e zB07^a=!G2rn%rduBVmN*Y2^9dO8@WzkA~P$2xhS<3|^k6;ZgmNx5at+>nEIcw4)&> zR%{+Dv;}pH<H69F2b8Gx=pn_5!BCVaOF?Cr=BlbD-AF-IM*#phq#)-SJuX{9QM`tz zM_E&c0C}btKD*^jU2|hN@n9@K3QrnQ%)ywPcG<`Sc;{-Jl%<@NMsGhZx33U2JYh40 z0-i5>gwa)kV*YjDOS0h*(Wx&&YrePxv1S;`ZFY(7>QUDuQ456zCNucdvlamjm0lR4 z+XfDX#2*<RzO_CR+Q?gF-h0lT?jAUN&<4)0@%7gvk3r}grUJYpUZGp^Gulj_SlcFT z3OKBJ^Ui~e4FpcmmF?W)S0|3omUSO>uUOIwLLAty-wZPv`9m>#BgL#N%zPo!J1;6G z22zr*puj+EmhqM#NsGOyh9wQs=eRMn`?x<ps;k!w_alO%1&b{&21)k!%4(&|BvUVW z`Jt3HctN~;{d(o?qcT#;Q4f#xSbl{X#}GLn#gBjWW~zgK2CeYF@QmWz4Q<?8FWt<y zn!p95jg6e#hH?EAAU;7lzqt6)+O?8LD1ygYv=22k^i{g|>UD_?Y-2JJSh4#={yo1S zDj-3_KsjJWWg1!xU?W*)ZPwtr{+P4#_E7FJ#}0b%5`fm^o28^<^J|>?=OO{Vfx%Xm z+u`1Xdnz{;lQ?mWG#i%*9wvlpAfzvBEkP|v9jXDPSGTd7*Md?6^=5yJ1PpJ&*Q<gn zKd)%dQR^)jODmZ>{yK2&S`V>K1JK1@Xf&BCzi7$0p78os(9GAlLYFx|Fog(nrPCT` zV!LmkB4#ed_D&l3uioe6h~@yVl4@%4%NLT432Qe5flU#X9=EZI_lLAmU<QM%(Y0b{ 
z29~|iOUSuky@~v5jxpX};kikY&bR*y=x|oR)M!1s!+oZr`X=s@dC?yz?BxR)H%>4Q zAwI-YKpmSCZ3b(?rD4hDzGyRP=*M^O{>9k~rjR5jriECb5t4CgoD(DXvdYv;SLp;} zDgFOC!VfKvICQx=0;2vb2NyBmb>8U93InP4ep<GQqt@$SI}q^-CRdc=8V9GHF%A*- ziB?uY<Q&&6jq44M2{v)B`9@P=l*|~^L>V@k8Ugqw;>4mG)J`OHVbwu65tA+?lXw>w zB+&AGe(sxbh-MZvqjB}Rm5j6fLk^r9iCfGvfIEU0;@c=zQlaqS#6y^{()<HE64vCO z3Hupw1g5yPt#Ktd)a!aNrmfdGkAjn@dW8^9T!NmVngs<&78imB5rKkq`6;-QToJwK z{)|Mk^AH&q9W(9{^$zgK*Vj1gIRk9N_)q*!fPuu117##Tx(ZvFYQV%lJs&mLiZnB3 zFd!*|GYW3q(t+_3e2}Gbqx#6TUIP;rpbtm$h+UhtYt}H&6^T73kfkg+p}7?02Q%3K zIe4{W$<GZ5_yMsiD615&$A+&+zjE+1;F@q16d&(0(yg^**BL4MyIV31RU{f-%d|mD zOaq>*qOo8JQ7K;DXPrjHV(%#lg_$f+y~OJhKiFw6DhD)+P-sKBZ4}HlCli6_0|pL^ zzq=OtVO?f)1s9%;_LHo7xb`eZIVVhH9ipi<F>3iqM6eHBEgHf8Jc049H+~<Likl*( z00nj;vO$S2uGD9G`=oJNns#IN-nI}kO>%NVI7&OgLvceB*Bp*@nr0*r3#D?x5?c9i z7>tUrQW)g5Q2dO?jj5T+F8mA&<yOr80*1G!#Cg?{d}t_`U$&S;M>!nU|Npnp4HJ** z#D8*mdLVrZY-Mhx!MS`tkxlzaw5f;z5K{0vSNo|+mByIIks(QIR37NnY|;vIa}jJm zT^&KBgu(&Rx&GDYYU|V;nU`6_*=A1XyokO2aV{01mNa)n$x8{2{_Luyu2iZ2|5KET zgYVk^e;$In(H1uNKIIDA$Td6<_yPyAY5jw&xE49i_4P_FOO~8GdT8)&g^vm=l+O8H zUADq=^zc?wRr1z$J>D)nq3an>--jNLHq1^MdeP(VzaHs5z3qo|Qq|vGSXKL_ZpYjB z!q?dy-YaI>xs2Zyx3uJWg=75dOZFA9<_m2?zX5xo6;=~~U?_tk2DoZ3!v2XEifR|y z5t+)OcjtfZM8Xn8$k1$Tf@kn1O(uW;Jzz{)b^e;!M5kMq_U-3I%UQ2nNx7!X#Q{;k zJq5*#a@D@e*%V|EUMYkCEGWB-G9``zE~2BO=|w&H*JHpS=~n&icE2=_B<t8D>`Tc# zh}cfBL;7IXZr$2+)6BuF+J5t9<Kf+<cl8+lICo32r9DcgilSSb`jkaVw+dC1m9r^2 z1owo&3|nl^Y?~2k90`Mh!#mxw;DuqQ3|Zk}rId*n;@6j>7Fc7Kn;R+U>RWZz+H=+c zWiERAq3*EqQSROQoe*CS9qOp}cawJ!^7s~^$wjl*sVl1I{H(Z`p9_g#q7;&vOOA2> z{>LA?YU4nnziQPg7EKoKh!hXZH)3PnY*g}P&r(xTs#&JQvf2nnDm!*ODyZdi=b*-V z4;oaMJ6^?A^A48CD1zutqV9lDjSh_@q<rR|$S4?5^<A-IMYG9GB=fNjzlV#_o;}(? z`<+sPA<oaR;%N*&ClMC7T?IKgfon%fIvW7C`^H)#1!G{0r6<dm8zRV}2?XBj(WA%w z`SaPm1zFCA5;yhy+}m(sd<s4e##wx#d-tYH_(C@cLaiEAM+U+Hcw(kKau%2?^11m_ zjGP7;;xmn)U?p}C^i1cEQIOtkn%ejb{lU{l1!)@VcqR7mX*|O4X`wAm!rNrULbk6k zWH0zv`_}%$2taDwT1W>LKtQmGRZv@($;4h_PYC`!ecW!K{GL1aUm<uy*Csku^tbe< zH<e;{eEL0=3ISbMgtS4>n($oMUk&FQzAq{o59T5jsvd8Hkcjb|pcv6WM&k(c?d9(u zHr964szm@RZv88oXBQW7bT74h_XynTLlz>(Ajbm;<>lt;eIRM^1+Q8B<jeyOBX!e- zwX^xr?r+gD{`xA7);BPqRhdmQHz!p-^77@b{Hh%25ToFKctx&#*jxOAG^jYic~LLg zlfuUdav>U8`^E+#R&Vw}X09<P67L*B9XA&am|ZQ{r*Vh*;*e-V`0}jOPPUgyw+8(7 z3*{4piG9ULmPN<edy7WM+|z&k{dX$ta{jw_0S!MLMH)6PNS0?i1@a(SctsQ81CMiW zaIg&|3CzT3KrwBDi!{%utEphS==S`%ij;mBDVb)R{=jCE=ceZR|ABGol-MZA5GEuh zbIXCx%x2C^0Rf`Xrm>&Cw3(Fa0a?T@O)9uITMN+~17E<8{O^}DGCGG5LzTU)uhVAv zHx)6Rl{5<k9eg<WY*>6$D3Z#D|9RutWC56j=<Sho&OEEcrYmqsh=Y#g+JwOeTAP|c zA~x~}C+8K8j*b957<UpwT0Wm1hynM(8zpp!cDy`82`eb;7!+8BF(ZFQvg*<(pr{mb zHpwj+aB^JMoV+|aAtU5fs8(dYdhtR<QC<o-z?gL`YbB64>&jx!h=mg2m<yG@Medz1 zCZ_1hxlp{)EMSW46_@%tO4eRjQs%FcTs8L<{~lCDfw%;K`w6bl_H7Zq#Urqj<>lh{ zz?jz{@w1KrO0a3-3Aa*4blr=&5=e5C=xQhBZWUf2m_ZX$poFo#QC_U{ff^7(7|+Me zm<(?8D0NH}I(Ga}P*BA>jHy$mtcoyIo&u4<^JE}uqD@*Lv7+I(3q-a~uKoD&V=HCk zO_V`<NxFluT0R~C*n@s4h7FSMGjQO8loZj^*5BPk@*LeLh6SvpmYG9F#KWcwHg6S~ z!p)nG=dfWL7)jzg>$mtrQW@O4w*_|RkVLY>c!%^`paxXL&p88T;^K8@wpgC75`q{f zMt~cBj{2^~=h034XE-TdS~UuwBP{?70}YP7%gb8{4{kxQe2I*zPvmV%4lCYK@<*A{ zX_~`_(@+n_Nsk-rE)UotN*)`v;h!`{RIHjcIt_D=O*``vOA;NOTSOi8c5K$HMsSOA zYS{3cIi`|p{S_v+m>?348m(Hk6y8#4dr5<V?aaHfc0ztR!&zvESWsq2_zW-@I+y<V z@2})AX28LLG;GlzMpX!#3<|iLtV9b7(+DNYnXu4|?k=Nf^2@=r*Rq;8HbY7L9l4aO zGGGW{KDv)Uhdz>yuTG=LtiXn#z7?6c5@J%O*L^M5l|?7i<CKQnjBt@glc|#NPT-@t z$qSoq;zuywDEB^Fo&%Rel1?~PE(`?(b(E1jr#JfU=3+$#GbGP1EJ%6%x|*6U&4P|S zQA?O76%<@B4ECHQE2*sEd*>WNeG;v{=(0}!6e-z90CCQ@?jAzV*OIsFtXj&ym)xH; z%DM~t5e``%N35i<fW`A3E?oM7BaHYXt3cu<&9&xR_NIoErCWB8i`up8zeUnJFB%RM zQ(HD~?yQUd#85oH8ovFPYL(v)T{4e`_J<Jxw;KR@R8{q4mswc+OT_0^3ws_)DsenL 
zm(XjhL|f)AzdC}x&Ot;1`WS=vVj4#bCqRG5vX$)srGCQ`n$q|Q2_qm_NEjV5XCaY6 zR~83D^rj$>BD_(kD;s{sCPFj*g}QrEF$BX;e*T!Yu4N$*m;Ahrx`LdixYK_iBdP3d z6f|NL;0aJrZnU=-g9&ik?($4Yh-p)%lsF(*O9l`jZ>{N#U&xp-rVZ^a`U~#qlFE1C zvq<k(`JTYopI@GF=qb4_ZyYF8;U6zzheWXg1J2#+<3<CM^IStc?lWYG=~e4lK~2Fe z?V!EU#2Ch51R*CUXH_n_MnzRZ)CURcQJVNKaS{z`jHnDred(g;ndL{#J%D90vdgTj zs1=v;k<@wVe17p@VrOGv8Oca_OJ&pRYx5a(=TqVl4N@721VXlbj~-%|W6%d8=%hvu zUVjoge|nAtp6_%?XKr(){bB>v!M{6>qZi{So^53%$J)WbWHnZX^k9H^ZCA+ysTVP8 zW<<?!!lqqoFC5i#jmSF?vvs3MB78hUMRvg9h^oZ?LThVluC-cri)%<?p{3r$yS&ZH z5@Ot4yPEmeb9v|m49<Y6+sw?*P{F6s&h{w)f<Qn;^!r;*Qd!7M*Xp%v2gkj}H3tp! zj#b0`sC=0EKSeLSmxH~16(;G^4s=~+YkP@Mc56)$WLlciQ!*St;sQ_PM?Di(T!8LE zOhd&gEE-TT5rU+!?!>Wx17JRgu6+L$%a$F+{x^eu8)GW&dF)4sj)`&M$qN~W{bA7S zY(gY8F)-!GG{}bL?RwQzJYQ;NYMPv$mY$wFj_Q!i-s~A%>1+_O5lgWgIGt5@n@Y*2 z3Eq^jnJkbY*|8BK6tpfr245Nr(ItS}@S50&#fuXgC3fK|?#|*Mc&NW#N|K!`>3EVF zPIjSdJ4h;P*<40#qq|24h+Pa?NS;b07uP!gUeUVAQP$GBdS@r|Mjre4^_WCI8x$lU z4N(rJf$`Gqe<q3E8#M+e+)e%`3s?o(6<d$#wDB?lW-k~yDxFMXMF?_{OJBB!V2`S} zd`WL9AeM!g#~Rcy<y^61h9Tlz>IKLYzW2PWWPHd$l2;(0X+|aQtglRUVK?JY1`e@l z6L0JIB;*j8Y!Be|u@=jMv#K+R6NU6U>G5N@;S|^-dQc57OF_J>%K$UTuNb$Z@PaU_ zHCD#P&wxLGk4hpvXNW1h40z@VehljK#zc4H*fGX2)9&7#EPL3;2kX({pm;QTthE%o z_HIpn%2r3v7VW3#%t+{%*zMKV9X>^cDM)j(PSBq~M*$Tp$R8*_9vuefUUX~ta-7E$ z8c2|_PKt~&5#piK-xJ?0hDx`3d?a~7-gVYxU@{08LX>j-3xX5Jqn#v6v9t=<g}{B@ znD#_wO_BNDRkBU^5ni822Xyz~!^;~k>{Sa~q4A2A7CWDyXJ*Q7r%z{tzc67{aTE3$ z+@-}9kKm}OCxNYcXPzYPM2FQ;mlRTOW!{FODe%cEO!BZU0)fBBCypmilKS<wO2_e7 zDcOyfghQFB7aGYWgIgI#U)I2&HJR2mtwMy0sql_ov+xmT=5Du<Le}?IQfa~&ptt!p zdL@RZW7IFnFNakX(R?SFr_(4ahOP~jBQgr8WD2PwNUCJyeg`o*#?s;Cy?cR#>Y$T~ z(%eIghXOqBrl(&8-B70RJd=}>=79_N3YhZ#!-v}1S{g{B@gvKh<prui(=dU4Ev+Ch z+jtRjvvh<mcOSo<Dz@DF`c9mbG{;18E!#<Ua4Zs2SMFc4#~wLxqGxcoNT@#o(2(`| zAbuoh?((q5s<H1}WtuYk{yfa*W(`yj#tob#TUvt!*)me@TpIUqEocG?;E-~2BS3s? 
z2@IOtQ2XrL%z5D~WdzS(Nh1ppm@c|cITx3DUbyzSak2c!hVXp_wyv)mgGop0J2#iC zW)brUDQ@zd2>q2ZQtWCj^sO~>wkDa$E@KR-9VO#}S932X`oweexySB8x?PqZK4(JZ zy{Atd2up?<p4TRdr9ew(J4-s1oA@Wv{*9<8reZ6rs~_CE2crG;G3!J>4e24jH&^)L zlT~i=6c0UrCNLIt_f%B$5?SCV<bi(mW4iTI@&`)+bBLsFVg^Y}V2qIyht`Ng>$!#F zpEe-EJ;GL006&5U39XZw!_w`0{{Gu%9~5k7Cu6d~Z2t+NPgYTSiCre!&T}OY;l3r} zLn_nJ!5ncjd4XNBwEsQd{lx{d1{+a4^Cb6!TY*~>8Ix=>@whk;QWy(H1opJB2&lPp zM*bfNe?A|>Ab-(_6;U(y0TFFbr_Wd01{~P9VZ-S4?_A~IQ#YnfxFNaP|HX-z`vRLB zI)z3#r^yf4j(wUqUQ25u??#j@BXCo8szH%v2#H{lvlkGoGIaiPj)%BC1Zi4W)|Qs+ zDKJ~KXk3w@hzP&5v)Xp2=x$)2g{l&O^DAKFPGxf@l?jZ1C*s_86eC-ZArbQcC#|fT zN}k#bo%*ywrf79ka@|pu?pbq>bURRh5BHZZUWoPeiId&6Z^Dq5O^f1qk<jXClTo|_ zDuq`KRo9$TmGFA+J0Cvg=cj)kFwsg0^YNALtvd0!{n$n9_UUcBOH$bRH+Z`Hr?r=K zmX&Q*&?qK(LJGHP-I}3}|AO7jH5<g!){)`rn}u8Ity*=y_*^a(*HdP{@Js;a-CoNL zQyo8ki5e`CC*h$gY0i9l^>atIl-_j`;X&aU;ao_r*N0C#qk4JXCBoe9-4BpM(TfNX z{DX?WVU8g|*^uLI^y7R~R8&YvA!mbfR9#&i*R*+e+;UAoDPFk3*-;X8#Cu0k5j&sr z<ZnFOgg}j{d+}LayFhG2kjJnl)7vpIo=Gp%TRao^J}-7ttE7^gg5Ga(8oxf#Lz#+V z$k48xI%Uviu1sN|>5kI3jMI}_IF%~~TpKlZY~c>SKc|FbsCD>wP|WoO!}tC7i<xVY zX61E3CY<1ULL^|7)Cxy9Way(icXZVVd;1sXG4)|M@6DRdh>RBr2hlr^9?i;9VTx!f z1vJJGu?*)S2U08DC3YiAx5`JupOXGKA?K>6)gn$t^%KOAOXig5WG!X2nuu>-kNTh8 z_qwZ`RC@Lw5rhZwGQK|wqhdzU`}e_vP>AWEY|7#xN*5MKg&Q$+7?v2sRG!?!!2$I3 zN%W7Bk_aWLUZr-i5EUwlGLph(Pzw;ntJsgNgm8`6NUQ|)IBLs^59hfL`(E%llgG(@ zxf2w7zQb*f(3AV?HdkOfp-l8$boZb$Z=z(SiU`VETM1DA0ec&BlX0=;40wD2(RS<6 z1H*k<UWKELA;mrRQFM@fysYWDmd11DU7s(4J-crARtUx@N=PVto5>Rxlo}tbQggV^ z5h8`GvKrqH+}dYg*B(8@We8HNoHcSw@#3JqlItC@^)9X8<_HK*cx|LH?6q3O7_{1& z2JOITpX0}4pl?fVRu_}$%Pgq)(NN@;l-R9UaR!*eLS%AdA+zZI{u*K!+rre;4}_R@ z;zv4#*-tINY3@h+h7W9pMOVTh#TZ&&oD)*082U`R(IAulKrfoJ8kDhG&RJy+-px<F zUgKHROt+~tcN{e?-7ZvaEn2h)x+<RYtsZ@dMG($3z{TFQaf_(<semi1s{BBz$J+2` zqN#jir`zw?adUA~$u)deLzbgOu@1_LD+ZJjqvyHP%!sN$NM&{PB2X|Qz^CJ_00@R( zoc)aqO@)Tw<k*23yg9X`yuE;^3c7l_vh(Ms1IEX1+VK<T8$p8kPa7hsTm@ji;YqS~ z9ENw$$p$o()e~i-P*L$Jc|_x0P`^OWvWRLf3watot^0ytL;70|lp7>(ey!odh6(T^ zsR%4y`U+Kf`S<VsJT)Nn4qDn|OT7<NdSF)*ID7yVrz|(?8T>1}k+K^a2q?<sQSNZS z`BbJxOF7RkUseX6V*mugO-`9<Y8C*W8Z!|{OtoYfN0z{&Y@FR9>DEmA_;IGQqv~v+ z5r6&~Dpa*N+ldW>#>U>m|1k{4117tq<SjFbg0z)%{uU*5_YwN}0F!sBtvxA)RGLv6 zUkeZKplg63&%lAxyNlhl94L2r(V8bNh<<y?%d3<^f%HVlq4K(7yAWB_b>TzWgsfwR zRHEGqTLq25yVX&}+<U{OO;ie5BtUTAiWMKIlzBdz(Zv4zL$nk|edDHR|A{OWRaIws zc^Y(=9r6t!U2L4mWZe&b<i}5>qefkkQwg_(Xi$u&sAxz5Cs4Z-@ZcD37Noi(tAF@q zmH-)`rcpO!Ly6?#7_o*O=ztg@l4y97{Nimyw?yUczyFH;cDj2&vp$wP$RF_O0q5rB zUrn8WOJ35WM+f)rWkKT!CZ^<d8{C&uA3RtIp~=tNh7&Qv5!`XM0MC7wu^C=Nbq<eq z-!@s0fX!^1I74oPhPsN|&=DillT?wwU3b`rW1QZ{l<!eF=Oju0B!n%oiDi5f>wbLb zFd@5P4p+aJw2dC22GM>{YU~lhIGc{sF!aK(lXN8oIFKRM0L-Z!FhV%w<+XsD%XfhQ zYTc&Zr5c3vkjW(obVyWwr&3P3kd!b)>+S6wbWA8d`4(JBF}Vv<svkszwdWTDEf(kJ zS6+<&^QT6dJ_XBBi@8xmQ<MZ8@17VFLB{GCi_sa+JerK$9WVeOY3ujQEmZCwv$H9S z>MnK>n~XWk2*ZbuAD{W`*|&*QQa})@&%w2d#(%?*kylxeS4S->G(gNy5i^+G_{j_K zzR!OCsKECJIb$2oZ}eW!8lOK4pAza5n9K<kL%foMPSWRx;~LEr7O0qtc9wxb&z?O6 zfkF;}Sa`)6%N#XWrw9TjIe=f3GM|6D0QdoKLkY#tFxhMS2Q}^N{m-8vcnY$6&;NR{ zdPtI1!SHOV4`r2*STaIE8FcR4AjLNK%b2sgxs>xTpW8t8UP&W)qE)g29hyv+L&%UF z=t=?z^v^Y=K#B{G-vtbaxnaAHCQ#vHm`YXqSKymX=a2y;?M8jGZ`)$0>SJvc$0&?q z%S-f~zcnd}fF?XO1>e)8F4{7XFK3=OQ{GQ~ykYa^o3yBc-wZ!EO^b3uAha!o?D8gR z%lR+L8Mu*&mGL|l?CuBKt76lS2=i0wQGfq>QU5Pmq~LRo!4+sbb?1NJ{<D*)J0GQ{ zDvh?cJUo*`u1fH^(GNf*+qyN|RW*j}mH!7oPgi+x+1K}FpW>Zs@p4hsATt157A)Qt zLLd~$GA&52*Tfyr>!F6h`^;`9CZ2~pAo*<VJbHVT1C0IGd!Go(_yKKvdnHfjdt1rV zY>v}(0usyLLm*oa{oq2g5$Kkcm9^E}ts%sX^T&9CDF@Ki<Kx;I;I`ic9|uE!w}bXA z?vR};!IuXbim?XzNHK_owelO~tz|iSTefUbzDF=-o#9XjKJJ;QlZgOu`r5;WwbzBo z{#y8)d{)wIh2_eX;9%N-6<kTmO7Q3emN+-)B&+CUVy=D3FqpO|N)+Ugrbl+CuNv;} 
zT$aQiM*+N@JIZ0DHYBa-`S-2@Gr}se#WCha`ql7oB=cH)25Q-E-MY<U5*}p(7z}qH zUIMW9oV-{}Nf(QqFY8){i6M2Wf@%zkj({k<yf#4_BNd+^&JDl8!Z&*fM}8BwZzVso z-uBBE%vxqlo0b3vKq7&r&pbb`ukt<kRv5gE4E}Z4r8<>DpqL-U0F<NS59Abq7LYF9 zRU7b3<sk!LZuUu4_nL_rp>S3~7Dk3;;t;OSf1J~ilP866nkZyyVv_suqZqThVZ2uP z9?0tniHQQUr7}|u;T&9kC-}6b%)}AX*_7Eyw}!YRN`3<a=i9HFNw?BfGwb%}J&0M! z5tw_n1|Ak6Qxp~)X_HD@zi4QZImQ7xr%#yzk3H?X-FP;?Lb9TUFdCx>8~L$Q{Rn4j zma=xw_$3Sc2ge0|EGRI|dD7)=wXu(BsQ1}8$_zp7)f9vj+lCYJ=@+~KJpVbGc5B%Q z<{t{<qt6{#I*)M;P_f3iWfZJsAJ5XfbUM8MQ<j*+rcP9Np}Qy~gVwrm>pOLw>(j^Q zNQvv=!%t{Vwwfcjj@+qm=KB`skQvJ`_527|6H@vyxU6Zg(S`S$+k0#KsbAg@?mSf( zEDd1vqyoS(b%<f_IjY6m3t6^9^zV`tqm7Q2d2v6w$s6gp_-wF&k@6l0B^h7pMq6{{ zwnBnfCnt4^Df$dPodsqBi^v)LU{;ZdumQ7#k^7+P!Lrd?q<}ZeOB&i!K`<5N^v)gV z+>dEM#<71=SJ3to@aWAOxc<RyZdN=qV^S5;6(b%KBBU3@|E<Cl{6HZ&3~LD~pL{D( z!J)5_Q-<KHRXb7}MoGd=>Aa(26Xl`(c-|nKf?iBqfe4i&wL9r1Dte@@q<Uv^=c*As z;XHbh?s=`d4!xC{I_CoJ95+xZ)vjT!l@qG%w2#5jd;*#?gSMDeRaSDyI#1X_P&FLS zz3ubepq8l{;{1>d+!Yd&lhgcH(zSxtDsyQUPEjz!-8iHchl#Hj|0RKBmh3O?BsQ(E zWn6Yc^|y7Evu|pO{PO%##O2E>L-Ae2cSSS!Cb5bJtofT(8um7nc}Ms0UQNx+x}2Iz zufD$Sd;I<J*Nj6dN=qBGPj|33=Gdj3eH=~s5CMV-#<|WBG`S&!6}M`Q2zZqkax#-9 zfIFmuh)eplLC)+*cBA~JTV+2HZ0tCJgj(i(i5QZR>2%GNoW}PPO%o;dKZoeE@PI!6 zg|9X)iz}TuEn(T+U7b#$HiMko9?a2+U)paP<bn~TT2khDcjC{ith&u)xT@#OOqR0s zu;4T_Q5Rc9?{d<iGn?zC4q0wt(OU^s#1X%96*%-%;h-k>w>ElAfTtmL$$H}lp<6n4 zV=^N^q987SnJ$=PSOA{h|9Zk*>cSm^ggY&-&Lrnrm#Ax0X?jzUlyBIt@N%O1%j@gR z*coWluE$7&I&+10+)O6ngHg?2p|G}{BuVBy8};X`#1c#(4OQ|ma`Uxo!$l*mfU#1@ zQs&G*uoQznpu;o-FY>fUh7bMW-*(^o>#$4_;HfjiO^SzXAHfjKJjWEny$HjM&Sw<x z2s@2FM|Lq3N{qb6>%l}w`DSk7a#h6EpQA;Aj@+XFT(~6Gab9XTHntum&dbCtEo8)N z>i2&URcvCBPA~$_ok(#u>~|=9HtP~Qzbf3?>wd})@qRr9OktYiMo$KD+c^yr*?F{N zf*o(q{*=creIf6>b4Yo4x!iY^_#Z(@>)s>RD|4CAj5W;22hme2Oi<|DdBAgom>C@> z+cud!W5&PfdhMiJ`vn{E>nor&x_St4;ptAZovoStf_>eFjvU!*oa1`#FsMBX)E}hP zgaR9s;C=iXV@u=>Bge;ZR>*eOu3jyiyA~dw{gkkOV=~atFdOSj6el?+=oinRNKV+e z@(18Q*VOis`k`XC^XFB@Yl)e>yjA?rT#?VfB6Xq*b%`y|XI8eyavcG7!|69SKXobz z6~KK(ROTN0$TI~dlO|SX&L_V``;@oxtT6^<<cxmy+{MPe#13wfi^imJ2sB2AOD7<_ zJ9g{~Bc4!IU!ETK5K*X4sWNw?ZKGziXG@R?1~-!cc4$RuN)Ozm{^Ip(_!Qz*A?juj zx#5_;V1kRNsAf*3TfrI{Bgen44HSH1#oFqsBRG#LY(b7Ranm92i4lBA-s6Cn)4{QF zYd1Iy{PZ?wLGgZdLyGI%Z||YCIn?8qHmt%`1K>DgurywaKx_Pl)`gZ+`|*PU&z`L# z3iJMimKQ$;o>BfWMm>hB(-YfzG-J^GApK6`r;Dr}b!h+oLH3XLFMdTF&29c2EU=yC z7%aqULISO6J~d~h!V+steQWEpMq|g0?a{q^f$v}!dY7QEr&=B1j2iVPUGE5ADgGMV zG<P<sY@`J*K@=E-X?!%+WR3*TmTGcpL#dDr6<MAhpjZl9!*P-yGb=jmNNmU_xyPz$ zpri%&`IpCrtM~6eb;5mOX1!l@M~69s4YGV9M+wPBA-6K>=9ZFEi??mvS`hUI4Y$|s zPV|+>w_CYv*}ON(PHVE?f4j(ygyp59*&X3IjgU*2OuoC_bi86}YUKBIV8MMc2DSBV zKE-F;&s;gWK+fFHG?WEW)huhx8Oz@m>QO=fpHUd+rlyA7xU@90q=E)Rg-e(qzUMrv zk83WyYu2dM9MHN!?K%M=X_qcOzyW3uW;B3B{u$Q-KtFpVlVUE)NUlD~w{A6LADm^t zV!_)Sc=!-{#B?#}+9BW5hu+}xIkmM@w&=1~^^|Yb_hIUnJ#Cw3R{4G^Y3}{K!;ZEb zQ2#NTE`05=gc6%4+lfJ(lP5znupIjFrZB#D)27?U>+<{Q1uymkC_ph>;-_`!kalUw z*i5tSoJJb4^iJ8rbEDK{s7C&_HqtFQ!Hxa;dbc3u;ltK)a%&LSQbka31{K{NtgcRD z1if}7p3?lTya5LTiEca@At`3JIEZDA=H^+P`hE(peq2BBZ)T4Xg9gp4x6EugCsQRd zXQqj%X=mS+ROPQalz;u|e0bb_;m9P`%Zd<p7y2(QE{><KT5`-Rq0<)ssWwafSh$9K zio1tVfN*c5QpvxbT?gzDox4}gR?%+@39?=_+pvDm&6^=B>9lP0Zb}R8J$L}D&LH)s zrk04#rv(Jg$Hym+A3G+z2~pCj9A~W5K82zax8e&_0o#}m3C4=FkgP+u5MNo(ncu@7 zqEw$#znn7^Xalf2GRKL+yl%v%NR5cd$md$+!3SfbXVTSAj<7wIm(?ReLER*L$|d>) z=H}@|K=^&t)}YeD(evs<hZIlP@3Ywk;T8plD2_p%*b&Ccl|kviPn%`>S{3n9T-~6m zeto$vR30$kWBQzlg6Tj}PAw348-kfFc3&tSA+ZQ4^F|W4Jhnt~4d~u^t+r8m1CMhI zBFx*shScm;NE!0U|6)wuDiG<{hb+a)*f^X9otD=83-s)OROXha5JJ{Eng0o-)^mq2 z!Ra?-?Xf=j@fr4EI8{QfMejEM7KE%1qY~OvCcR!>UbeH?jTS-T>$f-`_QmD)sTNni zXCpY`xEBdP2aH2#$&njD<EjmP`{m0u8BD^p^w_bkluVha;B%=JZ`$k4vnc+Lij*>e 
zDgF?n3s6d`$7h+~weja-f4V2)XdzTe(<-m2EZ=&F2%an&a7!2W%Cwo9wraRtTm@F6 z{Iy+mzNvY@-m;rln~glAu5<hG6K-Yx=nLE05oFb$2W_%}+q|S7#PX^kwtpe!<dg<g zk@BK<KV7|vR}^SkwbU@RX-j2qBxbbk)iNl6zoz~ny_?!dnNVtoY++#U-T`EHUM+p9 zGz-oObt`rVOgEQq$#R<h(hX*1C|i=%)z^_b(h?$RfkPe~`X@@^8FgrshJd=fl>3ij zdDvWwplCuoLG}67&s3QeJu`7|Tg;ufabp}G?MP&zdKxhd!rEOP2J4AdQIzlT<7hZ% zR#2-l!LdQhj#Hx9#Pc4Mz}KGaE{_4Ctj3$)Db_)wh?3*U#%t8thLk4r-g%&fUTjCT zRDVWPM!K@xe{Jo^bJLng@1h#rt+~leUjv%}nB2hr-_H(T^K316nOd6-S;9_GoJN&H z*ela(GJ>+js~(XSPGD0fd$k1cd+x&&Q#|+^HlX(>t0N2%VYaSabBaZKYHAk}C&M#{ zCGCOHP>Jq6R_2aF4EOBlz=;!a!^_u|ojiFm1YWJvDKmn8(0gj(`$eUGUS7y+ANwf0 z3#NM9U+61tG66j373Bo1d-U{~U(zrG2+LcA3ehN*rDo0AVwjcu=oxKluKyt0!*IVn z$8R-~9mdgwv=~1Em%r45jV@yv%HpfY5xIcF?XpBA2=$^XOYsCx3PhwQo|cl5!rGSn zV~~_e5Or{L!NT{Ns183H5)u*~oZN&;%1JJbaTUx?6i!_wlrk2iSHY+$t+tCUkNDvX zSbQcQHA|q_LTLpT8K{AE9zC_dfvJSgYW5*?GDidDLbOJ;+(p*_8iF@cRX#3K#O4YD zXnI;&D}#Sm4Tl7XqCB42s?-Q30LB^;vsnTqc5dw{rV#158CmG6n!kcq349bNw}0Qh zR}_Foe|H`B&#ie_Q`u&YnK^T&@)T23Q@Z?Gnwpd?kOk0V_#}S*jDLc^UkhMDBQ?#f z{n$yJ(zX-31SYcQmZvjgCpYt%r|NvV)v$4*0v$vUmOS5p^O!ZN32GQzG9EhT<rVsx z0!PUn{wY>~)wq|C0Hen5Pb>BftISX>y!!F%^=qM?oU;VAmWq~EIRxMw=8l}5orSZY zelo-&{J0a77dEpkEG%f5e$7ile9ih3dL0ykb<pLUM)BAor4dn?vCKgJPDaKiYMG!- zO0AeK-t%U)Gb99sM;(6;%I{%H3XcOZ`e*VN1Z5}ZzR%bXins11ejp+mgkh=!2j+ng zDHJ&+%tLq3rQ-;FLA8DQ*506P{|{GZ9#?bvzVTxQlRcCrYuVC@WGO1gUY1G=sVpN) zAtEhG%}m*&DA7=%qYYUiCCZW|YbjYOp-@ODlGN}0oXpJk^*eujUtcDtbDq!h+|PYq z*L7d_GM<U(yQJ1Yb4?0HatDw!8ryY*?6VK&D5y5k2v)UM*~w@u9G-;NQS7rZ+O}&) zz2LlV-4_5OP9tfW0CQs*<n#C6oOyJG3A%3Q$z=_+9p~oiZkDTInz)Eemz%WI*DVp4 zN~j!qOh3+~99KYWK0#4N#&9r2vJ-G3Ii`5Vs!M~4PD&4z?%joTSK}hDq(U_~bC)wu z_`2mEp0S7*Z4#M2@M2at{kwcy&6P92i(^Iu`{$j`1AUD8aF?W)6Mr{5JK<ayI}X2{ znxl%!$^&NZ*t!+l{^dl(ZT8uhFRKk7ULEiC>RwdRpq(T!V6Mcpxg`Qqsz@~^YKI+Z z2WxU~c+s=3$kSqMO4PtR0I6_KI#zX%dk!X_Q!%!~a4hY2`Ryc3wN9en5QEhv%a_}i z*0C{gOUcr%{WF~TpQ4d=?HhBB!q@_J-&jP4)%H(lUqSE@vm}JL{TiZTK!&aEajh}) zp#Lr{>Cx<;yLrrM-W=_afS`p`o8F{%4`O%H@<Y^D#sEd)ScqIjc##V0-I>peOm68x zoqK&+MCwN43ljq0lA92GI!mp{S-I#adugl>9igs$kl;t>4msQgA#v7mpl}94fiVP5 zdyTLYAwpEd5<n2kNd6U0TSlB}W8-Jshgn%N2pz7XnGP{lX!wZ-?F3Z>Hb%ps80cMk zvol`Tq@RL8yna0vKoigUJDO80_v>D)g@V-{GscHb15S*onc3+xXI3EO=j?+*I*Y`6 zi|&SHTus8ynp}rB>8-3>10wV+-AfNF9)6GRF20WlsZ6Svl`%{FG^L)nzX*2lpTJ*G z?avAe1;I;E3|jjPK>zlPd}yEX;JsmY{a<t7jDv)+=1E$xi1eDhgrGe0?Gi*l#sEBu zFSvR+5>^HR*sbzr)M2G?Ptcm<hMR`heL5Ei5vY(+s+kYzL7mpJL8W0a!({iQ=mu@w z%EW}jhgSP{z5H1jeE8jeN>Xz*b_oU;0#u;V98-JO@B^b?H^Ax>g@hnMyfQWar!U6e z2wIUt;eZkV9AZqIO`_mCpvO>RkT=gq&?sK%3;(CWRC-SUq0eYGoSL8pYah;arR`-A z=;^;7c}VeN+E1=@?urQFfBtEGANVUedfK*c&*;Ngo5Zl|f9t!902tzm=k}&e00ojB zsNGh|J~=0T>&A`XtwO5}L0+|0-0DtQTtnQL_-O}IgItIfv&+w5KvHhs^l42-Q%e1X z430iota@(z1yh5)IEP!z9(2p$uAhu@OQr@^{<yy+I;wv70AmaSvs<OPHVd7Cs)OtU z0GSAOfTK+1Nq)taKTqZ{@$SwoTOOymyWlMgh%5yD41)&W9t7|j7%m{{%FU)Sw*Bdp z6hGJ><8P(A*)2CW(bzeZbEG-}xQsk&j(r|`{0CVe*dVop@Im-I?{i{}nCHwqA979) z^RHLl@nOGX|Aym6Gj43Mr!Ebta${mAds6<{vmbFHxN1u;al_&|%!ccbLPjGtPFYY` zSeO+)3MP;Ht53vxa}J#<e_VpJo*KxJrvXK?LyL6vZ_7+Kyg<_4nFr;WM%nJm<Oq>o z3~3;x+0k8ZAOo#DhHysMmw-ZPe~E3pd-nrYh46OFFfu&cgH5K^XDy+WI0~#e2L!A7 z<}l6(cR42z?z5b9i$mQeN=z#ulbfS}W9Da-GsUzK!R8Hv!k}iNJOG**F=U9M6nV^S z8VDp~{^G*sc=94M{#yRv`jsm$87O6*m~Z-8Fi{d;k+FjrHF*wt-OtQqPODK>P0jlU zu^k$ZH<Og)3x8-UMla6!`M}HGyop16nW;JHtG0hEP(<CxXFmM`;%F<_Is3}}ZU1Ni zcFqLWfjXnl>>M8tV1xrwF#|UN64-A<2WAX<*L<q3PCjvB6x1@(1|(7ZjPS6g%D%i) z3~3SLO%j@}%T)A(&>uhr{h2?B{=q;=MFsNV0oW(<B_W8+R3k@YAv#LwiV$^d@`tNt z$@IL6wvVeEzBa?vf0a}jn9iIjxC8zI3x~Qd?+r-;$@Ai@G-PW;$j@wCO;{(RWDEws zY+`RHZH6pH4p|`OL5Xm0GTN3cVQ6@soZ{TMQL@v~c^Fo47h9XgN=zS9<OL{F{3YeI zTp;nc6uFWwkeS+<VtUOMfK9=4=L$-5486%bRUG+kJvPu`nCA^h@`$k@%T92P;S1iP 
z7hz|j_-nIYO$>4(h}An0t<m>=BN2c|heKi>$CdniYXxC;OVrnyb7~PkA|lemV}Dnw zk@dpnca+jRk5OBE^~x(|jP=!3{*eB8$Em3t6lDg$BC%Hp!LpR&2#kYtd*;lUcqpiO zB#AKWZ#y+HA8%b_Ev@CO4S>ccF_)b#b?WQ>zTMF9nA-qW(DA>D>xaQ@pi4r^+L7b$ z0Xlx^x2M0Dq%MkHOY4L!qv}an0~G;($=O2Yx;ioCiwK<4YYwxK6)zVS3d%n1B?POx zheuZ_4XPg?v<7&srT1FsuZ1w9>?7g_G-9e(DL7-!_MS@03QjGrU-={&<rDX`Bek(y z0M^Bhm6&GX6$+68WlTeyYLGohDy$c%o-;QL_!xmt&=S;cK4LUDy<@x$m3tbtBJnj3 z*}{1HpbVxAhba=m1M^AAOLL?eV1=<lkh~-P``-FclxoJ5#pXx(^l}T-p~y$)&V8!( z`pvp+xUx=(s{e!;ET6W^-p7pVj(@~_^H#s!IRD&{CXC(~V20SOr(0hPxcBlQH$@EB zRZ8Z8mqjO|fIms73>%Wb@lF2%zNn0QiJ~iTCL_AVUZJj#=?aVs%M-*d-1TB+h4(y% zX7uAk4*h)+Jx@kh^nlym`+MZf;uW?9>=`@|yLIWZjwr}KsCkp~JF8EI3ZT-Vb!+S$ z2?zJcaLqpS1|Xk3aNs~>sQ{uvmdG(+s|9V}9y4ByWz%=Dp<ahX=}k-lymLA%0*(40 zF><O&IRWv)(3Ej1M==D#GgjoV)WEP80;Q2CRiRum%Yv(v>ua}c-yXUZ(qDdwRdm<r zMWYV1`-u2A>?+g`#}0`k)F&BE(2}FHrBh29zyGwNA!4?N@Ezj_i}rth58laKy`CI; z^2^q$XoeqgS#>&|bd*GrO&gub)%Tlw#xyfCO`nC_5KL0y+}I@cqE~0_^HZRD*b$~` z8qm&k?;du>)@f^CTSgJhsO*JGC+#H{lmYxW*Bf`1>U+(4E-5TCp0J&%nsfJq;ljN< zx_<rjXr1m7^Y-jc4iYb!Oa__g%93)dsghs#`5l_Rf7D6t6e>8>a>5%l*Gt5v^ku|s zRD^qkj)}&W-d6EeYy;LP8Q(yck2A1)%#MXGg+)bJfI+C{$dR@iHW^DoJCZaK6S|1` zSO@~_%13dmgD$u>Q!;bjtv_s-_v-^5ro-kHAAET8W_*<e;JcVLRSS7#+N}gOC~Rv= zvA3B=VP<AuE;gAZy}oAOOJ+n2A3SnoL2j-a&VU2#?N=kow`io{XReAEP!YNVkhWru z>X0F;ps+zJ90`-e1Pa_QmFGXO9<!bIhhZ>|+L{9+MvK6Y?O}2$Ivwak)}3hD#qkRP zgUjz5g|h6zrjeja6u@|mEQbEGoac^}M0`;TiAgH6%ig|y+k>7~;3lT7W!M5HG;+Pu z(IC*GWQUc1coeaas0&lQJ})*gF(Cq=cqtQ2ie`9afmHD^zf>0e{ctX8IgIpb+4rYB ze6Oc{CF+JEj{8bo5!<wF_gK<VlHDt^u9*_rc}E3=RI@M7>x|J`%pqvaz?~{8DlmPd zzQ(wX2Q({na3J>2EivaaZ))2e*Y0f8nH(%wXa!Y~+o0d;q;p)%_W?1VoB)g<I|P3; zApI#!2aJ3ja4Hv;W(QOqRvxxb5Yb>rGEO0f1X~CLitb4jXevYrF8BnZ8{3EK<Dmmz z^_B2QS7dIm@cGIuCwh05VlD3u0z+aASO9AyFvW*{8bm9KozO)CB_%<x$=^SSmmklU z-|+*)2oO<OilPC%4X+*aFUqY;FOb!|TP${I$s9Zr$#(R(F8H!(uCtQhFEDG5ii#4l z<c}=BUUws(T5h-iMM1G4j>uh2-*>H0F5+|3F?%50OhTPJQDm}?f9rf(VzOlMV$(pB zP9dFRR(nPC1}g$~C<Yu2ROHf{*pR^Z*+M>fOl&ObdMkw-90N&x!iQA!D1mOIP3-=P z3(8WMOhMdcmiT?qZ^R`yKy+?V_Pf13rbsjPJC3a@xjj-lAi$U(e>qWggl;8Ie9Db; zIA^$X-V~LPCxIASLh{o?RRu%Vn-J1}S(|r`Z6*GSheQ=d@vfk-IVh-IhYn)Y00Apd zvAn#TlofSn5P~miEE+8ek4*1T)Mt^{k>)W%Jxd)#*vjzwd}Peh<;x3++g$174daAh ztK=I~?rBN%rb^sOJ&oyO{bWsvx!~G~UxA9kHv-)<Ybj|>Gq6dHo10tAq0*?G!QtV9 zl-<5AX_hPW#dgQensm4WwL`w4p3g!W`8A|h$3GzB=X%$4oBWjQ=pHFD?s>W4KpSN% z)~ADUcj=w?V*|ka6a`^wA~fxLnKIvl8POb&Wjl1hB;$t+8FC^ydD_gG<kPz+mD(&X zf2l`>n@{W9t_k`hM(Es3E^uIdqn2$TE9Uz(Op7HoF>MUcOy%SmJ*cOlf#lxk!C*>h zzlVM^-I7dqthaV6Ox(4;#n*~`z>s1MQBD$JgmNB1cU{%<@v0M?ot?S1c|iey>hmg1 z*({dUf7-w43rCPN@slT!2GI>v&Kpsdb)z;&7ixFFOCZN%HiiI|MlKyq9uGxhgmg2Y z8zDA#4iQ1VtffX~@omobzkTP9Ov;|?YiKw(!p8G^8m!(g=i|8C5%ju{o>B|zR(7Pz zS`Uo@@eFYddo(p*wAso`|7N)ZebSg@2KI13?e((@>pl3p7A^CgYq+1y#?IxIp~yV8 zM9L|%wA>{>yj7rd*4*My?c7{Sqhg*8*3AF3*&glD1UWnWqkHz>pvhw&_@w5fmWqzo zqdPwxMJf<kqw3@98&zX*&UV1{3mT&<t#4lVbA3(EJm;FpLJTmu&ktyNRI{R=jD8Q< zcofk*G~)3(ayp^0CvUm<r@Xe!zR2E%nix82R4u^9r|3Wa5cOQi+qX~2X+&=mKW47M zFXZ3(f!=cY32h!7?xNw#1Zd9(2wu9Gq?w~Kjmt>l1kfJ6l<!KC&pk@xwOgy1CzF#y zw{G9QU2V`9ST~Od#v$1WUQ@ItIPts@F~fAc_;Nq2yNZhG8pqt@DZH=Qv}syH_C`iJ z7S;>Ya_m*<=J4uHeuVn!nZfJYDH#OiwIKLyIixc=Wu#r&1r`uBknx&@5=rF1kt3-Y zE^rP+v?hD3JV-#E^7L7S*IWTT_<!n0;;N<f4#+B}$S@O}j{*;|I0RGnM9=~m{t49? 
[GIT binary patch payload (base85-encoded binary data) not reproduced]
z&rrG^I(SgFk)b@D131&v8&AI9g0&$6pa=nt;fBr!-k)T+wNgpYHJeUiHZF8>G!j?1 zhvLg6HhYGk#=A_e<l^#Ah%WwY7yxqs{VOmheNl)fx+WdLa;%c2m>ObMzyN{?6f%!p zjEdjj<|kCiPSHO+9PB(5LoabWtDC7cY(L=4#1rp0gnN~@T&0FjW#Gm&0{oG!XIcQ> zqk*qz$n2KWJjawlG*0<Xsa)+kB2(c?8}h`lwPgw=astO-6^b1}JP6yLv6uk=$ApGn zrS_cf>wEM;zz-Q(WhBB>8jdMZ1`{6t+582s!0Ca#hm@y62Vj$OX4%rEj7t2zvgKm5 zSgg((zS}Uv#;y5c(@WT%#WjzP7MPIlJO?U35aOeu-2u44Tj1#SeS7vS4-TfoJq*~S zH%|9^JbPk|6Ot!Oh?cJz6#0;@%r9D-H%1iaHuxl*!J?w67cwk+;{cp+R~H@jTy~@< zL_TrT?EnsW)_L>gWQXp@B~KS6s7-BK<cBVo9ZI5y_&(jgFDQD6F@+jm1MVJ>1YtQH z*d3hQA=?Aa2`h_^(9a{88jffZdCl<QWs8g|H~+3sxz<9vq`D4b6GvG9k}NHkS3lco z*?U$k#;DUSEI-LERkoxL4E^)_$^N7Y7WftjX!FN~bV#IxxS-8o-U7+hZSW}i7qW}` z+a{hmpbpVXHIBH)xh3Ao9_#$2M(zWymi+f?x%Y67=OJ#WukWBS{Q8|cDl1~Y9)CMP zdPc`=OTq%qO*g-T!dm4^<x8+P1Olcbj)B+`|2!|Q@J`E#6>h46Lb{&NhVkHRe*E=y z{-#d_je+l}udK)oU%zf057zEIdt5v`j=#CBU^<`i4_sXX<TE@ZbQwh+Cm&`#PuQ2= z@{&9Zas0&xbo*6b3|&?tso9HDc(Mm7M2Cbg%$^HPCQwww6&`rv=G3<3K7DLT4%le) z+xi(@Hk@y#c<{zHyWGWR$E5zT=h&gRFNrfN-W%^X9ozWJw5#{^ZaFXh{fwUUkqIC8 z;=q@wXYRWBXP@exd1uAH9~)Ox4IK12Wq025Q}d@K7(O>{>Y6z8+UIFMzVef>;*77; z)8>zByrKS^-50+<Sn!Qf#BY?P-yD$m&D#F|^)|DRf9nAGD^08Ksh!{1|JaeOps;E5 YPZf=B{%_`FG=G9g6Q)=kH=D8Uf82kiIRF3v diff --git a/tools/update-dockerfile-graph.sh b/tools/update-dockerfile-graph.sh index a1e22a69cdc7..88189e8ab208 100755 --- a/tools/update-dockerfile-graph.sh +++ b/tools/update-dockerfile-graph.sh @@ -24,7 +24,7 @@ if printf '%s\n' "${FILES[@]}" | grep -q "^docker/Dockerfile$"; then fi # Define the target file path - TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png" + TARGET_GRAPH_FILE="docs/assets/contributing/dockerfile-stages-dependency.png" # Ensure target directory exists mkdir -p "$(dirname "$TARGET_GRAPH_FILE")" diff --git a/vllm/config.py b/vllm/config.py index 5cd08db43712..37cec84089d1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -988,7 +988,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid from vllm.platforms import current_platform if not current_platform.is_async_output_supported(self.enforce_eager): @@ -1004,7 +1004,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.runner_type == "pooling": self.use_async_output_proc = False - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if speculative_config: self.use_async_output_proc = False diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b561a1a77487..988ba14dbc27 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1084,7 +1084,7 @@ def create_engine_config( disable_log_stats=self.disable_log_stats, ) - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 4cfb22c5a750..323580fa7482 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -67,7 +67,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache def 
_log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. " diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 5c0c90972b58..c79c603c02eb 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -75,7 +75,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: model_config.enforce_eager = True diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 6ba5a51007b4..252c80957305 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -114,7 +114,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/features/compatibility_matrix.md +# Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid class SpecDecodeWorker(LoRANotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/utils.py b/vllm/utils.py index 25694c121581..511db7e67bc8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -85,7 +85,7 @@ # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/features/compatibility_matrix.md +# Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 0825abbed143..d9cf2055ed56 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -824,7 +824,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index d925f088357b..e2854bcb37ce 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. 
''' - # Reminder: Please update docs/source/features/compatibility_matrix.md + # Reminder: Please update docs/features/compatibility_matrix.md # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: From f2036734fbf6d4b119d9362dddb8b4a6954e3591 Mon Sep 17 00:00:00 2001 From: Pavani Majety <pmajety@nvidia.com> Date: Fri, 23 May 2025 15:52:20 -0700 Subject: [PATCH 115/192] [ModelOpt] Introduce VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE env var to control blockscale tensor allocation (#18160) Signed-off-by: Pavani Majety <pmajety@nvidia.com> --- vllm/_custom_ops.py | 12 +++++++++--- vllm/envs.py | 8 ++++++++ .../layers/fused_moe/cutlass_moe.py | 18 +++--------------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e74d139ab980..3c8e6b95ce76 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1085,7 +1085,6 @@ def scaled_fp4_experts_quant( blockscale_offsets: torch.Tensor, topk: int, expert_map: Optional[torch.Tensor] = None, - MAX_TOKENS_PER_EXPERT: int = 163840, ) -> tuple[torch.Tensor, torch.Tensor]: """ Quantize input tensor to FP4 and return quantized tensor and scale, for @@ -1107,9 +1106,16 @@ def scaled_fp4_experts_quant( input_tensor = input_tensor[ expert_map] if expert_map is not None else input_tensor m_numtopk, k = input_tensor.shape + # Control the maximum number of tokens per expert supported by the + # NVFP4 MoE Expert Quantization. This is used to prevent the kernel + # from running out of memory. This value can also be increased to support + # larger models. + MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), ( - f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT * topk for" - f" scaled_fp4_experts_quant kernel, observed m_numtopk = {m_numtopk}") + f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT(" + f"{MAX_TOKENS_PER_EXPERT})" + f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. Use" + f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value.") scales_k = k // 16 padded_k = (scales_k + (4 - 1)) // 4 diff --git a/vllm/envs.py b/vllm/envs.py index 363ba14ce4c8..88953af1042f 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -117,6 +117,7 @@ VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 VLLM_ALL2ALL_BACKEND: str = "naive" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 def get_default_cache_root(): @@ -814,6 +815,13 @@ def get_vllm_port() -> Optional[int]: # - "pplx": use pplx kernels "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), + + # Control the maximum number of tokens per expert supported by the + # NVFP4 MoE CUTLASS Kernel. This value is used to create a buffer for + # the blockscale tensor of activations NVFP4 Quantization. + # This is used to prevent the kernel from running out of memory. 
+ "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": + lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), } # --8<-- [end:env-vars-definition] diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index aff108112b61..26a433da2189 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """ CUTLASS based Fused MoE kernels.""" -import os from typing import Optional import torch @@ -271,8 +270,6 @@ def cutlass_moe_fp8( FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max -MAX_TOKENS_PER_EXPERT = int( - os.environ.get('VLLM_MODELOPT_MAX_TOKENS_PER_EXPERT', '65536')) def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, @@ -330,10 +327,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype" assert (topk_weights.shape[0] == m and topk_ids.shape[0] == m), ("topk must be provided for each row of a") - assert (m <= MAX_TOKENS_PER_EXPERT), ( - f"m must be less than MAX_TOKENS_PER_EXPERT({MAX_TOKENS_PER_EXPERT})" - f" for cutlass_moe_fp4, observed m = {m}. Use" - f" VLLM_MODELOPT_MAX_TOKENS_PER_EXPERT to set this value.") + out_dtype = a.dtype num_topk = topk_ids.shape[1] @@ -362,8 +356,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, expert_offsets, blockscale_offsets, num_topk, - expert_map=a_map, - MAX_TOKENS_PER_EXPERT=MAX_TOKENS_PER_EXPERT) + expert_map=a_map) c1 = ops.cutlass_fp4_moe_mm(rep_a_fp4, w1_fp4, rep_a_blockscale, w1_blockscale, w1_alphas, problem_sizes1, @@ -378,12 +371,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor, torch.ops._C.silu_and_mul(intermediate, c1) int_fp4, int_blockscale = ops.scaled_fp4_experts_quant( - intermediate, - a2_gscale, - expert_offsets, - blockscale_offsets, - num_topk, - MAX_TOKENS_PER_EXPERT=MAX_TOKENS_PER_EXPERT) + intermediate, a2_gscale, expert_offsets, blockscale_offsets, num_topk) c2 = ops.cutlass_fp4_moe_mm(int_fp4, w2_fp4, int_blockscale, w2_blockscale, w2_alphas, problem_sizes2, expert_offsets[:-1], From 4fc1bf813ad80172c1db31264beaef7d93fe0601 Mon Sep 17 00:00:00 2001 From: Feng XiaoLong <79261065+Crucifixion-Fxl@users.noreply.github.com> Date: Sat, 24 May 2025 07:16:26 +0800 Subject: [PATCH 116/192] [Bugfix] Migrate to REGEX Library to prevent catastrophic backtracking (#18454) Signed-off-by: Crucifixion-Fxl <xmufxl@gmail.com> Co-authored-by: Crucifixion-Fxl <xmufxl@gmail.com> --- .github/scripts/cleanup_pr_body.sh | 2 +- benchmarks/benchmark_serving_structured_output.py | 4 ++-- benchmarks/kernels/graph_machete_bench.py | 2 +- examples/offline_inference/prithvi_geospatial_mae.py | 2 +- pyproject.toml | 1 + requirements/build.txt | 1 + requirements/common.txt | 1 + requirements/nightly_torch_test.txt | 2 +- setup.py | 3 +-- tests/entrypoints/llm/test_guided_generate.py | 2 +- tests/entrypoints/openai/test_chat.py | 2 +- tests/entrypoints/openai/test_completion.py | 3 +-- tests/entrypoints/openai/test_prompt_validation.py | 12 ++++++------ tests/models/multimodal/generation/test_phi4mm.py | 2 +- .../multimodal/generation/vlm_utils/model_utils.py | 2 +- tests/tool_use/test_tool_choice_required.py | 4 ++-- .../entrypoints/llm/test_struct_output_generate.py | 2 +- tests/v1/entrypoints/openai/test_completion.py | 2 +- tests/v1/sample/utils.py | 3 ++- vllm/collect_env.py | 2 +- 
vllm/config.py | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/protocol.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 2 +- .../openai/tool_parsers/deepseekv3_tool_parser.py | 3 ++- .../tool_parsers/granite_20b_fc_tool_parser.py | 2 +- .../openai/tool_parsers/hermes_tool_parser.py | 2 +- .../openai/tool_parsers/jamba_tool_parser.py | 2 +- .../openai/tool_parsers/llama_tool_parser.py | 2 +- .../openai/tool_parsers/mistral_tool_parser.py | 2 +- .../openai/tool_parsers/phi4mini_tool_parser.py | 2 +- .../openai/tool_parsers/pythonic_tool_parser.py | 2 +- vllm/lora/models.py | 2 +- vllm/lora/utils.py | 2 +- vllm/model_executor/guided_decoding/utils.py | 2 +- .../guided_decoding/xgrammar_decoding.py | 2 +- .../layers/quantization/compressed_tensors/utils.py | 2 +- vllm/model_executor/layers/quantization/modelopt.py | 2 +- .../layers/quantization/quark/utils.py | 3 ++- .../layers/quantization/utils/gptq_utils.py | 2 +- vllm/model_executor/model_loader/tensorizer.py | 2 +- vllm/model_executor/models/mimo_mtp.py | 2 +- vllm/model_executor/models/minimax_text_01.py | 2 +- vllm/model_executor/models/phi3v.py | 2 +- vllm/model_executor/models/qwen_vl.py | 2 +- vllm/model_executor/models/transformers.py | 2 +- vllm/multimodal/processing.py | 2 +- vllm/reasoning/granite_reasoning_parser.py | 2 +- vllm/transformers_utils/tokenizers/mistral.py | 2 +- vllm/utils.py | 2 +- vllm/v1/structured_output/utils.py | 2 +- 52 files changed, 62 insertions(+), 58 deletions(-) mode change 100755 => 100644 .github/scripts/cleanup_pr_body.sh mode change 100755 => 100644 setup.py diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh old mode 100755 new mode 100644 index 3246c6f9bc4b..8d65936fba1d --- a/.github/scripts/cleanup_pr_body.sh +++ b/.github/scripts/cleanup_pr_body.sh @@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)" python3 - <<EOF -import re +import regex as re with open("${NEW}", "r") as file: content = file.read() diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 5088c805f53e..6a50f47d3951 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -672,7 +672,7 @@ def process_one_metric( def evaluate(ret, args): def _eval_correctness_json(expected, actual): # extract json string from string using regex - import re + import regex as re actual = actual.replace("\n", "").replace(" ", "").strip() try: @@ -687,7 +687,7 @@ def _eval_correctness_choice(expected, actual): return actual in args.choice def _eval_correctness_regex(expected, actual): - import re + import regex as re return re.match(args.regex, actual) is not None diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index ab364a84d6cb..0c86e4072957 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -2,11 +2,11 @@ import math import pickle -import re from collections import defaultdict import matplotlib.pyplot as plt import pandas as pd +import regex as re import seaborn as sns from torch.utils.benchmark import Measurement as TMeasurement diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index f97a1f32e621..e3cc606db7a9 100644 --- 
a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -20,12 +20,12 @@ import argparse import datetime import os -import re from typing import Union import albumentations import numpy as np import rasterio +import regex as re import torch from einops import rearrange from terratorch.datamodules import Sen1Floods11NonGeoDataModule diff --git a/pyproject.toml b/pyproject.toml index 762ac9e11566..2e4242f6d5c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ requires = [ "setuptools-scm>=8.0", "torch == 2.7.0", "wheel", + "regex", "jinja2", ] build-backend = "setuptools.build_meta" diff --git a/requirements/build.txt b/requirements/build.txt index 5edc593b9270..320e5b892584 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -7,3 +7,4 @@ setuptools-scm>=8 torch==2.7.0 wheel jinja2>=3.1.6 +regex diff --git a/requirements/common.txt b/requirements/common.txt index 80f90e60007e..f31824b55026 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,3 +1,4 @@ +regex # Replace re for higher-performance regex matching cachetools psutil sentencepiece # Required for LLaMA tokenizer. diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 3aebcaa623c0..e9b466d3a82d 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -38,4 +38,4 @@ matplotlib # required for qwen-vl test # required for Multi-Modal Models Test (Standard) num2words # required for smolvlm test pqdm -timm # required for internvl test +timm # required for internvl test \ No newline at end of file diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 7675fbdf3efe..180f2f978501 --- a/setup.py +++ b/setup.py @@ -5,12 +5,12 @@ import json import logging import os -import re import subprocess import sys from pathlib import Path from shutil import which +import regex as re import torch from packaging.version import Version, parse from setuptools import Extension, setup @@ -389,7 +389,6 @@ def run(self) -> None: # vllm_flash_attn python code: # Regex from # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` - import re compiled_regex = re.compile( r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") file_members += list( diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index fdbdccd4654c..dd5d17885eb9 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re import weakref from enum import Enum import jsonschema import pytest +import regex as re from pydantic import BaseModel from vllm.distributed import cleanup_dist_env_and_memory diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index a10b42ea3a4b..2509ef0d280a 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -2,13 +2,13 @@ # imports for guided decoding tests import json -import re from typing import Optional import jsonschema import openai # use the official client for correctness check import pytest import pytest_asyncio +import regex as re import requests import torch from openai import BadRequestError, OpenAI diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 1d9aa4972b70..9d12f27a2b87 100644 --- a/tests/entrypoints/openai/test_completion.py 
+++ b/tests/entrypoints/openai/test_completion.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 - # imports for guided decoding tests import json -import re import shutil from tempfile import TemporaryDirectory from typing import Optional @@ -11,6 +9,7 @@ import openai # use the official client for correctness check import pytest import pytest_asyncio +import regex as re # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index f889189a9968..e384915899d3 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # imports for guided decoding tests -import re - import openai import pytest +import regex as re from ...utils import RemoteOpenAIServer @@ -32,7 +31,7 @@ async def test_out_of_vocab_token_ids(): client = remote_server.get_async_client() with pytest.raises(openai.BadRequestError, - match=re.compile('.*out of vocabulary.*')): + match=re.compile('.*out of vocabulary.*').pattern): await client.completions.create(model=model_name, prompt=[999999], max_tokens=5, @@ -46,9 +45,10 @@ async def test_reject_multistep_with_guided_decoding(): with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() - with pytest.raises(openai.BadRequestError, - match=re.compile( - '.*Guided decoding .* multi-step decoding.*')): + with pytest.raises( + openai.BadRequestError, + match=re.compile( + '.*Guided decoding .* multi-step decoding.*').pattern): await client.completions.create( model=model_name, prompt="Hello", diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index 5a12b5910949..e51dbee479c5 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re from collections.abc import Sequence from typing import Optional import librosa import pytest +import regex as re from huggingface_hub import snapshot_download from transformers import AutoTokenizer diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index b71400fc8312..743c7f947697 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -3,11 +3,11 @@ for manipulating the input / output of HF & vLLM test runners, which are typically specific to a small subset of models. 
""" -import re import types from pathlib import PosixPath from typing import Optional, Union +import regex as re import torch from PIL.Image import Image from transformers import (AutoConfig, AutoTokenizer, BatchFeature, diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py index 2ab87a0ef41f..291769848145 100644 --- a/tests/tool_use/test_tool_choice_required.py +++ b/tests/tool_use/test_tool_choice_required.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from copy import deepcopy from unittest.mock import MagicMock import pytest +import regex as re from pydantic import TypeAdapter from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, @@ -333,4 +333,4 @@ def test_streaming_output_valid(output, empty_params, delta_len): combined_messages += message.tool_calls[0].function.arguments combined_messages += "}]" assert json.loads(combined_messages) == output - assert json.dumps(json.loads(combined_messages)) == output_json + assert json.dumps(json.loads(combined_messages)) == output_json \ No newline at end of file diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 25bbcd901d6a..5f1fff200de3 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -4,12 +4,12 @@ from __future__ import annotations import json -import re from enum import Enum from typing import TYPE_CHECKING, Any import jsonschema import pytest +import regex as re from pydantic import BaseModel from tests.reasoning.utils import run_reasoning_extraction diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 3ffc54f520b4..333ad23795f3 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -import re from typing import Optional import openai # use the official client for correctness check import pytest import pytest_asyncio +import regex as re from openai import BadRequestError from tests.utils import RemoteOpenAIServer diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py index f540895bbf14..932b652aea32 100644 --- a/tests/v1/sample/utils.py +++ b/tests/v1/sample/utils.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import re from enum import Enum from typing import Optional +import regex as re + from vllm import CompletionOutput diff --git a/vllm/collect_env.py b/vllm/collect_env.py index 85746b7ef606..86eb465b8f65 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -815,4 +815,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index 37cec84089d1..c0671d2524ec 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -6,7 +6,6 @@ import hashlib import inspect import json -import re import textwrap import uuid import warnings @@ -20,6 +19,7 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, Protocol, TypeVar, Union, cast, get_args, get_origin) +import regex as re import torch from torch.distributed import ProcessGroup, ReduceOp from transformers import PretrainedConfig diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 988ba14dbc27..3b90880167dc 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -4,7 +4,6 @@ import argparse import 
dataclasses import json -import re import sys import threading import warnings @@ -13,6 +12,7 @@ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, get_origin) +import regex as re import torch from typing_extensions import TypeIs, deprecated diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0ab6fcdca1a4..2da89b4f5944 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -7,7 +7,6 @@ import inspect import multiprocessing import os -import re import signal import socket import tempfile @@ -21,6 +20,7 @@ from typing import Annotated, Optional, Union import prometheus_client +import regex as re import uvloop from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index da01eb472c44..393cf381b16b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,11 +3,11 @@ # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import json -import re import time from http import HTTPStatus from typing import Annotated, Any, ClassVar, Literal, Optional, Union +import regex as re import torch from fastapi import HTTPException, UploadFile from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ee18e0b0a454..bc11686d7be8 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -2,7 +2,6 @@ import asyncio import json -import re import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence @@ -10,6 +9,7 @@ import jinja2 import partial_json_parser +import regex as re from fastapi import Request from pydantic import TypeAdapter diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index bd8e87e4cee8..14e743e13a72 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Sequence from typing import Union +import regex as re + from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index a589f814f88f..383e0d44de99 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from json import JSONDecoder from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index e56a8ef7193c..2b9f9852bcb3 100644 --- 
a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index fbe2ecbb4701..2714a545f997 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 9338718908cd..4eda7044cbba 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from json import JSONDecoder from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from transformers import PreTrainedTokenizerBase diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 9dbfe85ecc68..fecad7e653ab 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from random import choices from string import ascii_letters, digits from typing import Union import partial_json_parser +import regex as re from partial_json_parser.core.options import Allow from pydantic import Field diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index e4ac2c47ba08..b403a146716d 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re from collections.abc import Sequence from typing import Any, Optional +import regex as re from transformers import PreTrainedTokenizerBase from vllm.entrypoints.chat_utils import random_tool_call_id diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 5f5ee43b0482..548ff39d1ca4 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -2,10 +2,10 @@ import ast import json -import re from collections.abc import Sequence from typing import Any, Union +import regex as re from transformers import PreTrainedTokenizerBase from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 83aef62451a1..af5cebdf2a8b 100644 --- 
a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -3,11 +3,11 @@ import copy import math import os -import re from collections.abc import Sequence from dataclasses import dataclass, field from typing import Any, Callable, Optional, Union +import regex as re import safetensors.torch import torch from torch import nn diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index b66850d4304f..619dd3bdc40a 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re from typing import Optional, Union import huggingface_hub +import regex as re from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, HFValidationError, RepositoryNotFoundError) from torch import nn diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 1ad1ef8fbf16..3f77cf394d9a 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import re +import regex as re def has_xgrammar_unsupported_json_features(schema: dict) -> bool: diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 7ca7bab818fc..d2e568609945 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -4,10 +4,10 @@ from __future__ import annotations import json -import re from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any +import regex as re import torch import vllm.envs diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index ccd54281ceb7..75e81c4dd49d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Iterable, Mapping from types import MappingProxyType from typing import Optional +import regex as re from compressed_tensors import CompressionFormat from torch.nn import Module diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index b108b02a43e2..1c5680f952ab 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -228,7 +228,7 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": exclude_modules, group_size) def is_layer_excluded(self, prefix: str, exclude_modules: list): - import re + import regex as re for pattern in exclude_modules: regex_str = pattern.replace('.', r'\.').replace('*', r'.*') if re.fullmatch(regex_str, prefix): diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index d1d293b01791..5e56bcb7564c 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Iterable, Mapping from types import MappingProxyType from typing import Any, Optional +import regex as re + def deep_compare(dict1: Any, dict2: Any) -> bool: if type(dict1) is not type(dict2): diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py 
b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index ff7a8169e6fb..36161d13b24f 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -import re from copy import deepcopy from typing import Optional, Union +import regex as re import torch from vllm.config import QuantizationConfig diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 900f12ebe6ca..6f9408d892c3 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -7,7 +7,6 @@ import io import json import os -import re import threading import time from collections.abc import Generator @@ -15,6 +14,7 @@ from functools import partial from typing import Any, BinaryIO, Optional, Union +import regex as re import torch from torch import nn from torch.utils._python_dispatch import TorchDispatchMode diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index adcfcaa6b1e6..cbca6a4c8f9d 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -250,7 +250,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params def map_model_name_to_mtp_param_name(self, name: str) -> str: - import re + import regex as re name_without_prefix = [ "token_layernorm", "hidden_layernorm", "input_proj", "final_layernorm" diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 7724e52c1ce1..36bab9ee13b1 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -2,10 +2,10 @@ """Inference-only MiniMaxText01 model.""" import copy import math -import re from collections.abc import Iterable from typing import Optional, Union +import regex as re import torch import torch.distributed import torch.nn.functional as F diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index bb4d46be3f99..b757e661d771 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -14,10 +14,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import re from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict, Union +import regex as re import torch import torch.nn as nn from transformers import (BatchFeature, CLIPVisionConfig, PretrainedConfig, diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 3701153bace5..57a66b793711 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -7,12 +7,12 @@ import copy import math -import re import unicodedata from collections.abc import Collection, Mapping, Sequence, Set from functools import lru_cache, partial from typing import Callable, Literal, Optional, TypedDict, Union +import regex as re import torch from torch import nn from torchvision import transforms diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index b22d81d88abe..b87a2ebf211a 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -14,11 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """Wrapper around `transformers` models""" -import re from collections.abc import Iterable from contextlib import nullcontext from typing import Literal, Optional, Union +import regex as re import torch from torch import nn from transformers import AutoModel, PretrainedConfig, PreTrainedModel diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index f56110d94ab2..f7a3c327982d 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import json -import re import sys from abc import ABC, abstractmethod from collections import defaultdict @@ -12,6 +11,7 @@ from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, TypeVar, Union, cast) +import regex as re import torch from typing_extensions import assert_never diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 0dae02d33fec..07a63e294df4 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -import re from collections.abc import Sequence from typing import Optional, Union +import regex as re from transformers import PreTrainedTokenizerBase from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 551c2d55b4fc..05de6a603655 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import os -import re from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, Union, cast import huggingface_hub +import regex as re from huggingface_hub import HfApi, hf_hub_download from vllm.logger import init_logger diff --git a/vllm/utils.py b/vllm/utils.py index 511db7e67bc8..50296aada4cc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -19,7 +19,6 @@ import multiprocessing import os import pickle -import re import signal import socket import subprocess @@ -54,6 +53,7 @@ import numpy as np import numpy.typing as npt import psutil +import regex as re import torch import torch.types import yaml diff --git a/vllm/v1/structured_output/utils.py 
b/vllm/v1/structured_output/utils.py index f33f4972e103..111e92dc0990 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -2,7 +2,7 @@ from __future__ import annotations -import re +import regex as re def grammar_is_likely_lark(grammar_str: str) -> bool: From 2b10ba749177513e6423ff26bbb6d45fe17ee62b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Fri, 23 May 2025 19:30:16 -0400 Subject: [PATCH 117/192] [Bugfix][Nixl] Fix Preemption Bug (#18631) Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com> --- .../unit/test_remote_prefill_lifecycle.py | 81 +++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 31 +++---- 2 files changed, 97 insertions(+), 15 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index fc4928f9ebd1..6fcff0d62045 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -340,3 +340,84 @@ def test_full_block_prompt(): output = outputs[0] assert output.finish_reason == FinishReason.STOP assert_scheduler_empty(scheduler) + + +def test_cannot_schedule_after_recv(): + """ + Test that we can handle no schedule after recv due to not + enough remaining KV blocks. + """ + + # NOTE: the KVCacheManager will use 1 null block. + # So there are 5 total working blocks. + TOTAL_NUM_BLOCKS = 6 + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config, num_blocks=TOTAL_NUM_BLOCKS) + + # Prime the KVCache. + NUM_PROMPT_BLOCKS = 2 + BLOCK_SIZE = vllm_config.cache_config.block_size + # Prompt will use 2 blocks + 1 block after we schedule. + NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS) + NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5)) + + request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL) + request_remote = create_request(request_id=2, + num_tokens=NUM_TOKENS_REMOTE, + do_remote_prefill=True) + + # STEP 1: 3 blocks are in use (2 for prompt, 1 for decode). + scheduler.add_request(request_normal) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # Step 2: 5 blocks are in use (2 new for remote blocks). + scheduler.add_request(request_remote) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Step 3: finish recving (5 blocks in use) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output( + reqs=[request_normal], finished_recving=[request_remote.request_id]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Step 4: try to schedule, not enough blocks. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Step 5: finish the request, free it. 
+ scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_normal], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Step 6: now we can schedule (with 2 blocks computed). + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote]) + assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens == + NUM_PROMPT_BLOCKS * BLOCK_SIZE) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # Step 7: free everything. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + _ = scheduler.schedule() + assert_scheduler_empty(scheduler) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 1f54560a10a7..efc0de350fba 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -310,15 +310,16 @@ def schedule(self) -> SchedulerOutput: break request = self.waiting[0] - num_prealloc_computed_tokens = 0 - # P/D: skip request if still waiting for remote kvs. + + # KVTransfer: skip request if still waiting for remote kvs. if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: is_ready = self._update_waiting_for_remote_kv(request) if is_ready: request.status = RequestStatus.WAITING - num_prealloc_computed_tokens = ( - request.num_computed_tokens) else: + logger.debug( + "%s is still in WAITING_FOR_REMOTE_KVS state.", + request.request_id) self.waiting.popleft() skipped_waiting_requests.appendleft(request) continue @@ -349,8 +350,9 @@ def schedule(self) -> SchedulerOutput: load_kv_async = False # Get already-cached tokens. - if num_prealloc_computed_tokens == 0: - new_computed_blocks, num_native_computed_tokens = \ + if request.num_computed_tokens == 0: + # Get locally-cached tokens. + new_computed_blocks, num_new_local_computed_tokens = \ self.kv_cache_manager.get_computed_blocks( request) @@ -358,23 +360,22 @@ def schedule(self) -> SchedulerOutput: if self.connector is not None: num_external_computed_tokens, load_kv_async = ( self.connector.get_num_new_matched_tokens( - request, num_native_computed_tokens)) + request, num_new_local_computed_tokens)) # Total computed tokens (local + external). - num_computed_tokens = (num_native_computed_tokens + + num_computed_tokens = (num_new_local_computed_tokens + num_external_computed_tokens) + # KVTransfer: WAITING reqs have num_computed_tokens > 0 + # after async KV recvs are completed. else: - # P/D: skip checking prefix cache if loaded from remote kvs. new_computed_blocks = KVCacheBlocks.create_empty() - num_native_computed_tokens = 0 - - # Total computed tokens (allocated in prior step). - num_computed_tokens = num_prealloc_computed_tokens + num_new_local_computed_tokens = 0 + num_computed_tokens = request.num_computed_tokens encoder_inputs_to_schedule = None new_encoder_budget = encoder_budget - # P/D: loading remote KV, do not allocate for new work. + # KVTransfer: loading remote KV, do not allocate for new work. 
if load_kv_async: assert num_external_computed_tokens > 0 num_new_tokens = 0 @@ -405,7 +406,7 @@ def schedule(self) -> SchedulerOutput: new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, - num_native_computed_tokens, + num_new_local_computed_tokens, new_computed_blocks, num_lookahead_tokens=self.num_lookahead_tokens, delay_cache_blocks=load_kv_async, From 45ab403a1f29f661262ebe651dde62cb8ed6c98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathieu=20Border=C3=A9?= <mathieu@bordere.org> Date: Sat, 24 May 2025 02:46:34 +0200 Subject: [PATCH 118/192] config.py: Clarify that only local GGUF checkpoints are supported. (#18623) Signed-off-by: Mathieu Bordere <mathieu@letmetweakit.com> --- vllm/transformers_utils/config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 5f45ff133855..2a2a8c181874 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -300,7 +300,10 @@ def get_config( " - For Hugging Face models: ensure the presence of a " "'config.json'.\n" " - For Mistral models: ensure the presence of a " - "'params.json'.\n").format(model=model) + "'params.json'.\n" + "3. For GGUF: pass the local path of the GGUF checkpoint.\n" + " Loading GGUF from a remote repo directly is not yet " + "supported.\n").format(model=model) raise ValueError(error_message) from e From ec82c3e388b962a30a02fa376c222cef787b3c14 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng <wenhua.cheng@intel.com> Date: Sat, 24 May 2025 13:01:40 +0800 Subject: [PATCH 119/192] FIX MOE issue in AutoRound format (#18586) Signed-off-by: wenhuach21 <wenhua.cheng@intel.com> --- README.md | 2 +- .../layers/quantization/auto_round.py | 56 ++++++++++--------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 5b87ae838885..c119ad42ac4b 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ vLLM is fast with: - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8. - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. 
- Speculative decoding - Chunked prefill diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index a5e63843cf62..2d9f5e52bd65 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead @@ -74,7 +75,7 @@ def __repr__(self) -> str: f"group_size={self.group_size}, sym={self.sym})") @classmethod - def get_name(cls): ## use str will trigger preci issue + def get_name(cls) -> QuantizationMethods: return "auto-round" @classmethod @@ -142,18 +143,18 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): prefix, layer.__class__.__name__, weight_bits, group_size, sym) if backend == "auto" or "marlin" in backend: + AWQ_TYPE_MAP = { + 4: scalar_types.uint4, + 8: scalar_types.uint8, + } + use_marlin = (weight_bits + in AWQ_TYPE_MAP) and check_marlin_supported( + AWQ_TYPE_MAP[weight_bits], group_size, not sym) + if isinstance(layer, FusedMoE): - use_marlin = check_moe_marlin_supports_layer(layer, group_size) - else: + use_marlin = use_marlin and check_moe_marlin_supports_layer( + layer, group_size) - AWQ_TYPE_MAP = { - 4: scalar_types.uint4, - 8: scalar_types.uint8, - } - use_marlin = ((weight_bits, sym) in AWQ_TYPE_MAP - and check_marlin_supported( - AWQ_TYPE_MAP[(weight_bits)], group_size, - not sym)) else: use_marlin = False if use_marlin: @@ -180,10 +181,11 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) config = { - "linear_quant_method": "awq", - "weight_bits": weight_bits, + "quant_method": "awq", + "bits": weight_bits, "group_size": group_size, "zero_point": not sym, + "lm_head": False, } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) @@ -213,18 +215,18 @@ def apply_gptq_quant_layer(self, prefix, layer.__class__.__name__, weight_bits, group_size, sym) if backend == "auto" or "marlin" in backend: + GPTQ_TYPE_MAP = { + (4, True): scalar_types.uint4b8, + (8, True): scalar_types.uint8b128, + } + use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP + and check_marlin_supported( + GPTQ_TYPE_MAP[(weight_bits, sym)], + group_size, + has_zp=not sym)) if isinstance(layer, FusedMoE): - use_marlin = check_moe_marlin_supports_layer(layer, group_size) - else: - GPTQ_TYPE_MAP = { - (4, True): scalar_types.uint4b8, - (8, True): scalar_types.uint8b128, - } - use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP - and check_marlin_supported( - GPTQ_TYPE_MAP[(weight_bits, sym)], - group_size, - has_zp=not sym)) + use_marlin = use_marlin and check_moe_marlin_supports_layer( + layer, group_size) else: use_marlin = False if use_marlin: @@ -251,11 +253,11 @@ def apply_gptq_quant_layer(self, from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) config = { - "linear_quant_method": "gptq", - "weight_bits": weight_bits, + "quant_method": "gptq", + "bits": weight_bits, "group_size": group_size, "sym": sym, - "lm_head_quantized": False, + "lm_head": False, } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) From 
d55e446d1320d0f5f22bc3584f81f18d7924f166 Mon Sep 17 00:00:00 2001 From: qizixi <22851944+zixi-qi@users.noreply.github.com> Date: Fri, 23 May 2025 23:51:22 -0700 Subject: [PATCH 120/192] [V1][Spec Decode] Small refactors to improve eagle bookkeeping performance (#18424) Signed-off-by: qizixi <qizixi@meta.com> --- tests/v1/spec_decode/test_eagle.py | 6 +++++- vllm/v1/spec_decode/eagle.py | 10 +++------- vllm/v1/worker/gpu_model_runner.py | 24 +++++++++++++----------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index e000d955cfc0..7be1c5b89938 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -100,8 +100,12 @@ def test_prepare_inputs(): dtype=torch.int32, device=device) + # n1 + n2 + n3 - a - b -c + num_tokens = cu_target_query_lens[-1].item() - num_rejected_tokens.sum( + ).item() + cu_num_tokens, token_indices = EagleProposer.prepare_inputs( - cu_target_query_lens, num_rejected_tokens) + cu_target_query_lens, num_rejected_tokens, num_tokens) assert torch.equal(cu_num_tokens, expected_cu_num_tokens) assert token_indices.shape[0] == expected_cu_num_tokens[-1].item() diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 3926a86ee591..876e1ddd14a6 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -271,6 +271,7 @@ def prepare_inputs( cu_target_query_lens: torch.Tensor, # [batch_size] num_rejected_tokens: torch.Tensor, + num_tokens: int, ) -> tuple[torch.Tensor, torch.Tensor]: # cu_target_query_lens: [0, a, a + b, a + b + c] # num_rejected_tokens: [n1, n2, n3] @@ -288,18 +289,13 @@ def prepare_inputs( # [a - n1, b - n2, c - n3] -> # [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3] - cu_num_tokens = torch.empty_like(cu_target_query_lens) + cu_num_tokens = torch.zeros_like(cu_target_query_lens) torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:]) - cu_num_tokens[0] = 0 - - # FIXME(woosuk): Avoid synchronization. - num_tokens = cu_num_tokens[-1].item() token_indices = torch.empty( num_tokens, dtype=torch.int32, - device=cu_num_tokens.device, + device=cu_target_query_lens.device, ) - batch_size = num_rejected_tokens.shape[0] BLOCK_SIZE = 1024 prepare_eagle_input_kernel[(batch_size, )]( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 42847e2f8c36..5120495dbb9b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -34,8 +34,8 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, cdiv, check_use_alibi, - is_pin_memory_available) + GiB_bytes, LazyLoader, async_tensor_h2d, cdiv, + check_use_alibi, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -281,7 +281,7 @@ def __init__( def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: """ Update the order of requests in the batch based on the attention - backend's needs. For example, some attention backends (namely MLA) may + backend's needs. For example, some attention backends (namely MLA) may want to separate requests based on if the attention computation will be compute-bound or memory-bound. 
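The hunks above and below share one idea: keep per-request token counts as plain Python integers so the drafter never has to call `.item()` on a device tensor, and stage small tensors in pinned host memory so the host-to-device copy can overlap with GPU work. The following is a minimal standalone sketch of that general PyTorch pattern, illustrative only and with made-up sizes, not the vLLM code itself:

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
pin = torch.cuda.is_available()  # pinned memory only makes sense with a GPU

num_scheduled_tokens = 96        # known on the host (example value)
num_rejected_tokens = [1, 0, 2]  # known on the host, one entry per request

# Host-side arithmetic: no device tensor involved, hence no blocking sync.
num_tokens = num_scheduled_tokens - sum(num_rejected_tokens)

# Pinned host tensor plus non_blocking=True lets the H2D transfer proceed
# asynchronously instead of serializing the step on the copy.
rejected = torch.tensor(num_rejected_tokens, dtype=torch.int32, pin_memory=pin)
rejected_gpu = rejected.to(device, non_blocking=True)

print(num_tokens, rejected_gpu)
```

Only pinned (page-locked) host memory can actually be copied asynchronously, which is why the pattern pairs `pin_memory` with `non_blocking=True` rather than using either alone.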
@@ -1360,9 +1360,10 @@ def execute_model( scheduler_output.num_scheduled_tokens[req_id]) next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - next_token_ids = torch.tensor(next_token_ids, - dtype=torch.int32, - device=self.device) + next_token_ids = async_tensor_h2d(next_token_ids, + dtype=torch.int32, + target_device=self.device, + pin_memory=True) eagle_attn_metadata = attn_metadata[self.drafter.attn_layer_name] # NOTE: deepseek_mtp uses MLA which does not have `block_table` @@ -1390,14 +1391,16 @@ def execute_model( n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0 for i, n in enumerate(num_draft_tokens) ] - num_rejected_tokens = torch.tensor( + num_rejected_tokens_tensor = async_tensor_h2d( num_rejected_tokens, dtype=torch.int32, - device=self.device, - ) + target_device=self.device, + pin_memory=True) + num_tokens = num_scheduled_tokens - sum(num_rejected_tokens) cu_num_tokens, token_indices = self.drafter.prepare_inputs( eagle_attn_metadata.query_start_loc, - num_rejected_tokens, + num_rejected_tokens_tensor, + num_tokens, ) target_token_ids = self.input_ids[token_indices] target_positions = positions[token_indices] @@ -1408,7 +1411,6 @@ def execute_model( target_hidden_states = hidden_states[token_indices] target_slot_mapping = eagle_attn_metadata.slot_mapping[ token_indices] - draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, target_positions=target_positions, From 441dc63ac7414cc9a7c75af678bef0fb673760ea Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 24 May 2025 15:53:22 +0800 Subject: [PATCH 121/192] [Frontend] improve vllm serve --help display (#18643) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- vllm/entrypoints/cli/main.py | 7 +++-- vllm/entrypoints/cli/serve.py | 7 ++++- vllm/entrypoints/utils.py | 59 +++++++++++++++++++++++++++++++++++ vllm/utils.py | 6 ++-- 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index b7c1afce7118..6676c294c81c 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -9,7 +9,7 @@ import vllm.entrypoints.cli.openai import vllm.entrypoints.cli.serve import vllm.version -from vllm.entrypoints.utils import cli_env_setup +from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup from vllm.utils import FlexibleArgumentParser CMD_MODULES = [ @@ -32,7 +32,10 @@ def signal_handler(sig, frame): def main(): cli_env_setup() - parser = FlexibleArgumentParser(description="vLLM CLI") + parser = FlexibleArgumentParser( + description="vLLM CLI", + epilog=VLLM_SERVE_PARSER_EPILOG, + ) parser.add_argument('-v', '--version', action='version', diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 04be7c033998..957fec290bf2 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -11,6 +11,8 @@ from vllm.entrypoints.openai.api_server import run_server from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) +from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG, + show_filtered_argument_or_group_from_help) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, get_tcp_uri @@ -77,7 +79,10 @@ def subparser_init( "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) - return 
make_arg_parser(serve_parser) + serve_parser = make_arg_parser(serve_parser) + show_filtered_argument_or_group_from_help(serve_parser) + serve_parser.epilog = VLLM_SERVE_PARSER_EPILOG + return serve_parser def cmd_init() -> list[CLISubcommand]: diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 2fe6e1a9e9c4..cc651a172b40 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -13,6 +13,13 @@ logger = init_logger(__name__) +VLLM_SERVE_PARSER_EPILOG = ( + "Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.\n" + " - To view a argument group: --help=ModelConfig\n" + " - To view a single argument: --help=max-num-seqs\n" + " - To search by keyword: --help=max\n" + " - To list all groups: --help=listgroup") + async def listen_for_disconnect(request: Request) -> None: """Returns if a disconnect message is received""" @@ -158,3 +165,55 @@ def _validate_truncation_size( tokenization_kwargs["max_length"] = truncate_prompt_tokens return truncate_prompt_tokens + + +def show_filtered_argument_or_group_from_help(parser): + import sys + for arg in sys.argv: + if arg.startswith('--help='): + search_keyword = arg.split('=', 1)[1] + + # List available groups + if search_keyword == 'listgroup': + print("\nAvailable argument groups:") + for group in parser._action_groups: + if group.title and not group.title.startswith( + "positional arguments"): + print(f" - {group.title}") + if group.description: + print(" " + group.description.strip()) + print() + sys.exit(0) + + # For group search + formatter = parser._get_formatter() + for group in parser._action_groups: + if group.title and group.title.lower() == search_keyword.lower( + ): + formatter.start_section(group.title) + formatter.add_text(group.description) + formatter.add_arguments(group._group_actions) + formatter.end_section() + print(formatter.format_help()) + sys.exit(0) + + # For single arg + matched_actions = [] + + for group in parser._action_groups: + for action in group._group_actions: + # search option name + if any(search_keyword.lower() in opt.lower() + for opt in action.option_strings): + matched_actions.append(action) + + if matched_actions: + print(f"\nParameters matching '{search_keyword}':\n") + formatter = parser._get_formatter() + formatter.add_arguments(matched_actions) + print(formatter.format_help()) + sys.exit(0) + + print(f"\nNo group or parameter matching '{search_keyword}'") + print("Tip: use `--help=listgroup` to view all groups.") + sys.exit(1) diff --git a/vllm/utils.py b/vllm/utils.py index 50296aada4cc..5be5304ee0ab 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -33,7 +33,8 @@ import warnings import weakref from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser, - ArgumentTypeError, _ArgumentGroup) + ArgumentTypeError, RawDescriptionHelpFormatter, + _ArgumentGroup) from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import UserDict, defaultdict from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, @@ -1323,7 +1324,8 @@ def __call__(self, parser, namespace, values, option_string=None): "Expected 'true' or 'false'.") -class SortedHelpFormatter(ArgumentDefaultsHelpFormatter): +class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, + RawDescriptionHelpFormatter): """SortedHelpFormatter that sorts arguments by their option strings.""" def _split_lines(self, text, width): From a859320575651cb310be2064f7054d1c91547f77 Mon Sep 17 00:00:00 2001 From: Yuanhao WU <Nalkey@users.noreply.github.com> Date: Sat, 
24 May 2025 17:15:36 +0800 Subject: [PATCH 122/192] [Model] Add support for Qwen2.5-Omni-7B-AWQ (Qwen2_5OmniForConditionalGeneration) (#18647) --- tests/models/registry.py | 2 ++ vllm/model_executor/models/registry.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/models/registry.py b/tests/models/registry.py index 22d532aa71e0..977be6475714 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -398,6 +398,8 @@ def check_available_online( "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501 "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B", min_transformers_version="4.52"), + "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ", # noqa: E501 + min_transformers_version="4.52"), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 61115afa76d4..3d842848a419 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -208,6 +208,7 @@ "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 + "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"), # [Encoder-decoder] From c1e4a4052d65d72d45e39db1edb6b7deb4ffd426 Mon Sep 17 00:00:00 2001 From: qizixi <22851944+zixi-qi@users.noreply.github.com> Date: Sat, 24 May 2025 02:45:34 -0700 Subject: [PATCH 123/192] [V1][Spec Decode] Support multi-layer eagle draft model (#18030) Signed-off-by: qizixi <qizixi@meta.com> --- tests/v1/spec_decode/test_eagle.py | 3 +++ vllm/v1/spec_decode/eagle.py | 33 ++++++++++++++++++++++++++---- vllm/v1/worker/gpu_model_runner.py | 18 +++++++++++----- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 7be1c5b89938..b49ac45f3129 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -246,6 +246,9 @@ def create_deterministic_logits(token_ids): # Assign the mock to the proposer proposer.model = model_mock + # Assign draft attn_layer_names since load_model is not invoked + proposer.attn_layer_names = ["layer.0"] + # Create input tensors cu_num_tokens = torch.tensor([0, seq_len_1, total_tokens], dtype=torch.int32, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 876e1ddd14a6..971b06758c21 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -12,6 +12,7 @@ from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata, FlashAttentionMetadata) +from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.spec_decode.utils import prepare_eagle_input_kernel @@ -150,6 +151,11 @@ def propose( else: raise ValueError(f"Unsupported method: 
{self.method}") + # At this moment, we assume all eagle layers belong to the same KV + # cache group, thus using the same attention metadata. + per_layer_attn_metadata = {} + for layer_name in self.attn_layer_names: + per_layer_attn_metadata[layer_name] = attn_metadata if self.use_cuda_graph and \ num_tokens <= self.cudagraph_batch_sizes[-1]: num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) @@ -159,7 +165,7 @@ def propose( self.positions[:num_tokens] = target_positions self.hidden_states[:num_tokens] = target_hidden_states - with set_forward_context(attn_metadata, + with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=num_input_tokens): ret_hidden_states = self.model( @@ -245,7 +251,7 @@ def propose( self.hidden_states[:batch_size] = hidden_states # Run the model. - with set_forward_context(attn_metadata, + with set_forward_context(per_layer_attn_metadata, self.vllm_config, num_tokens=input_batch_size): last_hidden_states, hidden_states = self.model( @@ -318,8 +324,8 @@ def load_model(self, target_model: nn.Module) -> None: draft_attn_layer_names = ( get_layers_from_vllm_config(self.vllm_config, Attention).keys() - target_attn_layer_names) - assert len(draft_attn_layer_names) == 1 - self.attn_layer_name = next(iter(draft_attn_layer_names)) + + self.attn_layer_names = list(draft_attn_layer_names) # share embed_tokens with the target model if needed if get_pp_group().world_size == 1: @@ -355,6 +361,25 @@ def dummy_run( self.hidden_states[:num_tokens], ) + def validate_same_kv_cache_group(self, + kv_cache_config: KVCacheConfig) -> None: + """ + Validate that all eagle layers belong to the same KVCacheGroup. + Need this assumption to ensure all eagle layers can use the + same AttentionMetadata. + May extend to multiple AttentionMetadata in the future. + """ + kv_cache_groups: dict[str, int] = {} + for id, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups): + for layer_name in kv_cache_group.layer_names: + kv_cache_groups[layer_name] = id + assert len( + set([ + kv_cache_groups[layer_name] + for layer_name in self.attn_layer_names + ]) + ) == 1, "All eagle layers should belong to the same kv cache group" + # NOTE(woosuk): Currently, the below code is not used and we always use argmax # to sample the draft tokens. We will use this after we find a way to manage diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5120495dbb9b..aa47ac253bb9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1360,11 +1360,13 @@ def execute_model( scheduler_output.num_scheduled_tokens[req_id]) next_token_id = req_state.get_token_id(seq_len) next_token_ids.append(next_token_id) - next_token_ids = async_tensor_h2d(next_token_ids, - dtype=torch.int32, - target_device=self.device, - pin_memory=True) - eagle_attn_metadata = attn_metadata[self.drafter.attn_layer_name] + next_token_ids = torch.tensor(next_token_ids, + dtype=torch.int32, + device=self.device) + # At this moment, we assume all eagle layers belong to the same KV + # cache group, thus using the same attention metadata. + eagle_attn_metadata = attn_metadata[ + self.drafter.attn_layer_names[0]] # NOTE: deepseek_mtp uses MLA which does not have `block_table` if hasattr(eagle_attn_metadata, "block_table"): @@ -2018,6 +2020,12 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: # KV cache specs. 
raise ValueError("Unknown KV cache spec type.") + if self.speculative_config and self.speculative_config.use_eagle(): + assert isinstance(self.drafter, EagleProposer) + # validate all draft model layers belong to the same kv cache + # group + self.drafter.validate_same_kv_cache_group(kv_cache_config) + bind_kv_cache( kv_caches, self.vllm_config.compilation_config.static_forward_context, From 07458a51ce8f31a2be0cc9da69d3e3ef6fb0f16d Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 17:57:15 +0800 Subject: [PATCH 124/192] [Doc] Update README links, mark external links (#18635) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- README.md | 6 +++--- docs/mkdocs/stylesheets/extra.css | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c119ad42ac4b..d2b795b962a8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ <p align="center"> <picture> - <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png"> - <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%> + <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png"> + <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%> </picture> </p> @@ -107,7 +107,7 @@ Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contribut vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! <!-- Note: Please sort them in alphabetical order. 
--> -<!-- Note: Please keep these consistent with docs/source/community/sponsors.md --> +<!-- Note: Please keep these consistent with docs/community/sponsors.md --> Cash Donations: - a16z - Dropbox diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css index 1c57779c52c9..165f53efc085 100644 --- a/docs/mkdocs/stylesheets/extra.css +++ b/docs/mkdocs/stylesheets/extra.css @@ -1,4 +1,24 @@ +/* Warning for latest docs */ .md-banner { background-color: var(--md-warning-bg-color); color: var(--md-warning-fg-color); } + +/* https://christianoliff.com/blog/styling-external-links-with-an-icon-in-css/ */ +a:not(:has(svg)):not(.md-icon) { + align-items: center; + + &[href^="//"]::after, + &[href^="http://"]::after, + &[href^="https://"]::after { + content: ""; + width: 12px; + height: 12px; + margin-left: 4px; + background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' stroke='gray' viewBox='0 0 16 16'%3E%3Cpath fill-rule='evenodd' d='M8.636 3.5a.5.5 0 0 0-.5-.5H1.5A1.5 1.5 0 0 0 0 4.5v10A1.5 1.5 0 0 0 1.5 16h10a1.5 1.5 0 0 0 1.5-1.5V7.864a.5.5 0 0 0-1 0V14.5a.5.5 0 0 1-.5.5h-10a.5.5 0 0 1-.5-.5v-10a.5.5 0 0 1 .5-.5h6.636a.5.5 0 0 0 .5-.5z'/%3E%3Cpath fill-rule='evenodd' d='M16 .5a.5.5 0 0 0-.5-.5h-5a.5.5 0 0 0 0 1h3.793L6.146 9.146a.5.5 0 1 0 .708.708L15 1.707V5.5a.5.5 0 0 0 1 0v-5z'/%3E%3C/svg%3E"); + background-position: center; + background-repeat: no-repeat; + background-size: contain; + display: inline-block; + } +} From e77dc4bad849f8038d5c3d1ab06c0978a66b929a Mon Sep 17 00:00:00 2001 From: Mengqing Cao <cmq0113@163.com> Date: Sat, 24 May 2025 20:09:15 +0800 Subject: [PATCH 125/192] [MISC][pre-commit] Add pre-commit check for triton import (#17716) Signed-off-by: Mengqing Cao <cmq0113@163.com> --- .pre-commit-config.yaml | 7 ++++ tools/check_triton_import.py | 75 ++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 tools/check_triton_import.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 658de23cf4da..ee186be21094 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -128,6 +128,13 @@ repos: name: Update Dockerfile dependency graph entry: tools/update-dockerfile-graph.sh language: script + # forbid directly import triton + - id: forbid-direct-triton-import + name: "Forbid direct 'import triton'" + entry: python tools/check_triton_import.py + language: python + types: [python] + pass_filenames: false # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py new file mode 100644 index 000000000000..d938ff1df594 --- /dev/null +++ b/tools/check_triton_import.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +import re +import subprocess +import sys + +FORBIDDEN_IMPORT_RE = re.compile(r"^(from|import)\s+triton(\s|\.|$)") + +# the way allowed to import triton +ALLOWED_LINES = { + "from vllm.triton_utils import triton", + "from vllm.triton_utils import tl", + "from vllm.triton_utils import tl, triton", +} + + +def is_forbidden_import(line: str) -> bool: + stripped = line.strip() + return bool( + FORBIDDEN_IMPORT_RE.match(stripped)) and stripped not in ALLOWED_LINES + + +def parse_diff(diff: str) -> list[str]: + violations = [] + current_file = None + current_lineno = None + + for line in diff.splitlines(): + if line.startswith("+++ b/"): + current_file = line[6:] + elif line.startswith("@@"): + match = re.search(r"\+(\d+)", line) + if match: + current_lineno = int( + 
match.group(1)) - 1 # next "+ line" is here + elif line.startswith("+") and not line.startswith("++"): + current_lineno += 1 + code_line = line[1:] + if is_forbidden_import(code_line): + violations.append( + f"{current_file}:{current_lineno}: {code_line.strip()}") + return violations + + +def get_diff(diff_type: str) -> str: + if diff_type == "staged": + return subprocess.check_output( + ["git", "diff", "--cached", "--unified=0"], text=True) + elif diff_type == "unstaged": + return subprocess.check_output(["git", "diff", "--unified=0"], + text=True) + else: + raise ValueError(f"Unknown diff_type: {diff_type}") + + +def main(): + all_violations = [] + for diff_type in ["staged", "unstaged"]: + try: + diff_output = get_diff(diff_type) + violations = parse_diff(diff_output) + all_violations.extend(violations) + except subprocess.CalledProcessError as e: + print(f"[{diff_type}] Git diff failed: {e}", file=sys.stderr) + + if all_violations: + print("❌ Forbidden direct `import triton` detected." + " ➤ Use `from vllm.triton_utils import triton` instead.\n") + for v in all_violations: + print(f"❌ {v}") + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From ef1dd6870f848c5814528a81b71bc87ba317e63f Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 21:06:35 +0800 Subject: [PATCH 126/192] [Doc] Fix indentation problems in V0 Paged Attention docs (#18659) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/deployment/k8s.md | 1 + docs/design/kernel/paged_attention.md | 744 +++++++++++++------------- 2 files changed, 372 insertions(+), 373 deletions(-) diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index bd2bd44cd522..6b08c4960d02 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -9,6 +9,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le * [Deployment with GPUs](#deployment-with-gpus) Alternatively, you can deploy vLLM to Kubernetes using any of the following: + * [Helm](frameworks/helm.md) * [InftyAI/llmaz](integrations/llmaz.md) * [KServe](integrations/kserve.md) diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md index fdfa38a29f83..6ebe1ee48acf 100644 --- a/docs/design/kernel/paged_attention.md +++ b/docs/design/kernel/paged_attention.md @@ -3,78 +3,76 @@ title: vLLM Paged Attention --- [](){ #design-paged-attention } -- Currently, vLLM utilizes its own implementation of a multi-head query - attention kernel (`csrc/attention/attention_kernels.cu`). - This kernel is designed to be compatible with - vLLM's paged KV caches, where the key and value cache are stored in - separate blocks (note that this block concept differs from the GPU - thread block. So in a later document, I will refer to vLLM paged - attention block as "block", while refer to GPU thread block as - "thread block"). -- To achieve high performance, this kernel relies on a specially - designed memory layout and access method, specifically when threads - read data from global memory to shared memory. The purpose of this - document is to provide a high-level explanation of the kernel - implementation step by step, aiding those who wish to learn about the - vLLM multi-head query attention kernel. After going through this - document, users will likely have a better understanding and feel easier - to follow the actual implementation. 
-- Please note that this document may not cover all details, such as how - to calculate the correct index for the corresponding data or the dot - multiplication implementation. However, after reading this document - and becoming familiar with the high-level logic flow, it should be - easier for you to read the actual code and understand the details. +Currently, vLLM utilizes its own implementation of a multi-head query +attention kernel (`csrc/attention/attention_kernels.cu`). +This kernel is designed to be compatible with +vLLM's paged KV caches, where the key and value cache are stored in +separate blocks (note that this block concept differs from the GPU +thread block. So in a later document, I will refer to vLLM paged +attention block as "block", while refer to GPU thread block as +"thread block"). + +To achieve high performance, this kernel relies on a specially +designed memory layout and access method, specifically when threads +read data from global memory to shared memory. The purpose of this +document is to provide a high-level explanation of the kernel +implementation step by step, aiding those who wish to learn about the +vLLM multi-head query attention kernel. After going through this +document, users will likely have a better understanding and feel easier +to follow the actual implementation. + +Please note that this document may not cover all details, such as how +to calculate the correct index for the corresponding data or the dot +multiplication implementation. However, after reading this document +and becoming familiar with the high-level logic flow, it should be +easier for you to read the actual code and understand the details. ## Inputs -- The kernel function takes a list of arguments for the current thread - to perform its assigned work. The three most important arguments are - the input pointers `q`, `k_cache`, and `v_cache`, which point - to query, key, and value data on global memory that need to be read - and processed. The output pointer `out` points to global memory - where the result should be written. These four pointers actually - refer to multi-dimensional arrays, but each thread only accesses the - portion of data assigned to it. I have omitted all other runtime - parameters here for simplicity. - - ```cpp - template< - typename scalar_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - int PARTITION_SIZE = 0> - __device__ void paged_attention_kernel( - ... // Other side args. - const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - ... // Other side args. - ) - ``` - -- There are also a list of template arguments above the function - signature that are determined during compilation time. `scalar_t` - represents the data type of the query, key, and value data elements, - such as FP16. `HEAD_SIZE` indicates the number of elements in each - head. `BLOCK_SIZE` refers to the number of tokens in each block. - `NUM_THREADS` denotes the number of threads in each thread block. - `PARTITION_SIZE` represents the number of tensor parallel GPUs (For - simplicity, we assume this is 0 and tensor parallel is disabled). - -- With these arguments, we need to perform a sequence of preparations. 
- This includes calculating the current head index, block index, and - other necessary variables. However, for now, we can ignore these - preparations and proceed directly to the actual calculations. It will - be easier to understand them once we grasp the entire flow. +The kernel function takes a list of arguments for the current thread +to perform its assigned work. The three most important arguments are +the input pointers `q`, `k_cache`, and `v_cache`, which point +to query, key, and value data on global memory that need to be read +and processed. The output pointer `out` points to global memory +where the result should be written. These four pointers actually +refer to multi-dimensional arrays, but each thread only accesses the +portion of data assigned to it. I have omitted all other runtime +parameters here for simplicity. + +```cpp +template<typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, int PARTITION_SIZE = 0> +__device__ void paged_attention_kernel( + ... // Other side args. + const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + ... // Other side args. +) +``` + +There are also a list of template arguments above the function +signature that are determined during compilation time. `scalar_t` +represents the data type of the query, key, and value data elements, +such as FP16. `HEAD_SIZE` indicates the number of elements in each +head. `BLOCK_SIZE` refers to the number of tokens in each block. +`NUM_THREADS` denotes the number of threads in each thread block. +`PARTITION_SIZE` represents the number of tensor parallel GPUs (For +simplicity, we assume this is 0 and tensor parallel is disabled). + +With these arguments, we need to perform a sequence of preparations. +This includes calculating the current head index, block index, and +other necessary variables. However, for now, we can ignore these +preparations and proceed directly to the actual calculations. It will +be easier to understand them once we grasp the entire flow. ## Concepts -- Just before we dive into the calculation flow, I want to describe a - few concepts that are needed for later sections. However, you may - skip this section and return later if you encounter any confusing - terminologies. +Just before we dive into the calculation flow, I want to describe a +few concepts that are needed for later sections. However, you may +skip this section and return later if you encounter any confusing +terminologies. + - **Sequence**: A sequence represents a client request. For example, the data pointed to by `q` has a shape of `[num_seqs, num_heads, head_size]`. That represents there are total @@ -129,236 +127,236 @@ title: vLLM Paged Attention ## Query -- This section will introduce how query data is stored in memory and - fetched by each thread. As mentioned above, each thread group fetches - one query token data, while each thread itself only handles a part of - one query token data. Within each warp, every thread group will fetch - the same query token data, but will multiply it with different key - token data. +This section will introduce how query data is stored in memory and +fetched by each thread. 
As mentioned above, each thread group fetches +one query token data, while each thread itself only handles a part of +one query token data. Within each warp, every thread group will fetch +the same query token data, but will multiply it with different key +token data. - ```cpp - const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; - ``` +```cpp +const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; +``` <figure markdown="span"> ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" } </figure> -- Each thread defines its own `q_ptr` which points to the assigned - query token data on global memory. For example, if `VEC_SIZE` is 4 - and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains - total of 128 elements divided into 128 / 4 = 32 vecs. +Each thread defines its own `q_ptr` which points to the assigned +query token data on global memory. For example, if `VEC_SIZE` is 4 +and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains +total of 128 elements divided into 128 / 4 = 32 vecs. <figure markdown="span"> ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" } </figure> - ```cpp - __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; - ``` +```cpp +__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; +``` -- Next, we need to read the global memory data pointed to by `q_ptr` - into shared memory as `q_vecs`. It is important to note that each - vecs is assigned to a different row. For example, if the - `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs, - while thread 1 handles the 1st row vecs. By reading the query data in - this way, neighboring threads like thread 0 and thread 1 can read - neighbor memory, achieving the memory coalescing to improve - performance. +Next, we need to read the global memory data pointed to by `q_ptr` +into shared memory as `q_vecs`. It is important to note that each +vecs is assigned to a different row. For example, if the +`THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs, +while thread 1 handles the 1st row vecs. By reading the query data in +this way, neighboring threads like thread 0 and thread 1 can read +neighbor memory, achieving the memory coalescing to improve +performance. ## Key -- Similar to the "Query" section, this section introduces memory layout - and assignment for keys. While each thread group only handle one - query token one kernel run, it may handle multiple key tokens across - multiple iterations. Meanwhile, each warp will process multiple blocks - of key tokens in multiple iterations, ensuring that all context - tokens are processed by the entire thread group after the kernel run. - In this context, "handle" refers to performing the dot multiplication - between query data and key data. - - ```cpp - const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; - ``` - -- Unlike to `q_ptr`, `k_ptr` in each thread will point to different - key token at different iterations. As shown above, that `k_ptr` - points to key token data based on `k_cache` at assigned block, - assigned head and assigned token. +Similar to the "Query" section, this section introduces memory layout +and assignment for keys. While each thread group only handle one +query token one kernel run, it may handle multiple key tokens across +multiple iterations. 
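To get a feel for how much key data that spreads across, here is a rough count of blocks and per-warp iterations. The context length of 1024 tokens, `BLOCK_SIZE` of 16, and 4 warps are assumptions picked to line up with the diagrams below rather than values read from the kernel, and the striding scheme is a simplification of the real loop.

```cpp
#include <cstdio>

int main() {
  // Assumed example values, chosen to match the diagrams in this document.
  const int context_len = 1024;  // tokens already in the KV cache
  const int BLOCK_SIZE = 16;     // tokens per paged-attention block
  const int NUM_WARPS = 4;       // warps per thread block

  // Each outer iteration lets one warp consume one block of key tokens,
  // with the warps striding over the blocks together.
  const int num_blocks = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;    // 64
  const int iters_per_warp = (num_blocks + NUM_WARPS - 1) / NUM_WARPS;   // 16

  std::printf("%d key blocks, about %d outer iterations per warp\n",
              num_blocks, iters_per_warp);
  return 0;
}
```

With these numbers, warp 0 ends up walking through 16 key blocks, which is the situation drawn in the key diagram below.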
Meanwhile, each warp will process multiple blocks +of key tokens in multiple iterations, ensuring that all context +tokens are processed by the entire thread group after the kernel run. +In this context, "handle" refers to performing the dot multiplication +between query data and key data. + +```cpp +const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + + physical_block_offset * x; +``` + +Unlike to `q_ptr`, `k_ptr` in each thread will point to different +key token at different iterations. As shown above, that `k_ptr` +points to key token data based on `k_cache` at assigned block, +assigned head and assigned token. <figure markdown="span"> ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" } </figure> -- The diagram above illustrates the memory layout for key data. It - assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is - 8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each - rectangle represents all the elements for one key token at one head, - which will be processed by one thread group. The left half shows the - total 16 blocks of key token data for warp 0, while the right half - represents the remaining key token data for other warps or - iterations. Inside each rectangle, there are a total 32 vecs (128 - elements for one token) that will be processed by 2 threads (one - thread group) separately. +The diagram above illustrates the memory layout for key data. It +assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is +8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each +rectangle represents all the elements for one key token at one head, +which will be processed by one thread group. The left half shows the +total 16 blocks of key token data for warp 0, while the right half +represents the remaining key token data for other warps or +iterations. Inside each rectangle, there are a total 32 vecs (128 +elements for one token) that will be processed by 2 threads (one +thread group) separately. <figure markdown="span"> ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" } </figure> - ```cpp - K_vec k_vecs[NUM_VECS_PER_THREAD] - ``` - -- Next, we need to read the key token data from `k_ptr` and store - them on register memory as `k_vecs`. We use register memory for - `k_vecs` because it will only be accessed by one thread once, - whereas `q_vecs` will be accessed by multiple threads multiple - times. Each `k_vecs` will contain multiple vectors for later - calculation. Each vec will be set at each inner iteration. The - assignment of vecs allows neighboring threads in a warp to read - neighboring memory together, which again promotes the memory - coalescing. For instance, thread 0 will read vec 0, while thread 1 - will read vec 1. In the next inner loop, thread 0 will read vec 2, - while thread 1 will read vec 3, and so on. - -- You may still be a little confused about the overall flow. Don't - worry, please keep reading the next "QK" section. It will illustrate - the query and key calculation flow in a clearer and higher-level - manner. +```cpp +K_vec k_vecs[NUM_VECS_PER_THREAD] +``` + +Next, we need to read the key token data from `k_ptr` and store +them on register memory as `k_vecs`. We use register memory for +`k_vecs` because it will only be accessed by one thread once, +whereas `q_vecs` will be accessed by multiple threads multiple +times. Each `k_vecs` will contain multiple vectors for later +calculation. 
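As a side note on the pointer arithmetic above: for a contiguous key cache with the `[num_blocks, num_kv_heads, head_size/x, block_size, x]` layout from the kernel signature, the strides used by `k_ptr` are ordinary row-major strides. The sketch below spells this out; the variable names mirror the kernel arguments, but the concrete sizes are assumptions for illustration.

```cpp
#include <cassert>
#include <cstdio>

int main() {
  // Assumed example sizes (illustration only).
  const int num_kv_heads = 8;
  const int head_size = 128;
  const int block_size = 16;
  const int x = 8;  // innermost packing factor of the key cache

  // Row-major strides of k_cache[num_blocks][num_kv_heads][head_size/x][block_size][x].
  const long kv_head_stride = (long)(head_size / x) * block_size * x;  // elements per head
  const long kv_block_stride = (long)num_kv_heads * kv_head_stride;    // elements per block

  // Flat offset of the first element for (block, head, token), i.e. i = 0, j = 0.
  const int block = 3, head = 5, token = 11;
  const long flat =
      (((long)block * num_kv_heads + head) * (head_size / x) + 0) * block_size * x
      + (long)token * x + 0;

  // This is exactly what the k_ptr expression above computes.
  const long via_strides =
      block * kv_block_stride + head * kv_head_stride + (long)token * x;

  assert(flat == via_strides);
  std::printf("offset = %ld elements\n", flat);
  return 0;
}
```

From that starting element, the remaining `head_size/x` chunks of the same token's head sit `block_size * x` elements apart in this layout, which is what the inner loop walks over.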
Each vec will be set at each inner iteration. The +assignment of vecs allows neighboring threads in a warp to read +neighboring memory together, which again promotes the memory +coalescing. For instance, thread 0 will read vec 0, while thread 1 +will read vec 1. In the next inner loop, thread 0 will read vec 2, +while thread 1 will read vec 3, and so on. + +You may still be a little confused about the overall flow. Don't +worry, please keep reading the next "QK" section. It will illustrate +the query and key calculation flow in a clearer and higher-level +manner. ## QK -- As shown the pseudo code below, before the entire for loop block, we - fetch the query data for one token and store it in `q_vecs`. Then, - in the outer for loop, we iterate through different `k_ptrs` that - point to different tokens and prepare the `k_vecs` in the inner for - loop. Finally, we perform the dot multiplication between the - `q_vecs` and each `k_vecs`. - - ```cpp - q_vecs = ... - for ... { - k_ptr = ... - for ... { +As shown the pseudo code below, before the entire for loop block, we +fetch the query data for one token and store it in `q_vecs`. Then, +in the outer for loop, we iterate through different `k_ptrs` that +point to different tokens and prepare the `k_vecs` in the inner for +loop. Finally, we perform the dot multiplication between the +`q_vecs` and each `k_vecs`. + +```cpp +q_vecs = ... +for ... { + k_ptr = ... + for ... { k_vecs[i] = ... - } - ... - float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs); - } - ``` - -- As mentioned before, for each thread, it only fetches part of the - query and key token data at a time. However, there will be a cross - thread group reduction happen in the `Qk_dot<>::dot` . So `qk` - returned here is not just between part of the query and key token dot - multiplication, but actually a full result between entire query and - key token data. - -- For example, if the value of `HEAD_SIZE` is 128 and - `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain - total 64 elements. However, the returned `qk` is actually the - result of dot multiplication between 128 query elements and 128 key - elements. If you want to learn more about the details of the dot - multiplication and reduction, you may refer to the implementation of - `Qk_dot<>::dot`. However, for the sake of simplicity, I will not - cover it in this document. + } + ... + float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs); +} +``` + +As mentioned before, for each thread, it only fetches part of the +query and key token data at a time. However, there will be a cross +thread group reduction happen in the `Qk_dot<>::dot` . So `qk` +returned here is not just between part of the query and key token dot +multiplication, but actually a full result between entire query and +key token data. + +For example, if the value of `HEAD_SIZE` is 128 and +`THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain +total 64 elements. However, the returned `qk` is actually the +result of dot multiplication between 128 query elements and 128 key +elements. If you want to learn more about the details of the dot +multiplication and reduction, you may refer to the implementation of +`Qk_dot<>::dot`. However, for the sake of simplicity, I will not +cover it in this document. ## Softmax -- Next, we need to calculate the normalized softmax for all `qk`s, - as shown above, where each $x$ represents a `qk`. 
To do this, - we must obtain the reduced value of `qk_max`($m(x)$) and - the `exp_sum`($\ell(x)$) of all `qk`s. The reduction - should be performed across the entire thread block, encompassing - results between the query token and all context key tokens. +Next, we need to calculate the normalized softmax for all `qk`s, +as shown above, where each $x$ represents a `qk`. To do this, +we must obtain the reduced value of `qk_max`($m(x)$) and +the `exp_sum`($\ell(x)$) of all `qk`s. The reduction +should be performed across the entire thread block, encompassing +results between the query token and all context key tokens. - $$ - \begin{gather*} - m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ - \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} - \end{gather*} - $$ +$$ +\begin{gather*} +m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ +\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} +\end{gather*} +$$ ### `qk_max` and `logits` -- Just right after we get the `qk` result, we can set the temporary - `logits` result with `qk` (In the end, the `logits` should - store the normalized softmax result). Also we can compare and collect - the `qk_max` for all `qk`s that are calculated by current - thread group. - - ```cpp - if (thread_group_offset == 0) { - const bool mask = token_idx >= context_len; - logits[token_idx - start_token_idx] = mask ? 0.f : qk; - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - } - ``` - -- Please note that the `logits` here is on shared memory, so each - thread group will set the fields for its own assigned context tokens. - Overall, the size of logits should be number of context tokens. - - ```cpp - for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - - if (lane == 0) { - red_smem[warp_idx] = qk_max; - } - ``` - -- Then we need to get the reduced `qk_max` across each warp. The main - idea is to make threads in warp to communicate with each other and - get the final max `qk` . - - ```cpp - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - qk_max = VLLM_SHFL_SYNC(qk_max, 0); - ``` - -- Finally, we can get the reduced `qk_max` from whole thread block by - compare the `qk_max` from all warps in this thread block. Then we - need to broadcast the final result to each thread. +Just right after we get the `qk` result, we can set the temporary +`logits` result with `qk` (In the end, the `logits` should +store the normalized softmax result). Also we can compare and collect +the `qk_max` for all `qk`s that are calculated by current +thread group. + +```cpp +if (thread_group_offset == 0) { + const bool mask = token_idx >= context_len; + logits[token_idx - start_token_idx] = mask ? 0.f : qk; + qk_max = mask ? qk_max : fmaxf(qk_max, qk); +} +``` + +Please note that the `logits` here is on shared memory, so each +thread group will set the fields for its own assigned context tokens. +Overall, the size of logits should be number of context tokens. + +```cpp +for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); +} + +if (lane == 0) { + red_smem[warp_idx] = qk_max; +} +``` + +Then we need to get the reduced `qk_max` across each warp. 
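It may help to see why this first shuffle loop stops at `THREAD_GROUP_SIZE` instead of going all the way down to 1. The host-side sketch below emulates the XOR butterfly for one warp, under the assumption (consistent with the snippet above) that only the thread-group leaders accumulated a real `qk_max` and every other lane still holds `-FLT_MAX`; it is a mental model, not the device code.

```cpp
#include <algorithm>
#include <cfloat>
#include <cstdio>
#include <vector>

int main() {
  const int WARP_SIZE = 32;
  const int THREAD_GROUP_SIZE = 2;

  // Only the leader of each thread group (thread_group_offset == 0) collected
  // a real qk_max above; the other lanes are assumed to still hold -FLT_MAX.
  std::vector<float> qk_max(WARP_SIZE, -FLT_MAX);
  for (int lane = 0; lane < WARP_SIZE; lane += THREAD_GROUP_SIZE)
    qk_max[lane] = static_cast<float>((lane * 7) % 31);  // arbitrary example data

  float expected = -FLT_MAX;
  for (int lane = 0; lane < WARP_SIZE; lane += THREAD_GROUP_SIZE)
    expected = std::max(expected, qk_max[lane]);

  // Emulates: qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
  for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
    std::vector<float> received(WARP_SIZE);
    for (int lane = 0; lane < WARP_SIZE; ++lane)
      received[lane] = qk_max[lane ^ mask];  // value sent by the partner lane
    for (int lane = 0; lane < WARP_SIZE; ++lane)
      qk_max[lane] = std::max(qk_max[lane], received[lane]);
  }

  // Lane 0 now holds the warp-wide max and is the lane that stores to red_smem.
  std::printf("lane 0 sees %.1f, expected %.1f\n", qk_max[0], expected);
  return 0;
}
```

In this model, lane 0 ends up with the maximum over all the leader lanes even though the loop never uses masks smaller than `THREAD_GROUP_SIZE`, because the leaders already differ only in the higher lane bits.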
The main +idea is to make threads in warp to communicate with each other and +get the final max `qk` . + +```cpp +for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); +} +qk_max = VLLM_SHFL_SYNC(qk_max, 0); +``` + +Finally, we can get the reduced `qk_max` from whole thread block by +compare the `qk_max` from all warps in this thread block. Then we +need to broadcast the final result to each thread. ### `exp_sum` -- Similar to `qk_max`, we need to get the reduced sum value from the - entire thread block too. - - ```cpp - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - float val = __expf(logits[i] - qk_max); - logits[i] = val; - exp_sum += val; - } - ... - exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum); - ``` - -- Firstly, sum all exp values from each thread group, and meanwhile, - convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. - Please note, the `qk_max` here is already the max `qk` across the - whole thread block. And then we can do reduction for `exp_sum` - across whole thread block just like the `qk_max`. - - ```cpp - const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - logits[i] *= inv_sum; - } - ``` - -- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain - the final normalized softmax result as `logits`. This `logits` - variable will be used for dot multiplication with the value data in - later steps. Now, it should store the normalized softmax result of - `qk` for all assigned context tokens. +Similar to `qk_max`, we need to get the reduced sum value from the +entire thread block too. + +```cpp +for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + float val = __expf(logits[i] - qk_max); + logits[i] = val; + exp_sum += val; +} +... +exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum); +``` + +Firstly, sum all exp values from each thread group, and meanwhile, +convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. +Please note, the `qk_max` here is already the max `qk` across the +whole thread block. And then we can do reduction for `exp_sum` +across whole thread block just like the `qk_max`. + +```cpp +const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); +for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + logits[i] *= inv_sum; +} +``` + +Finally, with the reduced `qk_max` and `exp_sum`, we can obtain +the final normalized softmax result as `logits`. This `logits` +variable will be used for dot multiplication with the value data in +later steps. Now, it should store the normalized softmax result of +`qk` for all assigned context tokens. ## Value @@ -374,127 +372,127 @@ title: vLLM Paged Attention ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" } </figure> -- Now we need to retrieve the value data and perform dot multiplication - with `logits`. Unlike query and key, there is no thread group - concept for value data. As shown in diagram, different from key token - memory layout, elements from the same column correspond to the same - value token. For one block of value data, there are `HEAD_SIZE` of - rows and `BLOCK_SIZE` of columns that are split into multiple - `v_vecs`. - -- Each thread always fetches `V_VEC_SIZE` elements from the same - `V_VEC_SIZE` of tokens at a time. As a result, a single thread - retrieves multiple `v_vec`s from different rows and the same - columns through multiple inner iterations. 
For each `v_vec`, it - needs to be dot multiplied with the corresponding `logits_vec`, - which is also `V_VEC_SIZE` elements from `logits`. Overall, with - multiple inner iterations, each warp will process one block of value - tokens. And with multiple outer iterations, the whole context value - tokens are processed - - ```cpp - float accs[NUM_ROWS_PER_THREAD]; - for ... { // Iteration over different blocks. - logits_vec = ... - for ... { // Iteration over different rows. - v_vec = ... - ... - accs[i] += dot(logits_vec, v_vec); - } - } - ``` - -- As shown in the above pseudo code, in the outer loop, similar to - `k_ptr`, `logits_vec` iterates over different blocks and reads - `V_VEC_SIZE` elements from `logits`. In the inner loop, each - thread reads `V_VEC_SIZE` elements from the same tokens as a - `v_vec` and performs dot multiplication. It is important to note - that in each inner iteration, the thread fetches different head - position elements for the same tokens. The dot result is then - accumulated in `accs`. Therefore, each entry of `accs` is mapped - to a head position assigned to the current thread. - -- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each - thread fetches 8 value elements for 8 tokens at a time. Each element - is from different tokens at the same head position. If `HEAD_SIZE` - is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to - fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are - a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle - a whole block of value tokens. And each `accs` in each thread - contains 8 elements that accumulated at 8 different head positions. - For the thread 0, the `accs` variable will have 8 elements, which - are 0th, 32th … 224th elements of a value head that are accumulated - from all assigned 8 tokens. +Now we need to retrieve the value data and perform dot multiplication +with `logits`. Unlike query and key, there is no thread group +concept for value data. As shown in diagram, different from key token +memory layout, elements from the same column correspond to the same +value token. For one block of value data, there are `HEAD_SIZE` of +rows and `BLOCK_SIZE` of columns that are split into multiple +`v_vecs`. + +Each thread always fetches `V_VEC_SIZE` elements from the same +`V_VEC_SIZE` of tokens at a time. As a result, a single thread +retrieves multiple `v_vec`s from different rows and the same +columns through multiple inner iterations. For each `v_vec`, it +needs to be dot multiplied with the corresponding `logits_vec`, +which is also `V_VEC_SIZE` elements from `logits`. Overall, with +multiple inner iterations, each warp will process one block of value +tokens. And with multiple outer iterations, the whole context value +tokens are processed + +```cpp +float accs[NUM_ROWS_PER_THREAD]; +for ... { // Iteration over different blocks. + logits_vec = ... + for ... { // Iteration over different rows. + v_vec = ... + ... + accs[i] += dot(logits_vec, v_vec); + } +} +``` + +As shown in the above pseudo code, in the outer loop, similar to +`k_ptr`, `logits_vec` iterates over different blocks and reads +`V_VEC_SIZE` elements from `logits`. In the inner loop, each +thread reads `V_VEC_SIZE` elements from the same tokens as a +`v_vec` and performs dot multiplication. It is important to note +that in each inner iteration, the thread fetches different head +position elements for the same tokens. The dot result is then +accumulated in `accs`. 
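Stripped of the vector types, the per-row accumulation is just a weighted sum of one row of the value block, with the weights taken from `logits`. The sketch below folds work that several threads do in parallel into one serial loop, using the example sizes from this document; it is meant only to show what `accs[i] += dot(logits_vec, v_vec)` computes.

```cpp
#include <cstdio>

int main() {
  // Assumed example sizes (illustration only).
  const int V_VEC_SIZE = 8;   // tokens covered by one dot product
  const int BLOCK_SIZE = 16;  // tokens in one value block

  // Toy data: softmax weights for one block of tokens, and one head row of V.
  float logits[BLOCK_SIZE];
  float v_row[BLOCK_SIZE];
  for (int t = 0; t < BLOCK_SIZE; ++t) {
    logits[t] = 1.0f / BLOCK_SIZE;     // pretend uniform attention weights
    v_row[t] = static_cast<float>(t);  // arbitrary value entries
  }

  // What the accumulation boils down to for one head row: the row is
  // consumed V_VEC_SIZE tokens at a time.
  float acc = 0.0f;
  for (int start = 0; start < BLOCK_SIZE; start += V_VEC_SIZE) {
    float partial = 0.0f;
    for (int j = 0; j < V_VEC_SIZE; ++j)
      partial += logits[start + j] * v_row[start + j];  // dot(logits_vec, v_vec)
    acc += partial;
  }

  std::printf("weighted sum for this head row: %.3f\n", acc);  // 7.5 here
  return 0;
}
```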
Therefore, each entry of `accs` is mapped +to a head position assigned to the current thread. + +For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each +thread fetches 8 value elements for 8 tokens at a time. Each element +is from different tokens at the same head position. If `HEAD_SIZE` +is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to +fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are +a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle +a whole block of value tokens. And each `accs` in each thread +contains 8 elements that accumulated at 8 different head positions. +For the thread 0, the `accs` variable will have 8 elements, which +are 0th, 32th … 224th elements of a value head that are accumulated +from all assigned 8 tokens. ## LV -- Now, we need to perform reduction for `accs` within each warp. This - process allows each thread to accumulate the `accs` for the - assigned head positions of all tokens in one block. +Now, we need to perform reduction for `accs` within each warp. This +process allows each thread to accumulate the `accs` for the +assigned head positions of all tokens in one block. - ```cpp - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - float acc = accs[i]; - for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { +```cpp +for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + float acc = accs[i]; + for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { acc += VLLM_SHFL_XOR_SYNC(acc, mask); - } - accs[i] = acc; - } - ``` - -- Next, we perform reduction for `accs` across all warps, allowing - each thread to have the accumulation of `accs` for the assigned - head positions of all context tokens. Please note that each `accs` - in every thread only stores the accumulation for a portion of - elements of the entire head for all context tokens. However, overall, - all results for output have been calculated but are just stored in - different thread register memory. - - ```cpp - float* out_smem = reinterpret_cast<float*>(shared_mem); - for (int i = NUM_WARPS; i > 1; i /= 2) { - // Upper warps write to shared memory. - ... - float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - dst[row_idx] = accs[i]; - } - - // Lower warps update the output. - const float* src = &out_smem[warp_idx * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - accs[i] += src[row_idx]; - } - - // Write out the accs. - } - ``` + } + accs[i] = acc; +} +``` + +Next, we perform reduction for `accs` across all warps, allowing +each thread to have the accumulation of `accs` for the assigned +head positions of all context tokens. Please note that each `accs` +in every thread only stores the accumulation for a portion of +elements of the entire head for all context tokens. However, overall, +all results for output have been calculated but are just stored in +different thread register memory. + +```cpp +float* out_smem = reinterpret_cast<float*>(shared_mem); +for (int i = NUM_WARPS; i > 1; i /= 2) { + // Upper warps write to shared memory. + ... + float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + dst[row_idx] = accs[i]; + } + + // Lower warps update the output. + const float* src = &out_smem[warp_idx * HEAD_SIZE]; + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + ... + accs[i] += src[row_idx]; + } + + // Write out the accs. 
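+  // (Added interpretation of this pseudo code rather than of the exact
+  //  kernel source: each pass of the outer loop halves the number of warps
+  //  that still hold partial sums. The upper half spills its accs into
+  //  shared memory, the lower half adds them in, and after log2(NUM_WARPS)
+  //  passes the lowest warp holds the fully reduced accs for its head
+  //  positions and writes them out.)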
+} +``` ## Output -- Now we can write all of calculated result from local register memory - to final output global memory. - - ```cpp - scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE - + partition_idx * HEAD_SIZE; - ``` - -- First, we need to define the `out_ptr` variable, which points to - the start address of the assigned sequence and assigned head. - - ```cpp - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - from_float(*(out_ptr + row_idx), accs[i]); - } - } - ``` - -- Finally, we need to iterate over different assigned head positions - and write out the corresponding accumulated result based on the - `out_ptr`. +Now we can write all of calculated result from local register memory +to final output global memory. + +```cpp +scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + + head_idx * max_num_partitions * HEAD_SIZE + + partition_idx * HEAD_SIZE; +``` + +First, we need to define the `out_ptr` variable, which points to +the start address of the assigned sequence and assigned head. + +```cpp +for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + from_float(*(out_ptr + row_idx), accs[i]); + } +} +``` + +Finally, we need to iterate over different assigned head positions +and write out the corresponding accumulated result based on the +`out_ptr`. From 6d166a8d35b2b3f65d6c313040fdd3a76cb89b5c Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 21:06:38 +0800 Subject: [PATCH 127/192] [Doc] Add community links (#18657) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/.nav.yml | 4 +++- docs/community/meetups.md | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/.nav.yml b/docs/.nav.yml index e2b0ed560700..a41696d1404e 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -53,4 +53,6 @@ nav: preserve_directory_names: true - Community: - community/* - - vLLM Blog: https://blog.vllm.ai + - Blog: https://blog.vllm.ai + - Forum: https://discuss.vllm.ai + - Slack: https://slack.vllm.ai diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 2c47be443a5e..8ea42e3cad18 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -1,5 +1,5 @@ --- -title: vLLM Meetups +title: Meetups --- [](){ #meetups } From 2cd4d58df4cb4187e36ba9bdabc2819e6f579848 Mon Sep 17 00:00:00 2001 From: ztang2370 <ztang2370@gmail.com> Date: Sat, 24 May 2025 21:36:13 +0800 Subject: [PATCH 128/192] [Model] use AutoWeightsLoader for gpt2 (#18625) Signed-off-by: zt2370 <ztang2370@gmail.com> --- vllm/model_executor/models/gpt2.py | 73 ++++++++++++++++++------------ 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 470a7053e1b6..c2c310fca4d9 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -43,7 +43,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -235,6 +235,35 @@ def forward( hidden_states = 
self.ln_f(hidden_states) return hidden_states + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if ".attn.bias" in name or ".attn.masked_bias" in name: + # Skip attention mask. + # NOTE: "c_attn.bias" should not be skipped. + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + # The HF's GPT-2 implementation uses Conv1D instead of Linear. + # Because of this, we need to transpose the weights. + # Note(zhuohan): the logic below might break quantized models. + for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]: + if conv1d_weight_name not in name: + continue + if not name.endswith(".weight"): + continue + loaded_weight = loaded_weight.t() + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class GPT2LMHeadModel(nn.Module, SupportsPP): @@ -283,32 +312,16 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if ".attn.bias" in name or ".attn.masked_bias" in name: - # Skip attention mask. - # NOTE: "c_attn.bias" should not be skipped. - continue - if not name.startswith("transformer.") and not name.startswith( - "lm_head"): - name = "transformer." + name - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - # The HF's GPT-2 implementation uses Conv1D instead of Linear. - # Because of this, we need to transpose the weights. - # Note(zhuohan): the logic below might break quantized models. - for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]: - if conv1d_weight_name not in name: - continue - if not name.endswith(".weight"): - continue - loaded_weight = loaded_weight.t() - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + weights = _add_transformer_prefix(weights) + return loader.load_weights(weights) + + +def _add_transformer_prefix( + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: + for name, tensor in weights: + if not name.startswith('transformer.') and not name.startswith( + "lm_head"): + name = 'transformer.' 
+ name + yield name, tensor From 1cb194a0183db9b51cec6cb9ff473c276d8186de Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 22:25:33 +0800 Subject: [PATCH 129/192] [Doc] Reorganize user guide (#18661) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- CONTRIBUTING.md | 2 +- README.md | 2 +- docs/.nav.yml | 31 +-- docs/configuration/README.md | 4 + docs/configuration/conserving_memory.md | 144 ++++++++++++++ .../{serving => configuration}/engine_args.md | 0 docs/configuration/model_resolution.md | 23 +++ .../optimization.md | 5 +- docs/{serving => configuration}/serve_args.md | 0 docs/contributing/{overview.md => README.md} | 0 .../benchmarks.md | 0 docs/design/multiprocessing.md | 2 +- docs/design/v1/metrics.md | 2 +- docs/features/tool_calling.md | 2 +- docs/serving/offline_inference.md | 185 ------------------ docs/usage/README.md | 7 + docs/{serving => usage}/env_vars.md | 0 docs/{getting_started => usage}/faq.md | 0 docs/{serving => usage}/metrics.md | 0 .../reproducibility.md} | 2 +- docs/{deployment => usage}/security.md | 2 +- .../troubleshooting.md | 6 +- docs/{serving => usage}/usage_stats.md | 0 .../v1_user_guide.md => usage/v1_guide.md} | 2 +- vllm/envs.py | 2 +- vllm/utils.py | 2 +- 27 files changed, 211 insertions(+), 216 deletions(-) create mode 100644 docs/configuration/README.md create mode 100644 docs/configuration/conserving_memory.md rename docs/{serving => configuration}/engine_args.md (100%) create mode 100644 docs/configuration/model_resolution.md rename docs/{performance => configuration}/optimization.md (99%) rename docs/{serving => configuration}/serve_args.md (100%) rename docs/contributing/{overview.md => README.md} (100%) rename docs/{performance => contributing}/benchmarks.md (100%) create mode 100644 docs/usage/README.md rename docs/{serving => usage}/env_vars.md (100%) rename docs/{getting_started => usage}/faq.md (100%) rename docs/{serving => usage}/metrics.md (100%) rename docs/{serving/seed_parameter_behavior.md => usage/reproducibility.md} (98%) rename docs/{deployment => usage}/security.md (99%) rename docs/{getting_started => usage}/troubleshooting.md (97%) rename docs/{serving => usage}/usage_stats.md (100%) rename docs/{getting_started/v1_user_guide.md => usage/v1_guide.md} (99%) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7042e81a84da..65be771b94fb 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) <!--- pyml disable-next-line no-emphasis-as-heading --> -**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions) +**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6d46a6dca371..2947aad75ee5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,3 @@ # Contributing to vLLM -You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing). 
diff --git a/README.md b/README.md index d2b795b962a8..3dd214e9b451 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ## Contributing We welcome and value any contributions and collaborations. -Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. +Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing) for how to get involved. ## Sponsors diff --git a/docs/.nav.yml b/docs/.nav.yml index a41696d1404e..100841aecf61 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -5,29 +5,35 @@ nav: - getting_started/quickstart.md - getting_started/installation - Examples: - - LMCache: getting_started/examples/lmcache - - getting_started/examples/offline_inference - - getting_started/examples/online_serving - - getting_started/examples/other + - Offline Inference: getting_started/examples/offline_inference + - Online Serving: getting_started/examples/online_serving + - Others: + - LMCache: getting_started/examples/lmcache + - getting_started/examples/other/* - Quick Links: - - User Guide: serving/offline_inference.md - - Developer Guide: contributing/overview.md + - User Guide: usage/README.md + - Developer Guide: contributing/README.md - API Reference: api/README.md - Timeline: - Roadmap: https://roadmap.vllm.ai - Releases: https://github.com/vllm-project/vllm/releases - User Guide: + - usage/README.md + - General: + - usage/* - Inference and Serving: - serving/offline_inference.md - serving/openai_compatible_server.md - serving/* - serving/integrations - - Training: training - Deployment: - deployment/* - deployment/frameworks - deployment/integrations - - Performance: performance + - Training: training + - Configuration: + - Summary: configuration/README.md + - configuration/* - Models: - models/supported_models.md - models/generative_models.md @@ -37,12 +43,11 @@ nav: - features/compatibility_matrix.md - features/* - features/quantization - - Other: - - getting_started/* - Developer Guide: - - contributing/overview.md - - glob: contributing/* - flatten_single_child_sections: true + - contributing/README.md + - General: + - glob: contributing/* + flatten_single_child_sections: true - Model Implementation: contributing/model - Design Documents: - V0: design diff --git a/docs/configuration/README.md b/docs/configuration/README.md new file mode 100644 index 000000000000..442a8d441430 --- /dev/null +++ b/docs/configuration/README.md @@ -0,0 +1,4 @@ +# Configuration Options + +This section lists the most common options for running the vLLM engine. +For a full list, refer to the [configuration][configuration] page. diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md new file mode 100644 index 000000000000..a1283a503a6d --- /dev/null +++ b/docs/configuration/conserving_memory.md @@ -0,0 +1,144 @@ +# Conserving Memory + +Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. + +## Tensor Parallelism (TP) + +Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs. + +The following code splits the model across 2 GPUs. + +```python +from vllm import LLM + +llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", + tensor_parallel_size=2) +``` + +!!! warning + To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. 
[torch.cuda.set_device][]) + before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. + + To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. + +!!! note + With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). + + You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + +## Quantization + +Quantized models take less memory at the cost of lower precision. + +Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI)) +and used directly without extra configuration. + +Dynamic quantization is also supported via the `quantization` option -- see [here][quantization-index] for more details. + +## Context length and batch size + +You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) +and the maximum batch size (`max_num_seqs` option). + +```python +from vllm import LLM + +llm = LLM(model="adept/fuyu-8b", + max_model_len=2048, + max_num_seqs=2) +``` + +## Reduce CUDA Graphs + +By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. + +!!! warning + CUDA graph capture takes up more memory in V1 than in V0. + +You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: + +```python +from vllm import LLM +from vllm.config import CompilationConfig, CompilationLevel + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + # By default, it goes up to max_num_seqs + cudagraph_capture_sizes=[1, 2, 4, 8, 16], + ), +) +``` + +You can disable graph capturing completely via the `enforce_eager` flag: + +```python +from vllm import LLM + +llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", + enforce_eager=True) +``` + +## Adjust cache size + +If you run out of CPU RAM, try the following options: + +- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). +- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). + +## Multi-modal input limits + +You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: + +```python +from vllm import LLM + +# Accept up to 3 images and 1 video per prompt +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"image": 3, "video": 1}) +``` + +You can go a step further and disable unused modalities completely by setting its limit to zero. +For example, if your application only accepts image input, there is no need to allocate any memory for videos. + +```python +from vllm import LLM + +# Accept any number of images but no videos +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + limit_mm_per_prompt={"video": 0}) +``` + +You can even run a multi-modal model for text-only inference: + +```python +from vllm import LLM + +# Don't accept images. Just text. 
+llm = LLM(model="google/gemma-3-27b-it", + limit_mm_per_prompt={"image": 0}) +``` + +## Multi-modal processor arguments + +For certain models, you can adjust the multi-modal processor arguments to +reduce the size of the processed multi-modal inputs, which in turn saves memory. + +Here are some examples: + +```python +from vllm import LLM + +# Available for Qwen2-VL series models +llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", + mm_processor_kwargs={ + "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 + }) + +# Available for InternVL series models +llm = LLM(model="OpenGVLab/InternVL2-2B", + mm_processor_kwargs={ + "max_dynamic_patch": 4, # Default is 12 + }) +``` diff --git a/docs/serving/engine_args.md b/docs/configuration/engine_args.md similarity index 100% rename from docs/serving/engine_args.md rename to docs/configuration/engine_args.md diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md new file mode 100644 index 000000000000..8757c257d3e9 --- /dev/null +++ b/docs/configuration/model_resolution.md @@ -0,0 +1,23 @@ +# Model Resolution + +vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository +and finding the corresponding implementation that is registered to vLLM. +Nevertheless, our model resolution may fail for the following reasons: + +- The `config.json` of the model repository lacks the `architectures` field. +- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. +- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. + +To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. +For example: + +```python +from vllm import LLM + +model = LLM( + model="cerebras/Cerebras-GPT-1.3B", + hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 +) +``` + +Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM. diff --git a/docs/performance/optimization.md b/docs/configuration/optimization.md similarity index 99% rename from docs/performance/optimization.md rename to docs/configuration/optimization.md index 57e01a384b52..811925c19e63 100644 --- a/docs/performance/optimization.md +++ b/docs/configuration/optimization.md @@ -1,7 +1,4 @@ ---- -title: Optimization and Tuning ---- -[](){ #optimization-and-tuning } +# Optimization and Tuning This guide covers optimization strategies and performance tuning for vLLM V1. diff --git a/docs/serving/serve_args.md b/docs/configuration/serve_args.md similarity index 100% rename from docs/serving/serve_args.md rename to docs/configuration/serve_args.md diff --git a/docs/contributing/overview.md b/docs/contributing/README.md similarity index 100% rename from docs/contributing/overview.md rename to docs/contributing/README.md diff --git a/docs/performance/benchmarks.md b/docs/contributing/benchmarks.md similarity index 100% rename from docs/performance/benchmarks.md rename to docs/contributing/benchmarks.md diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md index 649edfcce69b..412c42fd580e 100644 --- a/docs/design/multiprocessing.md +++ b/docs/design/multiprocessing.md @@ -123,7 +123,7 @@ what is happening. First, a log message from vLLM: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. 
Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + https://docs.vllm.ai/en/latest/usage/debugging.html#python-multiprocessing for more information. ``` diff --git a/docs/design/v1/metrics.md b/docs/design/v1/metrics.md index 2631f28e46e4..6080390ba0ed 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -57,7 +57,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics` - `vllm:spec_decode_num_draft_tokens_total` (Counter) - `vllm:spec_decode_num_emitted_tokens_total` (Counter) -These are documented under [Inferencing and Serving -> Production Metrics](../../serving/metrics.md). +These are documented under [Inferencing and Serving -> Production Metrics](../../usage/metrics.md). ### Grafana Dashboard diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 75cd00e24d7b..6ee1060dd050 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -93,7 +93,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha ## Required Function Calling -vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine. +vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/usage/v1_guide.html#feature-model) for the V1 engine. When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. diff --git a/docs/serving/offline_inference.md b/docs/serving/offline_inference.md index 584d7cd143bc..b238199e4144 100644 --- a/docs/serving/offline_inference.md +++ b/docs/serving/offline_inference.md @@ -27,188 +27,3 @@ Please refer to the above pages for more details about each API. !!! info [API Reference][offline-inference-api] - -[](){ #configuration-options } - -## Configuration Options - -This section lists the most common options for running the vLLM engine. -For a full list, refer to the [configuration][configuration] page. - -[](){ #model-resolution } - -### Model resolution - -vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository -and finding the corresponding implementation that is registered to vLLM. -Nevertheless, our model resolution may fail for the following reasons: - -- The `config.json` of the model repository lacks the `architectures` field. 
-- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. -- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. - -To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. -For example: - -```python -from vllm import LLM - -model = LLM( - model="cerebras/Cerebras-GPT-1.3B", - hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 -) -``` - -Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM. - -[](){ #reducing-memory-usage } - -### Reducing memory usage - -Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem. - -#### Tensor Parallelism (TP) - -Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs. - -The following code splits the model across 2 GPUs. - -```python -from vllm import LLM - -llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", - tensor_parallel_size=2) -``` - -!!! warning - To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) - before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. - - To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. - -!!! note - With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). - - You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. - -#### Quantization - -Quantized models take less memory at the cost of lower precision. - -Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI)) -and used directly without extra configuration. - -Dynamic quantization is also supported via the `quantization` option -- see [here][quantization-index] for more details. - -#### Context length and batch size - -You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) -and the maximum batch size (`max_num_seqs` option). - -```python -from vllm import LLM - -llm = LLM(model="adept/fuyu-8b", - max_model_len=2048, - max_num_seqs=2) -``` - -#### Reduce CUDA Graphs - -By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU. - -!!! warning - CUDA graph capture takes up more memory in V1 than in V0. 
- -You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage: - -```python -from vllm import LLM -from vllm.config import CompilationConfig, CompilationLevel - -llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", - compilation_config=CompilationConfig( - level=CompilationLevel.PIECEWISE, - # By default, it goes up to max_num_seqs - cudagraph_capture_sizes=[1, 2, 4, 8, 16], - ), -) -``` - -You can disable graph capturing completely via the `enforce_eager` flag: - -```python -from vllm import LLM - -llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", - enforce_eager=True) -``` - -#### Adjust cache size - -If you run out of CPU RAM, try the following options: - -- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB). -- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). - -#### Multi-modal input limits - -You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model: - -```python -from vllm import LLM - -# Accept up to 3 images and 1 video per prompt -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"image": 3, "video": 1}) -``` - -You can go a step further and disable unused modalities completely by setting its limit to zero. -For example, if your application only accepts image input, there is no need to allocate any memory for videos. - -```python -from vllm import LLM - -# Accept any number of images but no videos -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - limit_mm_per_prompt={"video": 0}) -``` - -You can even run a multi-modal model for text-only inference: - -```python -from vllm import LLM - -# Don't accept images. Just text. -llm = LLM(model="google/gemma-3-27b-it", - limit_mm_per_prompt={"image": 0}) -``` - -#### Multi-modal processor arguments - -For certain models, you can adjust the multi-modal processor arguments to -reduce the size of the processed multi-modal inputs, which in turn saves memory. - -Here are some examples: - -```python -from vllm import LLM - -# Available for Qwen2-VL series models -llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", - mm_processor_kwargs={ - "max_pixels": 768 * 768, # Default is 1280 * 28 * 28 - }) - -# Available for InternVL series models -llm = LLM(model="OpenGVLab/InternVL2-2B", - mm_processor_kwargs={ - "max_dynamic_patch": 4, # Default is 12 - }) -``` - -### Performance optimization and tuning - -You can potentially improve the performance of vLLM by finetuning various options. -Please refer to [this guide][optimization-and-tuning] for more details. diff --git a/docs/usage/README.md b/docs/usage/README.md new file mode 100644 index 000000000000..681db57d8e0f --- /dev/null +++ b/docs/usage/README.md @@ -0,0 +1,7 @@ +# Using vLLM + +vLLM supports the following usage patterns: + +- [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model. +- [Deployment](../deployment/docker.md): Scale up model instances for production. +- [Training](../training/rlhf.md): Train or fine-tune a model. 
diff --git a/docs/serving/env_vars.md b/docs/usage/env_vars.md similarity index 100% rename from docs/serving/env_vars.md rename to docs/usage/env_vars.md diff --git a/docs/getting_started/faq.md b/docs/usage/faq.md similarity index 100% rename from docs/getting_started/faq.md rename to docs/usage/faq.md diff --git a/docs/serving/metrics.md b/docs/usage/metrics.md similarity index 100% rename from docs/serving/metrics.md rename to docs/usage/metrics.md diff --git a/docs/serving/seed_parameter_behavior.md b/docs/usage/reproducibility.md similarity index 98% rename from docs/serving/seed_parameter_behavior.md rename to docs/usage/reproducibility.md index 301847292b83..542b83a10352 100644 --- a/docs/serving/seed_parameter_behavior.md +++ b/docs/usage/reproducibility.md @@ -1,4 +1,4 @@ -# Seed Parameter Behavior +# Reproducibility ## Overview diff --git a/docs/deployment/security.md b/docs/usage/security.md similarity index 99% rename from docs/deployment/security.md rename to docs/usage/security.md index 9c4d639c0b3d..f1661828d68a 100644 --- a/docs/deployment/security.md +++ b/docs/usage/security.md @@ -1,4 +1,4 @@ -# Security Guide +# Security ## Inter-Node Communication diff --git a/docs/getting_started/troubleshooting.md b/docs/usage/troubleshooting.md similarity index 97% rename from docs/getting_started/troubleshooting.md rename to docs/usage/troubleshooting.md index 07e30f9684ae..889cfccdacac 100644 --- a/docs/getting_started/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -23,7 +23,7 @@ It'd be better to store the model in a local disk. Additionally, have a look at ## Out of memory -If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options][reducing-memory-usage] to reduce the memory consumption. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](../configuration/conserving_memory.md) to reduce the memory consumption. ## Generation quality changed @@ -159,7 +159,7 @@ If you have seen a warning in your logs like this: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing + https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. ``` @@ -258,7 +258,7 @@ or: ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...] ``` -But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps][model-resolution] to explicitly specify the vLLM implementation for the model. +But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps](../configuration/model_resolution.md) to explicitly specify the vLLM implementation for the model. 
## Failed to infer device type diff --git a/docs/serving/usage_stats.md b/docs/usage/usage_stats.md similarity index 100% rename from docs/serving/usage_stats.md rename to docs/usage/usage_stats.md diff --git a/docs/getting_started/v1_user_guide.md b/docs/usage/v1_guide.md similarity index 99% rename from docs/getting_started/v1_user_guide.md rename to docs/usage/v1_guide.md index de90b8a7851e..3d5d7ce45cce 100644 --- a/docs/getting_started/v1_user_guide.md +++ b/docs/usage/v1_guide.md @@ -1,4 +1,4 @@ -# vLLM V1 User Guide +# vLLM V1 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). diff --git a/vllm/envs.py b/vllm/envs.py index 88953af1042f..b007bf8c59b7 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -164,7 +164,7 @@ def get_vllm_port() -> Optional[int]: raise ValueError( f"VLLM_PORT '{port}' appears to be a URI. " "This may be caused by a Kubernetes service discovery issue" - "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html" + "check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html" ) except Exception: pass diff --git a/vllm/utils.py b/vllm/utils.py index 5be5304ee0ab..86873ff75817 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2531,7 +2531,7 @@ def _maybe_force_spawn(): logger.warning( "We must use the `spawn` multiprocessing start method. " "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " - "See https://docs.vllm.ai/en/latest/getting_started/" + "See https://docs.vllm.ai/en/latest/usage/" "troubleshooting.html#python-multiprocessing " "for more information. Reason: %s", reason) os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" From 2e6705784f3feb49e1100cc2022ff7ca7f899020 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sat, 24 May 2025 22:26:45 +0800 Subject: [PATCH 130/192] [CI/Build] `chmod +x` to `cleanup_pr_body.sh` (#18650) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .github/scripts/cleanup_pr_body.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .github/scripts/cleanup_pr_body.sh diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh old mode 100644 new mode 100755 From 4ceafb6299c790ddc2be863a71227ae03da77c60 Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Sat, 24 May 2025 22:52:09 +0800 Subject: [PATCH 131/192] [MISC] typo fix and clean import (#18664) Signed-off-by: Andy Xie <andy.xning@gmail.com> --- vllm/distributed/kv_transfer/__init__.py | 3 +-- vllm/distributed/kv_transfer/kv_connector/simple_connector.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index a9f26607de49..8b6abf5a80dd 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -1,8 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType from vllm.distributed.kv_transfer.kv_transfer_state import ( - ensure_kv_transfer_initialized, get_kv_transfer_group, + KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group, has_kv_transfer_group, is_v1_kv_transfer_group) __all__ = [ diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py 
b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 2e4bd20740e2..ed8fe38161e9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -106,7 +106,7 @@ def __init__( else: # the current vLLM instance is KV consumer, so it needs to connect - # its recv pipe to the send pipe of KV producder + # its recv pipe to the send pipe of KV producer if self.config.kv_connector == "PyNcclConnector": self.consumer_data_pipe = PyNcclPipe( local_rank=local_rank, From b9018a3f9f81031fea0cf228f502a6d1c64b7a5f Mon Sep 17 00:00:00 2001 From: wangxiyuan <wangxiyuan1007@gmail.com> Date: Sat, 24 May 2025 22:53:36 +0800 Subject: [PATCH 132/192] [BugFix] Fix import error for fused_moe (#18642) Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> --- vllm/model_executor/layers/fused_moe/layer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 31295582c1b1..b101f5862fa7 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -41,6 +41,7 @@ from .pplx_prepare_finalize import PplxPrepareAndFinalize else: fused_experts = None # type: ignore + FusedMoEPermuteExpertsUnpermute = None # type: ignore FusedMoEPrepareAndFinalize = None # type: ignore if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 From 2807271c860f2dee3a6789305c0dee5fc089964d Mon Sep 17 00:00:00 2001 From: Aaron Pham <contact@aarnphm.xyz> Date: Sat, 24 May 2025 11:04:14 -0400 Subject: [PATCH 133/192] [CI] enforce import regex instead of re (#18665) Signed-off-by: Aaron Pham <contact@aarnphm.xyz> --- .pre-commit-config.yaml | 7 +++ tools/enforce_regex_import.py | 83 +++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tools/enforce_regex_import.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ee186be21094..5ee909c3b8ca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -128,6 +128,13 @@ repos: name: Update Dockerfile dependency graph entry: tools/update-dockerfile-graph.sh language: script + - id: enforce-import-regex-instead-of-re + name: Enforce import regex as re + entry: python tools/enforce_regex_import.py + language: python + types: [python] + pass_filenames: false + additional_dependencies: [regex] # forbid directly import triton - id: forbid-direct-triton-import name: "Forbid direct 'import triton'" diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py new file mode 100644 index 000000000000..b55c4a94eac8 --- /dev/null +++ b/tools/enforce_regex_import.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import subprocess +from pathlib import Path + +import regex as re + +FORBIDDEN_PATTERNS = re.compile( + r'^\s*(?:import\s+re(?:$|\s|,)|from\s+re\s+import)') +ALLOWED_PATTERNS = [ + re.compile(r'^\s*import\s+regex\s+as\s+re\s*$'), + re.compile(r'^\s*import\s+regex\s*$'), +] + + +def get_staged_python_files() -> list[str]: + try: + result = subprocess.run( + ['git', 'diff', '--cached', '--name-only', '--diff-filter=AM'], + capture_output=True, + text=True, + check=True) + files = result.stdout.strip().split( + '\n') if result.stdout.strip() else [] + return [f for f in files if f.endswith('.py')] + except subprocess.CalledProcessError: + return [] + + +def is_forbidden_import(line: 
str) -> bool: + line = line.strip() + return bool( + FORBIDDEN_PATTERNS.match(line) + and not any(pattern.match(line) for pattern in ALLOWED_PATTERNS)) + + +def check_file(filepath: str) -> list[tuple[int, str]]: + violations = [] + try: + with open(filepath, encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + if is_forbidden_import(line): + violations.append((line_num, line.strip())) + except (OSError, UnicodeDecodeError): + pass + return violations + + +def main() -> int: + files = get_staged_python_files() + if not files: + return 0 + + total_violations = 0 + + for filepath in files: + if not Path(filepath).exists(): + continue + + violations = check_file(filepath) + if violations: + print(f"\n❌ {filepath}:") + for line_num, line in violations: + print(f" Line {line_num}: {line}") + total_violations += 1 + + if total_violations > 0: + print(f"\n💡 Found {total_violations} violation(s).") + print("❌ Please replace 'import re' with 'import regex as re'") + print( + " Also replace 'from re import ...' with 'from regex import ...'" + ) # noqa: E501 + print("✅ Allowed imports:") + print(" - import regex as re") + print(" - import regex") # noqa: E501 + return 1 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 9ea7f1abf3c015c5e560da099ad347e88b4a517d Mon Sep 17 00:00:00 2001 From: Aaron Pham <contact@aarnphm.xyz> Date: Sat, 24 May 2025 11:25:20 -0400 Subject: [PATCH 134/192] fix(regression): clone from reference items (#18662) Signed-off-by: Aaron Pham <contact@aarnphm.xyz> --- vllm/sequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index e9212a82506e..a4b4bd66c843 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1494,7 +1494,7 @@ def add_request(request_id: str, engine, params, **kwargs): for i in range(original_params.n): request_id_i = f"{request_id}_parallel_sample_{i}" group.seq_id_to_index[request_id_i] = i - params = params.clone() + params = original_params.clone() params.n = 1 if params.seed is not None: params.seed += i From b554ab736e7c725bf7ddebb80b814e6c53232b46 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 25 May 2025 00:09:10 +0800 Subject: [PATCH 135/192] [CI/Build] fix permission denied issue (#18645) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- .github/workflows/cleanup_pr_body.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 50fea0c43cb8..3250b6671989 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -23,4 +23,4 @@ jobs: - name: Update PR description env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" + run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" From 6825d9a998df3f22b5e19fb600d0a0c09950db28 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon <woosuk.kwon@berkeley.edu> Date: Sat, 24 May 2025 17:33:46 -0700 Subject: [PATCH 136/192] [BugFix][Spec Decode] Improve Prefix Caching Logic in Speculative Decoding (#18668) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> --- vllm/v1/core/kv_cache_manager.py | 3 ++- vllm/v1/core/sched/scheduler.py | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index da18ece7555a..0f6098d2b400 100644 --- 
a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -174,6 +174,7 @@ def allocate_slots( num_new_tokens: int, num_new_computed_tokens: int = 0, new_computed_blocks: Optional[KVCacheBlocks] = None, + num_draft_tokens: int = 0, num_lookahead_tokens: int = 0, delay_cache_blocks: bool = False, ) -> Optional[KVCacheBlocks]: @@ -273,7 +274,7 @@ def allocate_slots( # generated (accepted) tokens. self.single_type_manager.cache_blocks( request, self.req_to_block_hashes[request.request_id], - num_computed_tokens + num_new_tokens - len(request.spec_token_ids)) + num_computed_tokens + num_new_tokens - num_draft_tokens) return KVCacheBlocks(new_blocks) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index efc0de350fba..4c6b3eea0cb7 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -227,10 +227,15 @@ def schedule(self) -> SchedulerOutput: req_index += 1 continue + num_draft_tokens = max( + num_new_tokens + request.num_computed_tokens - + request.num_tokens, 0) + while True: new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens, + num_draft_tokens=num_draft_tokens, num_lookahead_tokens=self.num_lookahead_tokens) if new_blocks is None: # The request cannot be scheduled. From 7891fdf0c64777afba3c3a8dc93447600a0c5bba Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Sat, 24 May 2025 20:07:20 -0700 Subject: [PATCH 137/192] [V1] Fix _pickle.PicklingError: Can't pickle <class 'transformers_modules.deepseek-ai.DeepSeek-V2-Lite... (#18640) Signed-off-by: Seiji Eicher <seiji@anyscale.com> --- vllm/v1/engine/async_llm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0d646d8dd575..74c2251c7521 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -20,6 +20,8 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -80,6 +82,9 @@ def __init__( "AsyncLLMEngine.from_vllm_config(...) 
or explicitly set " "VLLM_USE_V1=0 or 1 and report this issue on Github.") + # Ensure we can serialize custom transformer configs + maybe_register_config_serialize_by_value() + self.model_config = vllm_config.model_config self.vllm_config = vllm_config self.log_requests = log_requests From 6c6dcd8611933fb2dd1842b26e63c06a9b018e4f Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Sun, 25 May 2025 11:17:47 +0800 Subject: [PATCH 138/192] [MISC] correct signature for LoaderFunction (#18670) Signed-off-by: Andy Xie <andy.xning@gmail.com> --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a1cf43328bab..1547a016ab88 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -634,7 +634,7 @@ def row_parallel_weight_loader(param: torch.Tensor, return default_weight_loader(param, loaded_weight) -LoaderFunction = Callable[[torch.Tensor, torch.Tensor], torch.Tensor] +LoaderFunction = Callable[[torch.Tensor, torch.Tensor], None] def sharded_weight_loader(shard_axis: int) -> LoaderFunction: From cebc22f3b674e58fa004625c6afe5a6ec6f2d9de Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Sun, 25 May 2025 11:26:31 +0800 Subject: [PATCH 139/192] [Misc]Replace `cuda` hard code with `current_platform` in Ray (#14668) Signed-off-by: noemotiovon <757486878@qq.com> --- vllm/executor/ray_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 37cc07bfbb36..7bc98a16f041 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -87,9 +87,8 @@ def execute_model_spmd( # TODO(swang): This is needed right now because Ray Compiled Graph # executes on a background thread, so we need to reset torch's # current device. 
- import torch if not self.compiled_dag_cuda_device_set: - torch.cuda.set_device(self.worker.device) + current_platform.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True output = self.worker._execute_model_spmd(execute_model_req, @@ -113,8 +112,7 @@ def setup_device_if_necessary(self): # Not needed pass else: - import torch - torch.cuda.set_device(self.worker.device) + current_platform.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True From 6ab681bcbe9ba8318cf3aafd318f375ef8fd7de3 Mon Sep 17 00:00:00 2001 From: Mengqing Cao <cmq0113@163.com> Date: Sun, 25 May 2025 12:51:21 +0800 Subject: [PATCH 140/192] [Misc][ModelScope] Change to use runtime VLLM_USE_MODELSCOPE (#18655) Signed-off-by: Mengqing Cao <cmq0113@163.com> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- tests/test_regression.py | 3 +++ .../model_loader/default_loader.py | 4 ++-- vllm/transformers_utils/__init__.py | 4 ++-- vllm/transformers_utils/config.py | 21 +++++++++---------- vllm/transformers_utils/tokenizer.py | 4 ++-- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index 8c9d4a91c73b..e092945422ed 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -60,6 +60,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary with monkeypatch.context() as m: m.setenv("VLLM_USE_MODELSCOPE", "True") + # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail + # with 400 Client Error: Bad Request. + m.setenv("HF_TOKEN", "") llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") prompts = [ diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index ddbd60940e9e..29a6e0af4bc6 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -11,8 +11,8 @@ from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME +from vllm import envs from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig -from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.utils import ( @@ -64,7 +64,7 @@ def _maybe_download_from_modelscope( Returns the path to the downloaded model, or None if the model is not downloaded from ModelScope.""" - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. # pylint: disable=C. 
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index b556976a51ba..84bd7a747656 100644 --- a/vllm/transformers_utils/__init__.py +++ b/vllm/transformers_utils/__init__.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -from vllm.envs import VLLM_USE_MODELSCOPE +from vllm import envs -if VLLM_USE_MODELSCOPE: +if envs.VLLM_USE_MODELSCOPE: try: # Patch here, before each import happens import modelscope diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2a2a8c181874..69e7207cc350 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -24,7 +24,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME -from vllm.envs import VLLM_USE_MODELSCOPE +from vllm import envs from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable @@ -45,13 +45,12 @@ from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname -if VLLM_USE_MODELSCOPE: +if envs.VLLM_USE_MODELSCOPE: from modelscope import AutoConfig else: from transformers import AutoConfig MISTRAL_CONFIG_NAME = "params.json" -HF_TOKEN = os.getenv('HF_TOKEN', None) logger = init_logger(__name__) @@ -130,7 +129,7 @@ def lookup_files() -> list[str]: ] # if model is remote, use hf_hub api to list files try: - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: from vllm.transformers_utils.utils import ( modelscope_list_repo_files) return modelscope_list_repo_files(repo_id, @@ -185,7 +184,7 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, return file_exists(str(model), config_name, revision=revision, - token=HF_TOKEN) + token=os.getenv('HF_TOKEN', None)) def patch_rope_scaling(config: PretrainedConfig) -> None: @@ -312,7 +311,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=HF_TOKEN, + token=os.getenv('HF_TOKEN', None), **kwargs, ) @@ -324,7 +323,7 @@ def get_config( model, revision=revision, code_revision=code_revision, - token=HF_TOKEN, + token=os.getenv('HF_TOKEN', None), **kwargs, ) else: @@ -334,7 +333,7 @@ def get_config( trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, - token=HF_TOKEN, + token=os.getenv('HF_TOKEN', None), **kwargs, ) except ValueError as e: @@ -352,7 +351,7 @@ def get_config( raise e elif config_format == ConfigFormat.MISTRAL: - config = load_params_config(model, revision, token=HF_TOKEN, **kwargs) + config = load_params_config(model, revision, **kwargs) else: supported_formats = [ fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO @@ -561,7 +560,7 @@ def get_sentence_transformer_tokenizer_config(model: str, # If model is on HuggingfaceHub, get the repo files repo_files = list_repo_files(model, revision=revision, - token=HF_TOKEN) + token=os.getenv('HF_TOKEN', None)) except Exception: repo_files = [] @@ -768,7 +767,7 @@ def get_hf_image_processor_config( **kwargs, ) -> dict[str, Any]: # ModelScope does not provide an interface for image_processor - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: return dict() # Separate model folder from file path for GGUF models if check_gguf_file(model): diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index e31580ede57b..fa7a208c48ed 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -13,7 +13,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, 
PreTrainedTokenizerFast) -from vllm.envs import VLLM_USE_MODELSCOPE +from vllm import envs from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer_base import (TokenizerBase, @@ -168,7 +168,7 @@ def get_tokenizer( ) -> AnyTokenizer: """Gets a tokenizer for the given model name via HuggingFace or ModelScope. """ - if VLLM_USE_MODELSCOPE: + if envs.VLLM_USE_MODELSCOPE: # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. # pylint: disable=C. From 75f81750f3a9071b23f7f2d5c9f9f1c2cd0091b1 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Sun, 25 May 2025 12:51:25 +0800 Subject: [PATCH 141/192] [VLM] Initialize video input support for InternVL models (#18499) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> --- docs/models/supported_models.md | 5 +- examples/offline_inference/vision_language.py | 15 +- .../multimodal/generation/test_common.py | 11 + .../generation/vlm_utils/model_utils.py | 86 ++- .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 3 +- vllm/entrypoints/chat_utils.py | 2 + vllm/model_executor/models/h2ovl.py | 11 +- vllm/model_executor/models/internvl.py | 511 +++++++++++++++++- vllm/model_executor/models/nvlm_d.py | 13 +- 10 files changed, 596 insertions(+), 62 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 5a402ee88c61..4b19272f4a28 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -527,7 +527,7 @@ Specified using `--task generate`. | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | ✅︎ | ✅︎\* | | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | ✅︎ | | -| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | | +| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | ✅︎ | | | | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | | | `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | ✅︎ | ✅︎ | | @@ -577,6 +577,9 @@ Specified using `--task generate`. This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. +!!! 
note + Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently. + !!! note `h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80. diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 941fcd381dea..c8b1e9aba5dc 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -330,22 +330,26 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "OpenGVLab/InternVL2-2B" + model_name = "OpenGVLab/InternVL3-2B" engine_args = EngineArgs( model=model_name, trust_remote_code=True, - max_model_len=4096, + max_model_len=8192, limit_mm_per_prompt={modality: 1}, ) + if modality == "image": + placeholder = "<image>" + elif modality == "video": + placeholder = "<video>" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) messages = [[{ 'role': 'user', - 'content': f"<image>\n{question}" + 'content': f"{placeholder}\n{question}" }] for question in questions] prompts = tokenizer.apply_chat_template(messages, tokenize=False, @@ -357,6 +361,9 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + stop_token_ids = [ + token_id for token_id in stop_token_ids if token_id is not None + ] return ModelRequestData( engine_args=engine_args, diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index d51a03dfea7e..e4e48f9951cf 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -349,6 +349,17 @@ use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, ), + "intern_vl-video": VLMTestInfo( + models=[ + "OpenGVLab/InternVL3-1B", + ], + test_type=VLMTestType.VIDEO, + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + video_idx_to_prompt=lambda idx: "<video>", + max_model_len=8192, + use_tokenizer_eos=True, + patch_hf_runner=model_utils.internvl_patch_hf_runner, + ), "kimi_vl": VLMTestInfo( models=["moonshotai/Kimi-VL-A3B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 743c7f947697..dc1ea5208240 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -7,6 +7,8 @@ from pathlib import PosixPath from typing import Optional, Union +import numpy as np +import numpy.typing as npt import regex as re import torch from PIL.Image import Image @@ -495,30 +497,74 @@ def __init__(self, hf_runner: HfRunner): self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], - **kwargs): + def __call__( + self, + text: str, + images: Union[Image, list[Image]] = None, + videos: Union[npt.NDArray, list[npt.NDArray]] = None, + **kwargs, + ): 
from vllm.model_executor.models.internvl import ( IMG_CONTEXT, IMG_END, IMG_START, - image_to_pixel_values_internvl) + image_to_pixel_values_internvl, video_to_pixel_values_internvl) images = [images] if isinstance(images, Image) else images - pixel_values = [ - image_to_pixel_values_internvl( - image, - input_size=self.image_size, - min_num=self.min_num, - max_num=self.max_num, - use_thumbnail=self.use_thumbnail, - ) for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values - ] + videos = [videos] if isinstance(videos, np.ndarray) else videos + if images is not None: + pixel_values_images = [ + image_to_pixel_values_internvl( + image, + input_size=self.image_size, + min_num=self.min_num, + max_num=self.max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] + num_patches_images = [ + pixel_value.shape[0] for pixel_value in pixel_values_images + ] + else: + pixel_values_images, num_patches_images = [], [] + + if videos is not None: + pixel_values_videos = [ + video_to_pixel_values_internvl( + video, + input_size=self.image_size, + min_num=1, + max_num=1, + use_thumbnail=False, + ) for video in videos + ] + num_patches_videos = [ + pixel_value.shape[0] for pixel_value in pixel_values_videos + ] + else: + pixel_values_videos, num_patches_videos = [], [] + + pixel_values = [] + while ("<image>" in text) or ("<video>" in text): + image_index = text.find("<image>") + video_index = text.find("<video>") + if image_index == -1 or (video_index > -1 + and video_index < image_index): + num_patches = num_patches_videos.pop(0) + pixel_values.append(pixel_values_videos.pop(0)) + context_tokens = IMG_START + \ + IMG_CONTEXT * self.num_image_token + IMG_END + video_tokens = ''.join([ + f'Frame{i+1}: {context_tokens}' + for i in range(num_patches) + ]) + text = text.replace('<video>', video_tokens, 1) + else: + num_patches = num_patches_images.pop(0) + pixel_values.append(pixel_values_images.pop(0)) + context_tokens = IMG_CONTEXT * self.num_image_token \ + * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('<image>', image_tokens, 1) pixel_values = torch.cat(pixel_values, dim=0) - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('<image>', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") prompt.update({"pixel_values": pixel_values}) return prompt diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index e6b70a4438e9..a107eae6de5e 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -258,6 +258,7 @@ def _test_processing_correctness_mistral( "ibm-granite/granite-speech-3.3-8b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", + "OpenGVLab/InternVL3-1B", "HuggingFaceM4/Idefics3-8B-Llama3", "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "moonshotai/Kimi-VL-A3B-Instruct", diff --git a/tests/models/registry.py b/tests/models/registry.py index 977be6475714..bf7729d4e044 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -334,7 +334,8 @@ def check_available_online( max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible."), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", - extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501 + 
extras={"2B": "OpenGVLab/InternVL2-2B", + "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501 trust_remote_code=True), "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index e8d10017a1e9..ec1b327da905 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -556,6 +556,8 @@ def _placeholder_str(self, modality: ModalityStr, return "(<audio>./</audio>)" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": + if model_type == "internvl_chat": + return "<video>" if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|video_pad|><|vision_end|>" if model_type == "qwen2_5_omni": diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 99c226439ecb..904f5330c653 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -25,9 +25,10 @@ from .intern_vit import InternVisionModel from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, + BaseInternVLDummyInputsBuilder, + BaseInternVLMultiModalProcessor, BaseInternVLProcessingInfo, BaseInternVLProcessor, - InternVLChatModel, InternVLDummyInputsBuilder, - InternVLMultiModalProcessor, build_transform, + InternVLChatModel, build_transform, find_closest_aspect_ratio, get_internvl_target_ratios) @@ -430,8 +431,8 @@ def get_num_image_tokens( ) -class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo] - ): +class H2OVLMultiModalProcessor( + BaseInternVLMultiModalProcessor[H2OVLProcessingInfo]): def _get_prompt_updates( self, @@ -514,7 +515,7 @@ def _cached_apply_hf_processor( @MULTIMODAL_REGISTRY.register_processor( H2OVLMultiModalProcessor, info=H2OVLProcessingInfo, - dummy_inputs=InternVLDummyInputsBuilder) + dummy_inputs=BaseInternVLDummyInputsBuilder) class H2OVLChatModel(InternVLChatModel): def _init_vision_model( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index f68513553846..4612fc438741 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -8,8 +8,9 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, TypeVar, Union +from typing import Any, Literal, Optional, TypedDict, TypeVar, Union +import numpy.typing as npt import torch import torch.nn as nn import torchvision.transforms as T @@ -74,6 +75,33 @@ class InternVLImageEmbeddingInputs(TypedDict): InternVLImageEmbeddingInputs] +class InternVLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_flat: torch.Tensor + """ + Shape: + `(batch_size * num_video * num_frames, num_channels, height, width)` + """ + + num_patches: torch.Tensor + """Shape: `(batch_size * num_images)`""" + + +class InternVLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + data: Union[torch.Tensor, list[torch.Tensor]] + """ + A tensor of shape `(num_videos, total_video_feature_size, hidden_size)` + or a list of tensors of shape `(total_video_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +InternVLVideoInputs = Union[InternVLVideoPixelInputs, + InternVLVideoEmbeddingInputs] + + # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD @@ -231,6 +259,33 @@ def image_to_pixel_values_internvl( return pixel_values +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def video_to_pixel_values_internvl( + video: npt.NDArray, + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_internvl_target_ratios(min_num, max_num) + + transform = build_transform(input_size=input_size) + frames_list = list[Image.Image]() + for frame in video: + pil_frame = dynamic_preprocess_internvl( + Image.fromarray(frame, mode="RGB"), + target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + assert len(pil_frame) == 1 + frames_list.extend(pil_frame) + + pixel_values = torch.stack([transform(image) for image in frames_list]) + return pixel_values + + class BaseInternVLProcessor(ABC): """ This model doesn't define its own HF processor, @@ -375,24 +430,14 @@ def _images_to_pixel_values_lst( ) for image in images ] - def __call__( + def _preprocess_image( self, - text: Optional[Union[str, list[str]]] = None, - images: Optional[Union[Image.Image, list[Image.Image]]] = None, + text: list[str], + images: list[Image.Image], min_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> Mapping[str, NestedTensors]: - if text is None: - text = [] - if not isinstance(text, list): - text = [text] - if images is None: - images = [] - if not isinstance(images, list): - images = [images] - + ) -> tuple[list[str], dict[str, torch.Tensor]]: if len(images) == 0: image_inputs = {} else: @@ -415,6 +460,34 @@ def __call__( image_repl = self.get_image_repl(feature_size, num_patches) text = [t.replace('<image>', image_repl.full, 1) for t in text] + return text, image_inputs + + def _make_batch_input(self, + input_item: Optional[Union[Any, list[Any]]] = None): + if input_item is None: + input_item = [] + if not isinstance(input_item, list): + input_item = [input_item] + return input_item + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> Mapping[str, NestedTensors]: + text, images = [self._make_batch_input(x) for x in (text, images)] + + text, image_inputs = self._preprocess_image( + text=text, + images=images, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) text_inputs = self.tokenizer(text) @@ -425,11 +498,133 @@ def __call__( class InternVLProcessor(BaseInternVLProcessor): + """ + HF Processor for InternVLChatModel with extended video processing logic. 
+ + Code for video processing is adapted from video example: + https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers + """ + + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + video_token: Optional[str] = None, + ) -> None: + super().__init__( + config=config, + tokenizer=tokenizer, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + # add extra video token for video processing + self.video_token = video_token @property def image_token_id(self) -> int: return self.tokenizer.get_vocab()[IMG_CONTEXT] + @property + def video_token_id(self) -> Optional[int]: + if self.video_token is None: + return None + return self.tokenizer.get_vocab().get(self.video_token, None) + + @property + def supports_video(self) -> bool: + return self.video_token_id is not None + + def _videos_to_pixel_values_lst( + self, + videos: list[npt.NDArray], + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=1, + max_dynamic_patch=1, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + video_to_pixel_values_internvl( + video, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=False, + ) for video in videos + ] + + def _preprocess_video( + self, + text: list[str], + videos: list[npt.NDArray], + dynamic_image_size: Optional[bool] = None, + ): + if len(videos) == 0 or not self.supports_video: + video_inputs = {} + else: + pixel_values_lst_video = self._videos_to_pixel_values_lst( + videos, + dynamic_image_size=dynamic_image_size, + ) + video_inputs: dict[str, NestedTensors] = { + "pixel_values_flat_video": + torch.cat(pixel_values_lst_video), + "video_num_patches": + torch.tensor([len(item) for item in pixel_values_lst_video]), + } + + for pixel_values in pixel_values_lst_video: + num_patches = pixel_values.shape[0] + + video_repl = self.get_video_repl(self.num_image_token, + num_patches, self.video_token) + text = [t.replace('<video>', video_repl.full, 1) for t in text] + return text, video_inputs + + def __call__( + self, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, + videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None, + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> Mapping[str, NestedTensors]: + text, images, videos = [ + self._make_batch_input(x) for x in (text, images, videos) + ] + + text, image_inputs = self._preprocess_image( + text=text, + images=images, + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + text, video_inputs = self._preprocess_video( + text=text, + videos=videos, + dynamic_image_size=dynamic_image_size, + ) + + text_inputs = self.tokenizer(text) + + return { + **BatchEncoding(text_inputs, tensor_type=return_tensors), + **image_inputs, + **video_inputs, + } + def get_image_repl( self, feature_size: int, @@ -440,8 +635,24 @@ def get_image_repl( return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) + def get_video_repl( + self, + feature_size: int, + 
num_patches: Optional[int] = None, + video_context_token: str = IMG_CONTEXT, + ) -> PromptUpdateDetails[str]: + repl_features = video_context_token * self.num_image_token + repl_features_with_sep = IMG_START + repl_features + IMG_END + # num_patches is equal to num_frames + repl_full = ''.join([ + f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches) + ]) + + return PromptUpdateDetails.select_text(repl_full, video_context_token) + class BaseInternVLProcessingInfo(BaseProcessingInfo): + """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod def get_hf_processor( @@ -497,11 +708,22 @@ def get_image_size_with_most_features(self) -> ImageSize: return largest_feature_pinpoint + def get_max_image_tokens(self) -> int: + processor = self.get_hf_processor() + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=processor, + ) + _I = TypeVar("_I", bound=BaseInternVLProcessingInfo) -class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): +class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + """Basic image-only DummyInputsBuilder for InternVL-style models.""" def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -525,7 +747,8 @@ def get_dummy_mm_data( } -class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): +class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): + """ Basic image-only MultiModalProcessor for InternVL-style models.""" def _call_hf_processor( self, @@ -614,6 +837,38 @@ def get_replacement_internvl(item_idx: int): class InternVLProcessingInfo(BaseInternVLProcessingInfo): + """InternVL ProcessingInfo extended for video processing""" + + @property + def supports_video(self): + return self.get_hf_processor().supports_video + + def get_supported_mm_limits(self): + video_limit = {"video": None} if self.supports_video else {} + return {**super().get_supported_mm_limits(), **video_limit} + + def get_video_token(self) -> Optional[str]: + text_model_type = self.get_hf_config().get_text_config().model_type + if text_model_type == "qwen2": + return "<|video_pad|>" + return None + + def get_num_frames_with_most_features( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + max_images = mm_counts.get("image", 0) + max_videos = mm_counts.get("video", 0) + + processor = self.get_hf_processor() + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = (seq_len - + max_image_tokens) // processor.num_image_token + max_frames_per_video = max_total_frames // max(max_videos, 1) + + return max(max_frames_per_video, 1) def get_hf_processor( self, @@ -630,6 +885,8 @@ def get_hf_processor( if dynamic_image_size is not None: kwargs["dynamic_image_size"] = dynamic_image_size + kwargs["video_token"] = self.get_video_token() + return self.ctx.init_processor( InternVLProcessor, config=self.get_hf_config(), @@ -638,6 +895,121 @@ def get_hf_processor( ) +class InternVLDummyInputsBuilder( + BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]): + """InternVL DummyInputsBuilder extended for video support""" + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_videos = mm_counts.get("video", 0) + + return super().get_dummy_text(mm_counts) + "<video>" * num_videos + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + dummy_image = 
super().get_dummy_mm_data(seq_len=seq_len, + mm_counts=mm_counts) + if self.info.supports_video: + config = self.info.get_hf_config() + image_size: int = config.vision_config.image_size + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len, mm_counts) + num_videos = mm_counts.get("video", 0) + dummy_video = { + "video": + self._get_dummy_videos(width=image_size, + height=image_size, + num_frames=target_num_frames, + num_videos=num_videos) + } + else: + dummy_video = {} + return {**dummy_image, **dummy_video} + + +class InternVLMultiModalProcessor( + BaseInternVLMultiModalProcessor[InternVLProcessingInfo]): + """InternVL MultiModalProcessor extended for video support""" + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> Mapping[str, NestedTensors]: + processed_outputs = super()._call_hf_processor(prompt, mm_data, + mm_kwargs) + + hf_processor = self.info.get_hf_processor(**mm_kwargs) + if self.info.supports_video and ( + video_token_id := hf_processor.video_token_id) is not None: + processed_outputs["video_token_id"] = torch.tensor(video_token_id) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: Mapping[str, NestedTensors], + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_fields = super()._get_mm_fields_config(hf_inputs, + hf_processor_mm_kwargs) + if self.info.supports_video: + video_num_patches = hf_inputs.get("video_num_patches", + torch.empty(0)) + num_videos = len(video_num_patches) + video_fields = dict( + pixel_values_flat_video=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_patches), + video_num_patches=MultiModalFieldConfig.batched("video"), + video_token_id=MultiModalFieldConfig.shared( + "video", num_videos), + ) + else: + video_fields = {} + + return image_fields | video_fields + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + prompt_repl: list[PromptUpdate] = super()._get_prompt_updates( + mm_items, hf_processor_mm_kwargs, out_mm_kwargs) + + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "video_num_patches" in out_mm_kwargs: + video_num_patches = out_mm_kwargs["video_num_patches"] + assert isinstance(video_num_patches, torch.Tensor) + video_num_patches = video_num_patches.tolist() + else: + video_num_patches = [] + + def get_video_replacement_internvl(item_idx: int): + feature_size = hf_processor.num_image_token + num_patches = video_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return hf_processor.get_video_repl( + feature_size, + num_patches, + video_context_token=hf_processor.video_token) + + if self.info.supports_video: + prompt_repl.append( + PromptReplacement( + modality="video", + target="<video>", + replacement=get_video_replacement_internvl, + )) + return prompt_repl + + @MULTIMODAL_REGISTRY.register_processor( InternVLMultiModalProcessor, info=InternVLProcessingInfo, @@ -681,6 +1053,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.mlp1 = self._init_mlp1(config) self.img_context_token_id = None + self.video_context_token_id = None + self.visual_token_mask = None self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -825,10 +1199,55 @@ def _parse_and_validate_image_input( raise 
AssertionError("This line should be unreachable.") + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[InternVLVideoPixelInputs]: + pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None) + video_num_patches = kwargs.pop("video_num_patches", None) + video_embeds = kwargs.pop("image_embeds", None) + + if pixel_values_flat_video is None and video_embeds is None: + return None + + if video_embeds is not None: + if not isinstance(video_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") + + return InternVLImageEmbeddingInputs( + type="video_embeds", + data=flatten_bn(video_embeds), + ) + + video_token_id = kwargs["video_token_id"] + assert isinstance(video_token_id, torch.Tensor) + self.video_context_token_id = video_token_id.flatten().unique().item() + + if pixel_values_flat_video is not None: + if not isinstance(pixel_values_flat_video, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values_flat_video)}") + + if not isinstance(video_num_patches, (torch.Tensor, list)): + raise ValueError("Incorrect type of image_num_patches. " + f"Got type: {type(video_num_patches)}") + + pixel_values_flat_video = flatten_bn(pixel_values_flat_video, + concat=True) + video_num_patches = flatten_bn(video_num_patches, concat=True) + + return InternVLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_flat=self._validate_pixel_values( + pixel_values_flat_video), + num_patches=video_num_patches, + ) + + raise AssertionError("This line should be unreachable.") + def _process_image_input( self, - image_input: InternVLImageInputs, - ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]: + image_input: Union[InternVLImageInputs, InternVLVideoPixelInputs], + ) -> tuple[torch.Tensor, ...]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -840,8 +1259,8 @@ def _process_image_input( # Only one image in the current batch if len(num_patches) == 1: - return image_embeds.view( - -1, self.config.text_config.hidden_size).unsqueeze(0) + return (image_embeds.view(-1, + self.config.text_config.hidden_size), ) # NOTE: Image embeddings are split into separate tensors for each image # by the size of each embedding. @@ -853,8 +1272,26 @@ def _process_image_input( ] return image_embeds.split(image_feature_sizes) + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. 
+ for input_key in kwargs: + if input_key in ("pixel_values_flat", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_flat_video", + ) and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: if self.is_mono: + assert self.img_context_token_id is not None self.visual_token_mask = ( input_ids == self.img_context_token_id).reshape(-1, 1) else: @@ -865,11 +1302,28 @@ def get_language_model(self) -> torch.nn.Module: def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: return None - return self._process_image_input(image_input) + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_image_input(video_input) + multimodal_embeddings += video_embeddings + + return multimodal_embeddings def get_input_embeddings( self, @@ -878,13 +1332,18 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - assert self.img_context_token_id is not None + context_token_ids = [ + token_id for token_id in (self.img_context_token_id, + self.video_context_token_id) + if token_id is not None + ] + assert len(context_token_ids) >= 1 self._set_visual_token_mask(input_ids) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - self.img_context_token_id, + context_token_ids, ) return inputs_embeds diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 62a7deab6a10..172434e66ae2 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -22,9 +22,10 @@ PromptUpdateDetails) from .intern_vit import InternVisionModel -from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor, - InternVLChatModel, InternVLDummyInputsBuilder, - InternVLMultiModalProcessor) +from .internvl import (BaseInternVLDummyInputsBuilder, + BaseInternVLMultiModalProcessor, + BaseInternVLProcessingInfo, BaseInternVLProcessor, + InternVLChatModel) IMG_PAD = "<|vision_pad|>" @@ -84,7 +85,8 @@ def get_hf_processor( ) -class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]): +class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo] + ): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -110,7 +112,8 @@ def get_dummy_mm_data( } -class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]): +class NVLMMultiModalProcessor( + BaseInternVLMultiModalProcessor[NVLMProcessingInfo]): def _get_prompt_updates( self, From 
63934543a0f05edfc6a5f2afa235b5e026b27b71 Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Sun, 25 May 2025 01:02:59 -0400 Subject: [PATCH 142/192] Speed up the `kernels/quantization/` tests (#18669) Signed-off-by: mgoin <mgoin64@gmail.com> --- tests/kernels/quantization/test_block_fp8.py | 14 +++++------ tests/kernels/quantization/test_gguf.py | 4 ++-- .../quantization/test_triton_scaled_mm.py | 24 +++++++------------ 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index ef1d7e47ef81..ae05d61173f3 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -36,16 +36,16 @@ # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] -NUM_TOKENS = [7, 83, 2048] +NUM_TOKENS = [7, 2050] D = [512, 4096, 5120, 13824] -GROUP_SIZE = [64, 128, 256, 512] -M = [1, 7, 8, 83, 84, 512, 2048, 4096] -N = [128, 512, 1024, 4096, 7168, 7748, 13824] -K = [256, 4096, 5120, 3884, 13824, 16384] +GROUP_SIZE = [64, 128, 512] +M = [1, 7, 8, 83, 84, 4096] +N = [128, 512, 7168, 7748, 13824] +K = [256, 3884, 4096, 13824, 16384] # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 # and its hidden size is 7168. -M_moe = [1, 2, 7, 83, 128, 512, 2048] -M_moe_dg = [128, 192, 512, 1335, 2048] +M_moe = [1, 2, 7, 83, 128, 2048] +M_moe_dg = [128, 192, 1335, 2048] N_moe = [128, 256, 1024, 4608] # [13824] K_moe = [256, 512, 7168] # [13824] BLOCK_SIZE = [[128, 128]] diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index 6cf88604ec65..e520e99b071c 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -35,11 +35,11 @@ def get_gguf_MoE_tensors( return GGUFReader(sample_file).tensors -DTYPES = [torch.half, torch.bfloat16, torch.float32] +DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] # Hidden_size for testing, must match the sample file in HF repo, # we have `hidden_size = 256, 1024` for test in HF repo currently. HIDDEN_SIZES = [256, 1024] -NUM_TOKENS = [7, 83, 128, 2048] # Arbitrary values for testing +NUM_TOKENS = [7, 2050] # Arbitrary values for testing SEEDS = [0] QUANT_TYPES = [ # i-matrix diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 45f10b0eb1d5..30e6eeb8d566 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -13,8 +13,13 @@ device = "cuda" +triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." + "triton_scaled_mm") +triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm -def scaled_mm_torch(a: torch.Tensor, + +def torch_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, @@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, if use_bias: bias = torch.rand((N, ), device=device, dtype=out_dtype) - triton_scaled_mm_module = importlib.import_module( - "vllm.model_executor.layers.quantization.compressed_tensors." 
- "triton_scaled_mm") - triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm - c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) - a_cpu = a.cpu() - b_cpu = b.cpu() - scale_a_cpu = scale_a.cpu() - scale_b_cpu = scale_b.cpu() - bias_cpu = None if bias is None else bias.cpu() - - c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu, - out_dtype, bias_cpu) + c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) - c_check_cpu = c_check.cpu() - torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1) + torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1) From 44073a7ac338c511dff0ba6cbec0d932f51ac364 Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Sun, 25 May 2025 13:34:24 +0800 Subject: [PATCH 143/192] [BUGFIX] catch subclass first for try...except (#18672) Signed-off-by: Andy Xie <andy.xning@gmail.com> --- vllm/model_executor/model_loader/weight_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 1547a016ab88..f61956f4e8e0 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -319,6 +319,7 @@ def download_safetensors_index_file_from_hf( Args: model_name_or_path (str): The model name or path. + index_file (str): The safetensors index file name cache_dir (Optional[str]): The cache directory to store the model weights. If None, will use HF defaults. revision (Optional[str]): The revision of the model. @@ -337,10 +338,10 @@ def download_safetensors_index_file_from_hf( ) # If file not found on remote or locally, we should not fail since # only some models will have index_file. - except huggingface_hub.utils.EntryNotFoundError: - logger.info("No %s found in remote.", index_file) except huggingface_hub.utils.LocalEntryNotFoundError: logger.info("No %s found in local cache.", index_file) + except huggingface_hub.utils.EntryNotFoundError: + logger.info("No %s found in remote.", index_file) # For models like Mistral-7B-v0.3, there are both sharded From 503f8487c295343ac13124bc2483cee4e2aabf02 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sun, 25 May 2025 14:03:53 +0800 Subject: [PATCH 144/192] [Misc] Reduce logs on startup (#18649) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .../model_executor/layers/quantization/fp8.py | 5 ++--- vllm/platforms/__init__.py | 5 +---- vllm/plugins/__init__.py | 22 +++++++++++-------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 652bf76673c5..c2aca842c8b3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -62,10 +62,9 @@ def __init__( weight_block_size: Optional[list[int]] = None, ) -> None: super().__init__() + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized - if is_checkpoint_fp8_serialized: - logger.warning("Detected fp8 checkpoint. 
Please note that the " - "format is experimental and subject to change.") + if activation_scheme not in ACTIVATION_SCHEMES: raise ValueError( f"Unsupported activation scheme {activation_scheme}") diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 49e502d2626c..00d00d05f47a 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -217,11 +217,8 @@ def resolve_current_platform_cls_qualname() -> str: platform_cls_qualname = func() if platform_cls_qualname is not None: activated_plugins.append(name) - logger.info("Platform plugin %s loaded.", name) - logger.warning( - "Platform plugin %s function's return value is None", name) except Exception: - logger.exception("Failed to load platform plugin %s", name) + pass activated_builtin_plugins = list( set(activated_plugins) & set(builtin_platform_plugins.keys())) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index d72ab2bd088c..2884cb46fecd 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -2,7 +2,7 @@ import logging import os -from typing import Callable +from typing import Any, Callable import torch @@ -14,7 +14,7 @@ plugins_loaded = False -def load_plugins_by_group(group: str) -> dict[str, Callable]: +def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]: import sys if sys.version_info < (3, 10): from importlib_metadata import entry_points @@ -27,23 +27,27 @@ def load_plugins_by_group(group: str) -> dict[str, Callable]: if len(discovered_plugins) == 0: logger.debug("No plugins for group %s found.", group) return {} + logger.info("Available plugins for group %s:", group) for plugin in discovered_plugins: - logger.info("name=%s, value=%s", plugin.name, plugin.value) + logger.info("- %s -> %s", plugin.name, plugin.value) + if allowed_plugins is None: - logger.info("all available plugins for group %s will be loaded.", - group) - logger.info("set environment variable VLLM_PLUGINS to control" - " which plugins to load.") - plugins = {} + logger.info("All plugins in this group will be loaded. " + "Set `VLLM_PLUGINS` to control which plugins to load.") + + plugins = dict[str, Callable[[], Any]]() for plugin in discovered_plugins: if allowed_plugins is None or plugin.name in allowed_plugins: + if allowed_plugins is not None: + logger.info("Loading plugin %s", plugin.name) + try: func = plugin.load() plugins[plugin.name] = func - logger.info("plugin %s loaded.", plugin.name) except Exception: logger.exception("Failed to load plugin %s", plugin.name) + return plugins From 624b77a2b363b397bee85ed6b19be96155b7bae5 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 25 May 2025 16:36:33 +0800 Subject: [PATCH 145/192] [doc] fix broken links (#18671) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3dd214e9b451..67f6b957ec55 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ## Contributing We welcome and value any contributions and collaborations. -Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing) for how to get involved. +Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved. 
## Sponsors From 279f854519e0df296de72b97bf9e5e89c76a4359 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 25 May 2025 16:40:31 +0800 Subject: [PATCH 146/192] [doc] improve readability (#18675) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- docs/contributing/dockerfile/dockerfile.md | 7 +++- docs/contributing/model/registration.md | 5 ++- docs/deployment/docker.md | 15 ++++---- docs/deployment/frameworks/skypilot.md | 14 ++++++-- docs/deployment/frameworks/streamlit.md | 3 +- docs/deployment/nginx.md | 35 ++++++++++++++++--- docs/features/quantization/auto_awq.md | 4 ++- docs/features/quantization/bitblas.md | 15 ++++++-- docs/features/quantization/bnb.md | 14 ++++++-- docs/features/quantization/gguf.md | 11 ++++-- docs/features/quantization/gptqmodel.md | 3 +- docs/features/quantization/torchao.md | 11 ++++-- docs/features/reasoning_outputs.md | 3 +- docs/features/spec_decode.md | 9 +++-- .../ai_accelerator/hpu-gaudi.inc.md | 19 ++++++++-- .../installation/ai_accelerator/neuron.inc.md | 12 +++++-- .../installation/gpu/cuda.inc.md | 18 +++++++--- .../installation/gpu/rocm.inc.md | 27 +++++++++----- .../models/extensions/runai_model_streamer.md | 28 +++++++++++---- docs/serving/openai_compatible_server.md | 12 +++++-- 20 files changed, 206 insertions(+), 59 deletions(-) diff --git a/docs/contributing/dockerfile/dockerfile.md b/docs/contributing/dockerfile/dockerfile.md index 3765996cb03f..a39f335c87b8 100644 --- a/docs/contributing/dockerfile/dockerfile.md +++ b/docs/contributing/dockerfile/dockerfile.md @@ -26,7 +26,12 @@ The edges of the build graph represent: > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): > > ```bash - > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile + > dockerfilegraph \ + > -o png \ + > --legend \ + > --dpi 200 \ + > --max-label-length 50 \ + > --filename docker/Dockerfile > ``` > > or in case you want to run it directly with the docker image: diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index e796e49a7501..2f829889277c 100644 --- a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -41,7 +41,10 @@ If your model imports modules that initialize CUDA, consider lazy-importing it t ```python from vllm import ModelRegistry -ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") +ModelRegistry.register_model( + "YourModelForCausalLM", + "your_code:YourModelForCausalLM" +) ``` !!! warning diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 293536e52c4b..516640f6fd3c 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -11,7 +11,7 @@ vLLM offers an official Docker image for deployment. The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). ```console -$ docker run --runtime nvidia --gpus all \ +docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ -p 8000:8000 \ @@ -23,7 +23,7 @@ $ docker run --runtime nvidia --gpus all \ This image can also be used with other container engines such as [Podman](https://podman.io/). 
```console -$ podman run --gpus all \ +podman run --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ -p 8000:8000 \ @@ -73,7 +73,10 @@ You can build and run vLLM from source via the provided <gh-file:docker/Dockerfi ```console # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile +DOCKER_BUILDKIT=1 docker build . \ + --target vllm-openai \ + --tag vllm/vllm-openai \ + --file docker/Dockerfile ``` !!! note @@ -96,8 +99,8 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- ```console # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) -$ python3 use_existing_torch.py -$ DOCKER_BUILDKIT=1 docker build . \ +python3 use_existing_torch.py +DOCKER_BUILDKIT=1 docker build . \ --file docker/Dockerfile \ --target vllm-openai \ --platform "linux/arm64" \ @@ -113,7 +116,7 @@ $ DOCKER_BUILDKIT=1 docker build . \ To run vLLM with the custom-built Docker image: ```console -$ docker run --runtime nvidia --gpus all \ +docker run --runtime nvidia --gpus all \ -v ~/.cache/huggingface:/root/.cache/huggingface \ -p 8000:8000 \ --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index 1844a50c5604..9763745f2378 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -82,7 +82,11 @@ Check the output of the command. There will be a shareable gradio link (like the **Optional**: Serve the 70B model instead of the default 8B and use more GPU: ```console -HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct +HF_TOKEN="your-huggingface-token" \ + sky launch serving.yaml \ + --gpus A100:8 \ + --env HF_TOKEN \ + --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct ``` ## Scale up to multiple replicas @@ -155,7 +159,9 @@ run: | Start the serving the Llama-3 8B model on multiple replicas: ```console -HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN +HF_TOKEN="your-huggingface-token" \ + sky serve up -n vllm serving.yaml \ + --env HF_TOKEN ``` Wait until the service is ready: @@ -318,7 +324,9 @@ run: | 1. Start the chat web UI: ```console - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + sky launch \ + -c gui ./gui.yaml \ + --env ENDPOINT=$(sky serve status --endpoint vllm) ``` 2. 
Then, we can access the GUI at the returned gradio link: diff --git a/docs/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md index 8956d1ddc7d8..33ed8c5f5b54 100644 --- a/docs/deployment/frameworks/streamlit.md +++ b/docs/deployment/frameworks/streamlit.md @@ -33,7 +33,8 @@ pip install streamlit openai streamlit run streamlit_openai_chatbot_webserver.py # or specify the VLLM_API_BASE or VLLM_API_KEY -VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run streamlit_openai_chatbot_webserver.py +VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \ + streamlit run streamlit_openai_chatbot_webserver.py # start with debug mode to view more details streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md index 9d1f74475781..80242919ba5b 100644 --- a/docs/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -77,7 +77,11 @@ If you are behind proxy, you can pass the proxy settings to the docker build com ```console cd $vllm_root -docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy +docker build \ + -f docker/Dockerfile . \ + --tag vllm \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy ``` [](){ #nginxloadbalancer-nginx-docker-network } @@ -102,8 +106,26 @@ Notes: ```console mkdir -p ~/.cache/huggingface/hub/ hf_cache_dir=~/.cache/huggingface/ -docker run -itd --ipc host --network vllm_nginx --gpus device=0 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf -docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf +docker run \ + -itd \ + --ipc host \ + --network vllm_nginx \ + --gpus device=0 \ + --shm-size=10.24gb \ + -v $hf_cache_dir:/root/.cache/huggingface/ \ + -p 8081:8000 \ + --name vllm0 vllm \ + --model meta-llama/Llama-2-7b-chat-hf +docker run \ + -itd \ + --ipc host \ + --network vllm_nginx \ + --gpus device=1 \ + --shm-size=10.24gb \ + -v $hf_cache_dir:/root/.cache/huggingface/ \ + -p 8082:8000 \ + --name vllm1 vllm \ + --model meta-llama/Llama-2-7b-chat-hf ``` !!! 
note @@ -114,7 +136,12 @@ docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24 ## Launch Nginx ```console -docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest +docker run \ + -itd \ + -p 8000:80 \ + --network vllm_nginx \ + -v ./nginx_conf/:/etc/nginx/conf.d/ \ + --name nginx-lb nginx-lb:latest ``` [](){ #nginxloadbalancer-nginx-verify-nginx } diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index 5879b3126fa6..4366a080f52c 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -42,7 +42,9 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +python examples/offline_inference/llm_engine_example.py \ + --model TheBloke/Llama-2-7b-Chat-AWQ \ + --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 8e9cf67a7a69..9001725d9c02 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -33,7 +33,12 @@ import torch # "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint. model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas") +llm = LLM( + model=model_id, + dtype=torch.bfloat16, + trust_remote_code=True, + quantization="bitblas" +) ``` ## Read gptq format checkpoint @@ -44,5 +49,11 @@ import torch # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. model_id = "hxbgsyxh/llama-13b-4bit-g-1" -llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024) +llm = LLM( + model=model_id, + dtype=torch.float16, + trust_remote_code=True, + quantization="bitblas", + max_model_len=1024 +) ``` diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 990ac34eb2fd..710becd7f92d 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -27,7 +27,11 @@ from vllm import LLM import torch # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. 
model_id = "unsloth/tinyllama-bnb-4bit" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True) +llm = LLM( + model=model_id, + dtype=torch.bfloat16, + trust_remote_code=True +) ``` ## Inflight quantization: load as 4bit quantization @@ -38,8 +42,12 @@ For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify from vllm import LLM import torch model_id = "huggyllama/llama-7b" -llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ -quantization="bitsandbytes") +llm = LLM( + model=model_id, + dtype=torch.bfloat16, + trust_remote_code=True, + quantization="bitsandbytes" +) ``` ## OpenAI Compatible Server diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 04ab5945e8f6..72f758f653a8 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -14,14 +14,17 @@ To run a GGUF model with vLLM, you can download and use the local GGUF model fro ```console wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 ``` You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: ```console # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --tensor-parallel-size 2 ``` !!! warning @@ -31,7 +34,9 @@ GGUF assumes that huggingface can convert the metadata to a config file. 
In case ```console # If you model is not supported by huggingface you can manually provide a huggingface compatible config path -vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0 +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ + --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0 ``` You can also use the GGUF model directly through the LLM entrypoint: diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index 10660a408fd2..53e938d2cbd7 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -59,7 +59,8 @@ model.save(quant_path) To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: ```console -python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 +python examples/offline_inference/llm_engine_example.py \ + --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 ``` ## Using GPTQModel with vLLM's Python API diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md index 82100c6ddcac..a7a517af85aa 100644 --- a/docs/features/quantization/torchao.md +++ b/docs/features/quantization/torchao.md @@ -7,7 +7,9 @@ We recommend installing the latest torchao nightly with ```console # Install the latest TorchAO nightly build # Choose the CUDA version that matches your system (cu126, cu128, etc.) -pip install --pre torchao>=10.0.0 --index-url https://download.pytorch.org/whl/nightly/cu126 +pip install \ + --pre torchao>=10.0.0 \ + --index-url https://download.pytorch.org/whl/nightly/cu126 ``` ## Quantizing HuggingFace Models @@ -20,7 +22,12 @@ from torchao.quantization import Int8WeightOnlyConfig model_name = "meta-llama/Meta-Llama-3-8B" quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) -quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) +quantized_model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config +) tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 85464269efac..cbcb246912f4 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -27,7 +27,8 @@ vLLM currently supports the following reasoning models: To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. ```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --reasoning-parser deepseek_r1 ``` Next, make a request to the model that should return the reasoning content in the response. 
diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index ee871823b078..5080960f72dd 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -45,8 +45,13 @@ for output in outputs: To perform the same with an online mode launch the server: ```bash -python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ - --seed 42 -tp 1 --gpu_memory_utilization 0.8 \ +python -m vllm.entrypoints.openai.api_server \ + --host 0.0.0.0 \ + --port 8000 \ + --model facebook/opt-6.7b \ + --seed 42 \ + -tp 1 \ + --gpu_memory_utilization 0.8 \ --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}' ``` diff --git a/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 1ca8a9216a4e..00935a37417e 100644 --- a/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -45,7 +45,15 @@ Use the following commands to run a Docker image: ```console docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker run \ + -it \ + --runtime=habana \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + --cap-add=sys_nice \ + --net=host \ + --ipc=host \ + vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` # --8<-- [end:requirements] @@ -91,7 +99,14 @@ Currently, there are no pre-built Intel Gaudi images. ```console docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . -docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +docker run \ + -it \ + --runtime=habana \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + --cap-add=sys_nice \ + --net=host \ + --rm vllm-hpu-env ``` !!! 
tip diff --git a/docs/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/ai_accelerator/neuron.inc.md index 671afa8d8900..f08c78fba6c8 100644 --- a/docs/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/getting_started/installation/ai_accelerator/neuron.inc.md @@ -38,7 +38,8 @@ The installation of drivers and tools wouldn't be necessary, if [Deep Learning A sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main EOF -wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - +wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \ + | sudo apt-key add - # Update OS packages sudo apt-get update -y @@ -96,12 +97,17 @@ source aws_neuron_venv_pytorch/bin/activate # Install Jupyter notebook kernel pip install ipykernel -python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" +python3.10 -m ipykernel install \ + --user \ + --name aws_neuron_venv_pytorch \ + --display-name "Python (torch-neuronx)" pip install jupyter notebook pip install environment_kernels # Set pip repository pointing to the Neuron repository -python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com +python -m pip config set \ + global.extra-index-url \ + https://pip.repos.neuron.amazonaws.com # Install wget, awscli python -m pip install wget diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index a76ef1ccf32e..64dccef63d73 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -55,7 +55,9 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ##### Install the latest code using `pip` ```console -pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install -U vllm \ + --pre \ + --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -63,7 +65,9 @@ pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly Another way to install the latest code is to use `uv`: ```console -uv pip install -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly +uv pip install -U vllm \ + --torch-backend=auto \ + --extra-index-url https://wheels.vllm.ai/nightly ``` ##### Install specific revisions using `pip` @@ -83,7 +87,9 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi ```console export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -uv pip install vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +uv pip install vllm \ + --torch-backend=auto \ + --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. 
In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. @@ -192,7 +198,11 @@ Additionally, if you have trouble building vLLM, we recommend using the NVIDIA P ```console # Use `--ipc=host` to make sure the shared memory is large enough. -docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +docker run \ + --gpus all \ + -it \ + --rm \ + --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 85d539b75669..0029b3a24496 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -91,19 +91,22 @@ Currently, there are no pre-built ROCm wheels. 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: ```bash - $ pip install --upgrade pip + pip install --upgrade pip # Build & install AMD SMI - $ pip install /opt/rocm/share/amd_smi + pip install /opt/rocm/share/amd_smi # Install dependencies - $ pip install --upgrade numba scipy huggingface-hub[cli,hf_transfer] setuptools_scm - $ pip install "numpy<2" - $ pip install -r requirements/rocm.txt + pip install --upgrade numba \ + scipy \ + huggingface-hub[cli,hf_transfer] \ + setuptools_scm + pip install "numpy<2" + pip install -r requirements/rocm.txt # Build vLLM for MI210/MI250/MI300. - $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - $ python3 setup.py develop + export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + python3 setup.py develop ``` This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. @@ -154,7 +157,9 @@ It is important that the user kicks off the docker build using buildkit. Either To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console -DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base . +DOCKER_BUILDKIT=1 docker build \ + -f docker/Dockerfile.rocm_base \ + -t rocm/vllm-dev:base . ``` #### Build an image with vLLM @@ -189,7 +194,11 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build \ + --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \ + -f docker/Dockerfile.rocm \ + -t vllm-rocm \ + . 
``` To run the above docker image `vllm-rocm`, use the below command: diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index c80120fa98f2..6755b574ea67 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -16,19 +16,25 @@ pip3 install vllm[runai] To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +vllm serve s3://core-llm/Llama-3-8b \ + --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \ +AWS_EC2_METADATA_DISABLED=true \ +AWS_ENDPOINT_URL=https://storage.googleapis.com \ +vllm serve s3://core-llm/Llama-3-8b \ + --load-format runai_streamer ``` ## Tunable parameters @@ -39,14 +45,18 @@ You can tune `concurrency` that controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. ```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer \ + --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). ```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer \ + --model-loader-extra-config '{"memory_limit":5368709120}' ``` !!! note @@ -63,7 +73,9 @@ vllm serve /path/to/sharded/model --load-format runai_streamer_sharded The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`: ```console -vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' +vllm serve /path/to/sharded/model \ + --load-format runai_streamer_sharded \ + --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' ``` To create sharded model files, you can use the script provided in <gh-file:examples/offline_inference/save_sharded_state.py>. This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. 
@@ -71,7 +83,9 @@ To create sharded model files, you can use the script provided in <gh-file:examp
The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
```console
-vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'
+vllm serve /path/to/sharded/model \
+    --load-format runai_streamer_sharded \
+    --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'
```
!!! note
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 012bddf3d9c9..c2e39d029dd5 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -8,7 +8,9 @@ vLLM provides an HTTP server that implements OpenAI's [Completions API](https://
In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.)
```bash
-vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
+vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
+    --dtype auto \
+    --api-key token-abc123
```
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
@@ -243,7 +245,9 @@ and passing a list of `messages` in the request. Refer to the examples below for
```bash
vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
-    --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
+    --trust-remote-code \
+    --max-model-len 4096 \
+    --chat-template examples/template_vlm2vec.jinja
```
!!! warning
@@ -285,7 +289,9 @@ and passing a list of `messages` in the request. Refer to the examples below for
```bash
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
-    --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
+    --trust-remote-code \
+    --max-model-len 8192 \
+    --chat-template examples/template_dse_qwen2_vl.jinja
```
!!! warning
From f2faac745dc9b9d7d2fa92a9a4cfba6b230db2d4 Mon Sep 17 00:00:00 2001
From: Yuqi Zhang <zhangyuqi94@gmail.com>
Date: Sun, 25 May 2025 02:36:06 -0700
Subject: [PATCH 147/192] [Bugfix] Fix cpu usage and cache hit stats reporting on cpu environment (#18674)
Signed-off-by: zzzyq <zhangyuqi94@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 vllm/engine/llm_engine.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 2e5361c4891b..5ca3ebe91d12 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1650,6 +1650,20 @@ def _get_stats(self,
            gpu_prefix_cache_hit_rate = self.scheduler[
                0].get_prefix_cache_hit_rate(Device.GPU)
+
+        # Exchange the usage and cache hit stats between gpu and cpu when
+        # running on cpu because the cpu_worker.py intentionally reports the
+        # number of cpu blocks as gpu blocks in favor of cache management. 
+ if self.device_config.device_type == "cpu": + num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu + gpu_cache_usage_sys, cpu_cache_usage_sys = ( + cpu_cache_usage_sys, + gpu_cache_usage_sys, + ) + gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = ( + cpu_prefix_cache_hit_rate, + gpu_prefix_cache_hit_rate, + ) + # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 From 35be8fad62099199ab26fdb5e7c0001fd9f4d71c Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 25 May 2025 18:10:51 +0800 Subject: [PATCH 148/192] [CI/build] fix no regex (#18676) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- .github/workflows/cleanup_pr_body.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 3250b6671989..d5c6b8d43a6e 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -20,6 +20,11 @@ jobs: with: python-version: '3.12' + - name: Install Python dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install regex + - name: Update PR description env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 3a886bd58cecf3ce78c1eafed1f7d3d16800bccc Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 25 May 2025 21:05:38 +0800 Subject: [PATCH 149/192] [Misc] small improve (#18680) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- docs/features/quantization/bnb.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 710becd7f92d..a8dc2476f30a 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -15,7 +15,7 @@ pip install bitsandbytes>=0.45.3 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. -You can find bitsandbytes quantized models on <https://huggingface.co/models?search=bitsandbytes>. +You can find bitsandbytes quantized models on [Hugging Face](https://huggingface.co/models?search=bitsandbytes). And usually, these repositories have a config.json file that includes a quantization_config section. 
## Read quantized checkpoint From 57fd13a70729c73dd4abf97251fdafe28df328d0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Sun, 25 May 2025 22:05:30 +0800 Subject: [PATCH 150/192] [Bugfix] Fix profiling dummy data for Pixtral (#18677) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .../multimodal/processing/test_common.py | 261 +++++++----------- .../multimodal/processing/test_mllama.py | 2 +- tests/models/registry.py | 9 +- vllm/model_executor/models/pixtral.py | 36 ++- vllm/multimodal/profiling.py | 15 +- 5 files changed, 153 insertions(+), 170 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index a107eae6de5e..572fa366d332 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -9,15 +9,15 @@ UserMessage) from mistral_common.protocol.instruct.request import ChatCompletionRequest from PIL import Image -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.inputs import MultiModalInputs from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache -from vllm.transformers_utils.tokenizer import (MistralTokenizer, - cached_tokenizer_from_config) +from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, + cached_tokenizer_from_config, + encode_tokens) from ....multimodal.utils import random_audio, random_image, random_video from ...registry import HF_EXAMPLE_MODELS @@ -28,7 +28,6 @@ def _test_processing_correctness( hit_rate: float, num_batches: int, simplify_rate: float, - ignore_mm_keys: Optional[set[str]] = None, ): model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info.check_available_online(on_fail="skip") @@ -99,10 +98,23 @@ def _test_processing_correctness( } mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = dummy_inputs.get_dummy_processor_inputs( - model_config.max_model_len, - mm_counts, - ).prompt_text + + # Mistral chat outputs tokens directly, rather than text prompts + if isinstance(tokenizer, MistralTokenizer): + images = mm_data.get("image", []) + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=""), + *(ImageChunk(image=image) for image in images), + ]), + ]) + res = tokenizer.mistral.encode_chat_completion(request) + prompt = res.tokens + else: + prompt = dummy_inputs.get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt # Drop unnecessary keys and test single -> multi conversion if rng.rand() < simplify_rate: @@ -112,67 +124,59 @@ def _test_processing_correctness( elif len(mm_data[k]) == 1: mm_data[k] = mm_data[k][0] - if isinstance(tokenizer, MistralTokenizer): - _test_processing_correctness_mistral( - model_config, - tokenizer, - prompt, - mm_data, - baseline_processor, - cached_processor, - batch_idx, - ignore_mm_keys=ignore_mm_keys, - ) - else: - _test_processing_correctness_hf( - model_config, - tokenizer, - prompt, - mm_data, - baseline_processor, - cached_processor, - batch_idx, - ignore_mm_keys=ignore_mm_keys, - ) - - -def _test_processing_correctness_hf( + _test_processing_correctness_one( + model_config, + tokenizer, + prompt, + mm_data, + baseline_processor, + cached_processor, + batch_idx, + ) + + +# For some multimodal models, tokenizer will always add bos_token +# at 
the beginning of prompt by default, causing hf_processor outputs +# incorrect token ids. So we need use `add_special_tokens=False` here +# to leave bos_token to be added by the processor. +_ADD_SPECIAL_TOKENS_OVERRIDES = { + "mllama": False, + "ovis": False, + "ultravox": False, + "whisper": False, +} + +_IGNORE_MM_KEYS = { + # In Ultravox, the audio_features can be different depending on padding + # The slight difference should not be a problem though, since + # attention_mask lets us ignore the difference. + "ultravox": {"audio_features"}, +} + + +def _test_processing_correctness_one( model_config: ModelConfig, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - prompt: str, + tokenizer: AnyTokenizer, + prompt: Union[str, list[int]], mm_data: MultiModalDataDict, baseline_processor: BaseMultiModalProcessor, cached_processor: BaseMultiModalProcessor, batch_idx: int, - ignore_mm_keys: Optional[set[str]] = None, ): - if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox", - "whisper"): - # For some multimodal models, tokenizer will always add bos_token - # at the beginning of prompt by default, causing hf_processor outputs - # incorrect token ids. So we need use `add_special_tokens=False` here - # to leave bos_token to be added by the processor. - token_prompt = tokenizer.encode(prompt, add_special_tokens=False) + model_type = model_config.hf_config.model_type + ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]()) + + if isinstance(prompt, str): + text_prompt = prompt + token_prompt = encode_tokens( + tokenizer, + prompt, + add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type), + ) else: - token_prompt = tokenizer.encode(prompt) - - baseline_result = baseline_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - cached_result = cached_processor.apply( - prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - _assert_inputs_equal( - baseline_result, - cached_result, - ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", - ) + # Mistral does not support decode_tokens with skip_special_tokens=False + text_prompt = None + token_prompt = prompt baseline_tokenized_result = baseline_processor.apply( token_prompt, @@ -180,56 +184,6 @@ def _test_processing_correctness_hf( hf_processor_mm_kwargs={}, ) - _assert_inputs_equal( - baseline_result, - baseline_tokenized_result, - ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", - ) - - cached_tokenized_result = cached_processor.apply( - token_prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) - - _assert_inputs_equal( - cached_result, - cached_tokenized_result, - ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", - ) - - -def _test_processing_correctness_mistral( - model_config: ModelConfig, - tokenizer: MistralTokenizer, - prompt: str, - mm_data: MultiModalDataDict, - baseline_processor: BaseMultiModalProcessor, - cached_processor: BaseMultiModalProcessor, - batch_idx: int, - ignore_mm_keys: Optional[set[str]] = None, -): - images = mm_data.get("image", []) - if not isinstance(images, list): - images = [images] - - request = ChatCompletionRequest(messages=[ - UserMessage(content=[ - TextChunk(text=prompt), - *(ImageChunk(image=image) for image in images), - ]), - ]) - res = tokenizer.mistral.encode_chat_completion(request) - token_prompt = res.tokens - - # Mistral chat outputs tokens directly, rather than text prompts - baseline_tokenized_result = 
baseline_processor.apply( - token_prompt, - mm_data=mm_data, - hf_processor_mm_kwargs={}, - ) cached_tokenized_result = cached_processor.apply( token_prompt, mm_data=mm_data, @@ -240,9 +194,44 @@ def _test_processing_correctness_mistral( baseline_tokenized_result, cached_tokenized_result, ignore_mm_keys=ignore_mm_keys, - msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})", + msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})", ) + if text_prompt is not None: + baseline_text_result = baseline_processor.apply( + text_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_text_result = cached_processor.apply( + text_prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + _assert_inputs_equal( + baseline_text_result, + cached_text_result, + ignore_mm_keys=ignore_mm_keys, + msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})", + ) + + _assert_inputs_equal( + baseline_text_result, + baseline_tokenized_result, + ignore_mm_keys=ignore_mm_keys, + msg=f"Failed ({batch_idx=}, {text_prompt=}, " + f"{token_prompt=}, {mm_data=})", + ) + + _assert_inputs_equal( + cached_text_result, + cached_tokenized_result, + ignore_mm_keys=ignore_mm_keys, + msg=f"Failed ({batch_idx=}, {text_prompt=}, " + f"{token_prompt=}, {mm_data=})", + ) + # yapf: disable @pytest.mark.parametrize("model_id", [ @@ -281,6 +270,7 @@ def _test_processing_correctness_mistral( "AIDC-AI/Ovis2-1B", "google/paligemma-3b-mix-224", "google/paligemma2-3b-ft-docci-448", + "microsoft/Phi-3.5-vision-instruct", "microsoft/Phi-4-multimodal-instruct", "mistralai/Pixtral-12B-2409", "mistral-community/pixtral-12b", @@ -303,41 +293,6 @@ def test_processing_correctness( num_batches: int, simplify_rate: float, ): - ignore_mm_keys = None - if 'ultravox' in model_id: - # In Ultravox, the audio_features can be different depending on padding - # The slight difference should not be a problem though, since - # attention_mask lets us ignore the difference. 
- ignore_mm_keys = {"audio_features"} - - _test_processing_correctness( - model_id, - hit_rate=hit_rate, - num_batches=num_batches, - simplify_rate=simplify_rate, - ignore_mm_keys=ignore_mm_keys, - ) - - -# yapf: disable -@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) -@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) -@pytest.mark.parametrize("num_batches", [32]) -@pytest.mark.parametrize("simplify_rate", [1.0]) -# yapf: enable -def test_processing_correctness_phi3v( - model_id: str, - hit_rate: float, - num_batches: int, - simplify_rate: float, -): - # HACK - this is an attempted workaround for the following bug - # https://github.com/huggingface/transformers/issues/34307 - from transformers import AutoImageProcessor # noqa: F401 - from transformers import AutoProcessor # noqa: F401 - - AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) - _test_processing_correctness( model_id, hit_rate=hit_rate, @@ -356,16 +311,10 @@ def _assert_inputs_equal( if ignore_mm_keys is None: ignore_mm_keys = set() - if msg is None: - assert "mm_kwargs" in a and "mm_kwargs" in b - else: - assert "mm_kwargs" in a and "mm_kwargs" in b, msg + assert "mm_kwargs" in a and "mm_kwargs" in b, msg for key in ignore_mm_keys: a["mm_kwargs"].pop(key, None) b["mm_kwargs"].pop(key, None) - if msg is None: - assert a == b - else: - assert a == b, msg + assert a == b, msg diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py index b89376cf1722..d4794396f6d2 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -49,7 +49,7 @@ def test_profiling( ] * max_num_seqs mm_kwargs = processor.apply( - prompt=dummy_mm_data.prompt_text, + prompt=dummy_mm_data.prompt, mm_data=dummy_mm_data.mm_data, hf_processor_mm_kwargs=dict(), )["mm_kwargs"] diff --git a/tests/models/registry.py b/tests/models/registry.py index bf7729d4e044..a49e3ad6b20e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -8,6 +8,8 @@ from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION +from vllm.config import TokenizerMode + @dataclass(frozen=True) class _HfExamplesInfo: @@ -20,7 +22,7 @@ class _HfExamplesInfo: tokenizer: Optional[str] = None """Set the tokenizer to load for this architecture.""" - tokenizer_mode: str = "auto" + tokenizer_mode: TokenizerMode = "auto" """Set the tokenizer type for this architecture.""" speculative_model: Optional[str] = None @@ -388,8 +390,7 @@ def check_available_online( "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True), "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 - tokenizer_mode="mistral", - v0_only=True), + tokenizer_mode="mistral"), "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL", extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501 trust_remote_code=True, @@ -400,7 +401,7 @@ def check_available_online( "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B", min_transformers_version="4.52"), "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ", # noqa: E501 - min_transformers_version="4.52"), + min_transformers_version="4.52"), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": 
_HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index c664d2371e27..bbaa85cf54df 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -9,7 +9,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from mistral_common.protocol.instruct.messages import ImageChunk +from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk, + UserMessage) +from mistral_common.protocol.instruct.request import ChatCompletionRequest from mistral_common.tokens.tokenizers.multimodal import ImageEncoder from PIL import Image from transformers import PixtralVisionConfig, TensorType @@ -39,7 +41,7 @@ BaseProcessingInfo, MultiModalHashes, PromptReplacement, PromptUpdate, PromptUpdateDetails) -from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import (MistralTokenizer, cached_tokenizer_from_config) @@ -224,6 +226,28 @@ def get_dummy_mm_data( num_images=num_images) } + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + tokenizer = self.info.get_tokenizer() + + dummy_text = self.get_dummy_text(mm_counts) + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) + dummy_images = dummy_mm_data.get("image", []) + + request = ChatCompletionRequest(messages=[ + UserMessage(content=[ + TextChunk(text=dummy_text), + *(ImageChunk(image=image) for image in dummy_images), + ]), + ]) + res = tokenizer.mistral.encode_chat_completion(request) + dummy_tokens = res.tokens + + return ProcessorInputs(prompt=dummy_tokens, mm_data=dummy_mm_data) + class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] ): @@ -275,8 +299,12 @@ def _cached_apply_hf_processor( *, return_mm_hashes: bool, ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: - prompt_ids, mm_kwargs, mm_hashes, _ = super( - )._cached_apply_hf_processor( + ( + prompt_ids, + mm_kwargs, + mm_hashes, + _, + ) = super()._cached_apply_hf_processor( prompt=prompt, mm_data_items=mm_data_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index b5875124c126..59427f35293a 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -3,7 +3,7 @@ from abc import ABC from collections.abc import Mapping from dataclasses import dataclass, field -from typing import Generic, NamedTuple, Optional, TypeVar, cast +from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast import numpy as np import numpy.typing as npt @@ -27,7 +27,7 @@ class ProcessorInputs: Represents the keyword arguments to {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. 
""" - prompt_text: str + prompt: Union[str, list[int]] mm_data: MultiModalDataDict hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) @@ -75,7 +75,12 @@ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: "in an upcoming release.") seq_len = self.info.ctx.model_config.max_model_len - return self.get_dummy_processor_inputs(seq_len, mm_counts).prompt_text + + prompt = self.get_dummy_processor_inputs(seq_len, mm_counts).prompt + if not isinstance(prompt, str): + prompt = self.info.get_tokenizer().decode(prompt) + + return prompt # TODO: @abstractmethod after transition def get_dummy_mm_data( @@ -101,7 +106,7 @@ def get_dummy_processor_inputs( dummy_text = self.get_dummy_text(mm_counts) dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts) - return ProcessorInputs(prompt_text=dummy_text, mm_data=dummy_mm_data) + return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data) def _get_dummy_audios( self, @@ -177,7 +182,7 @@ def _get_dummy_mm_inputs( seq_len, mm_counts) return self.processor.apply( - prompt=processor_inputs.prompt_text, + prompt=processor_inputs.prompt, mm_data=processor_inputs.mm_data, hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, ) From 6071e989df1531b59ef35568f83f7351afb0b51e Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Sun, 25 May 2025 18:33:35 +0100 Subject: [PATCH 151/192] [Core][Multimodal] Convert PIL Image to array without data copy when hashing (#18682) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/multimodal/hasher.py | 4 ++-- vllm/multimodal/video.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index a5a4dcd0b6e1..b4cd6a90834c 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -36,8 +36,8 @@ def serialize_item(cls, obj: object) -> bytes: return np.array(obj).tobytes() if isinstance(obj, Image.Image): - return cls.item_to_bytes("image", - np.array(convert_image_mode(obj, "RGBA"))) + return cls.item_to_bytes( + "image", np.asarray(convert_image_mode(obj, "RGBA"))) if isinstance(obj, torch.Tensor): return cls.item_to_bytes("tensor", obj.numpy()) if isinstance(obj, np.ndarray): diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 3685fd4c3458..261d56abad9c 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -164,7 +164,7 @@ def load_base64(self, media_type: str, data: str) -> npt.NDArray: ) return np.stack([ - np.array(load_frame(frame_data)) + np.asarray(load_frame(frame_data)) for frame_data in data.split(",") ]) From fba06427043be95a6c3d4329aeba8b519c449f23 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 11:27:50 +0800 Subject: [PATCH 152/192] [CI/Build][Doc] Update `gte-Qwen2-1.5B-instruct` usage (#18683) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/models/supported_models.md | 5 +---- tests/models/language/pooling/test_embedding.py | 6 +----- tests/models/language/pooling/test_gte.py | 9 --------- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4b19272f4a28..7594c6e6fbf1 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -404,10 +404,7 @@ Specified using `--task embed`. 
You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. !!! note - The HF implementation of `Alibaba-NLP/gte-Qwen2-1.5B-instruct` is hardcoded to use causal attention despite what is shown in `config.json`. To compare vLLM vs HF results, - you should set `--hf-overrides '{"is_causal": true}'` in vLLM so that the two implementations are consistent with each other. - - For both the 1.5B and 7B variants, you also need to enable `--trust-remote-code` for the correct tokenizer to be loaded. + For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). !!! note diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 9db385e77bdb..a44b2154b137 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -15,13 +15,12 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), pytest.param("intfloat/e5-mistral-7b-instruct", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), # [Cross-Encoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), @@ -47,9 +46,6 @@ def test_models( vllm_extra_kwargs["override_pooler_config"] = \ PoolerConfig(pooling_type="MEAN") - if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - vllm_extra_kwargs["hf_overrides"] = {"is_causal": True} - # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index b60d27aaa72b..91d10f529cd6 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -45,9 +45,6 @@ EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", architecture="Qwen2ForCausalLM", enable_test=True), - EmbedModelInfo("Alibaba-NLP/gte-Qwen2-7B-instruct", - architecture="Qwen2ForCausalLM", - enable_test=False), ########## ModernBertModel EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", architecture="ModernBertModel", @@ -61,9 +58,6 @@ def test_models_mteb(hf_runner, vllm_runner, from .mteb_utils import mteb_test_embed_models vllm_extra_kwargs: dict[str, Any] = {} - if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - vllm_extra_kwargs["hf_overrides"] = {"is_causal": True} - if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} @@ -81,9 +75,6 @@ def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts = [str(s).strip() for s in example_prompts] vllm_extra_kwargs: dict[str, Any] = {} - if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - vllm_extra_kwargs["hf_overrides"] = {"is_causal": True} - if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} From 8820821b59121192205077d0cdc56490e215b9fb 
Mon Sep 17 00:00:00 2001 From: AlexZhao <zhaohaidao2008@hotmail.com> Date: Mon, 26 May 2025 13:51:27 +0800 Subject: [PATCH 153/192] [Misc] Fixed the abnormally high TTFT issue in the PD disaggregation example (#18644) Signed-off-by: zhaohaidao <zhaohaidao2008@hotmail.com> Signed-off-by: zhaohaiyuan <zhaohaiyuan@xiaohongshu.com> Co-authored-by: zhaohaiyuan <zhaohaiyuan@xiaohongshu.com> --- .../lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py index 8db93bc8931b..32d36da9f2e8 100644 --- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py +++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -135,7 +135,7 @@ async def generate_stream(): yield chunk return StreamingResponse(generate_stream(), - media_type="application/json") + media_type="text/event-stream") except Exception as e: import sys @@ -172,7 +172,7 @@ async def generate_stream(): yield chunk return StreamingResponse(generate_stream(), - media_type="application/json") + media_type="text/event-stream") except Exception as e: import sys From abd4030d94206db4b0ea6c42640e2782ccd62532 Mon Sep 17 00:00:00 2001 From: CYJiang <86391540+googs1025@users.noreply.github.com> Date: Mon, 26 May 2025 14:32:28 +0800 Subject: [PATCH 154/192] refactor: simplify request handler, use positive condition check for handler assignment (#18690) Signed-off-by: googs1025 <googs1025@gmail.com> --- vllm/entrypoints/openai/run_batch.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index fccf459f17dc..eae83c9a494a 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -365,8 +365,8 @@ async def main(args): # Determine the type of request and run it. 
if request.url == "/v1/chat/completions": - chat_handler_fn = (None if openai_serving_chat is None else - openai_serving_chat.create_chat_completion) + chat_handler_fn = openai_serving_chat.create_chat_completion if \ + openai_serving_chat is not None else None if chat_handler_fn is None: response_futures.append( make_async_error_request_output( @@ -380,8 +380,8 @@ async def main(args): run_request(chat_handler_fn, request, tracker)) tracker.submitted() elif request.url == "/v1/embeddings": - embed_handler_fn = (None if openai_serving_embedding is None else - openai_serving_embedding.create_embedding) + embed_handler_fn = openai_serving_embedding.create_embedding if \ + openai_serving_embedding is not None else None if embed_handler_fn is None: response_futures.append( make_async_error_request_output( @@ -394,8 +394,8 @@ async def main(args): run_request(embed_handler_fn, request, tracker)) tracker.submitted() elif request.url == "/v1/score": - score_handler_fn = (None if openai_serving_scores is None else - openai_serving_scores.create_score) + score_handler_fn = openai_serving_scores.create_score if \ + openai_serving_scores is not None else None if score_handler_fn is None: response_futures.append( make_async_error_request_output( From 561b77a0d608a9059318d6cff9f0975439880d77 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser <mbayser@br.ibm.com> Date: Mon, 26 May 2025 03:52:25 -0300 Subject: [PATCH 155/192] [Bugfix] Fix the lm_head in gpt_bigcode in lora mode (#6357) Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Signed-off-by: Max de Bayser <maxdebayser@gmail.com> --- vllm/model_executor/models/gpt_bigcode.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 6a1d97bd7b69..c4ae4fc3c006 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -272,12 +272,6 @@ def load_weights(self, weights: Iterable[tuple[str, class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = {"c_attn": ["c_attn"]} - # LoRA specific attributes - embedding_modules = { - "wte": "input_embeddings", - "lm_head": "output_embeddings", - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -330,8 +324,11 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + skip_prefixes = None + if self.config.tie_word_embeddings: + skip_prefixes = ["lm_head."] loader = AutoWeightsLoader( self, - skip_prefixes=(["lm_head."]), + skip_prefixes=skip_prefixes, ) - return loader.load_weights(weights) \ No newline at end of file + return loader.load_weights(weights) From 4ea62c0ea0ff067853f454003d1bddeacc3629a3 Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Mon, 26 May 2025 15:22:04 +0800 Subject: [PATCH 156/192] [CI] add missing argument (#18694) Signed-off-by: Andy Xie <andy.xning@gmail.com> --- tests/runai_model_streamer_test/test_weight_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py index 4afa76c51693..06e506c35761 100644 --- a/tests/runai_model_streamer_test/test_weight_utils.py +++ b/tests/runai_model_streamer_test/test_weight_utils.py @@ -23,10 +23,11 @@ def test_runai_model_loader(): runai_model_streamer_tensors = {} 
hf_safetensors_tensors = {} - for name, tensor in runai_safetensors_weights_iterator(safetensors): + for name, tensor in runai_safetensors_weights_iterator( + safetensors, True): runai_model_streamer_tensors[name] = tensor - for name, tensor in safetensors_weights_iterator(safetensors): + for name, tensor in safetensors_weights_iterator(safetensors, True): hf_safetensors_tensors[name] = tensor assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors) From 4b7740a1055d2aa13b81c63f636a972024fdcd85 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 15:42:04 +0800 Subject: [PATCH 157/192] [GH] Add issue template for reporting CI failures (#18696) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .github/ISSUE_TEMPLATE/400-bug-report.yml | 6 +- .github/ISSUE_TEMPLATE/450-ci-failure.yml | 69 +++++++++++++++++++++++ 2 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/450-ci-failure.yml diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 00b0f024c0da..f05be2ba8707 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -81,14 +81,14 @@ body: required: true - type: markdown attributes: - value: > - ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: + value: | + ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output: - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. - Thanks for contributing 🎉! + Thanks for reporting 🙏! - type: checkboxes id: askllm attributes: diff --git a/.github/ISSUE_TEMPLATE/450-ci-failure.yml b/.github/ISSUE_TEMPLATE/450-ci-failure.yml new file mode 100644 index 000000000000..e54167363480 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml @@ -0,0 +1,69 @@ +name: 🧪 CI failure report +description: Report a failing test. +title: "[CI Failure]: " +labels: ["ci-failure"] + +body: +- type: markdown + attributes: + value: > + #### Include the name of the failing Buildkite step and test file in the title. +- type: input + attributes: + label: Name of failing test + description: | + Paste in the fully-qualified name of the failing test from the logs. + placeholder: | + `path/to/test_file.py::test_name[params]` + validations: + required: true +- type: checkboxes + attributes: + label: Basic information + description: Select all items that apply to the failing test. + options: + - label: Flaky test + - label: Can reproduce locally + - label: Caused by external libraries (e.g. bug in `transformers`) +- type: textarea + attributes: + label: 🧪 Describe the failing test + description: | + Please provide a clear and concise description of the failing test. + placeholder: | + A clear and concise description of the failing test. + + ``` + The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present. + ``` + validations: + required: true +- type: input + attributes: + label: 📝 History of failing test + description: | + Since when did the test start to fail? 
+ You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main). + + If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods: + + - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally. + + - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally. + + - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only) + placeholder: | + Approximate timeline and/or problematic PRs + + A link to the Buildkite analytics of the failing test (if available) + validations: + required: true +- type: textarea + attributes: + label: CC List. + description: > + The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test. +- type: markdown + attributes: + value: > + Thanks for reporting 🙏! From 65523a0995ffd328526e705e04b42198e519cdf8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 15:45:39 +0800 Subject: [PATCH 158/192] [Doc] Fix issue template format (#18699) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .github/ISSUE_TEMPLATE/450-ci-failure.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/450-ci-failure.yml b/.github/ISSUE_TEMPLATE/450-ci-failure.yml index e54167363480..7af0e0673a2f 100644 --- a/.github/ISSUE_TEMPLATE/450-ci-failure.yml +++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml @@ -38,7 +38,7 @@ body: ``` validations: required: true -- type: input +- type: textarea attributes: label: 📝 History of failing test description: | From 61a45e7a7266e29d14349cf67de69f166415e1c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 16:44:04 +0800 Subject: [PATCH 159/192] [Bugfix] Fix Mistral-format models with sliding window (#18693) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/config.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index c0671d2524ec..4196684639ee 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -542,8 +542,10 @@ def __post_init__(self) -> None: sliding_window = getattr(self.hf_text_config, "sliding_window", None) sliding_window_pattern = getattr(self.hf_text_config, "sliding_window_pattern", None) + has_interleaved_attention = sliding_window_pattern is not None or ( + isinstance(sliding_window, list)) - if not (self.disable_sliding_window or sliding_window_pattern is None): + if not self.disable_sliding_window and has_interleaved_attention: if (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( @@ -563,7 +565,10 @@ def __post_init__(self) -> None: # only the attention layer itself is aware of the sliding # window, and use the window size to compute the attention. 
self.hf_text_config.interleaved_sliding_window = sliding_window - delattr(self.hf_text_config, "sliding_window") + + if hasattr(self.hf_text_config, "sliding_window"): + delattr(self.hf_text_config, "sliding_window") + sliding_window = None self.max_model_len = _get_and_verify_max_len( @@ -1041,7 +1046,8 @@ def verify_with_parallel_config( if self.use_async_output_proc: self.use_async_output_proc = False - def get_hf_config_sliding_window(self) -> Optional[int]: + def get_hf_config_sliding_window( + self) -> Union[Optional[int], list[Optional[int]]]: """Get the sliding window size, or None if disabled.""" # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in @@ -1052,7 +1058,7 @@ def get_hf_config_sliding_window(self) -> Optional[int]: return None return getattr(self.hf_text_config, "sliding_window", None) - def get_sliding_window(self) -> Optional[int]: + def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]: """Get the sliding window size, or None if disabled. """ # If user disables sliding window, return None. From 38b13dfe78412aacfd9bf99e4f78bad5a43df9e6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 17:05:17 +0800 Subject: [PATCH 160/192] [CI/Build] Replace `math.isclose` with `pytest.approx` (#18703) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .../openai/correctness/test_mteb.py | 3 +-- tests/entrypoints/openai/test_score.py | 8 +++---- tests/models/language/pooling/mteb_utils.py | 3 +-- tests/models/language/pooling/test_gritlm.py | 9 ++++---- tests/models/language/pooling/test_jina.py | 8 +++---- tests/models/language/pooling/test_scoring.py | 22 +++++++++---------- 6 files changed, 22 insertions(+), 31 deletions(-) diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py index b702e0acd38b..ebf2f829b583 100644 --- a/tests/entrypoints/openai/correctness/test_mteb.py +++ b/tests/entrypoints/openai/correctness/test_mteb.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import math import os import pytest @@ -39,4 +38,4 @@ def test_mteb(server): print("SentenceTransformer main score: ", st_main_score) print("Difference: ", st_main_score - vllm_main_score) - assert math.isclose(st_main_score, vllm_main_score, rel_tol=1e-4) + assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4) diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index b756680ea9f2..b373f2912752 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -1,6 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - -import math from typing import Any import pytest @@ -92,7 +90,7 @@ def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer, hf_outputs = run_transformers(runner, model, text_pairs) for i in range(len(vllm_outputs)): - assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01) + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer, model: dict[str, Any], runner): @@ -124,7 +122,7 @@ def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer, hf_outputs = run_transformers(runner, model, text_pairs) for i in range(len(vllm_outputs)): - assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01) + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer, model: dict[str, Any], runner): @@ -150,7 
+148,7 @@ def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer, hf_outputs = run_transformers(runner, model, text_pairs) for i in range(len(vllm_outputs)): - assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01) + assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01) def test_score_max_model_len(self, server: RemoteOpenAIServer, model: dict[str, Any]): diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 7de2a9af2f2e..f83c9940d524 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import math from collections.abc import Sequence import mteb @@ -115,4 +114,4 @@ def mteb_test_embed_models(hf_runner, print("SentenceTransformer:", model_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) - assert math.isclose(st_main_score, vllm_main_score, rel_tol=MTEB_EMBED_TOL) + assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 7dd3c8a4e79e..f450edd82162 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -2,7 +2,6 @@ from __future__ import annotations import importlib.util -import math from array import array import openai @@ -104,16 +103,16 @@ def get_test_data(): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) - assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) + assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001) cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1]) - assert math.isclose(cosine_sim_q0_d1, 0.101, abs_tol=0.001) + assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001) cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0]) - assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) + assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) - assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001) + assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001) def test_gritlm_offline_embedding(vllm_runner): diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 5287ca37c0fb..0ddff2146caa 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -1,6 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import math - import pytest from vllm import PoolingParams @@ -60,7 +58,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): assert len(vllm_outputs) == 1 assert len(hf_outputs) == 1 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) @pytest.mark.parametrize("dtype", ["half"]) @@ -78,8 +76,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert len(vllm_outputs) == 10 assert len(hf_outputs) == 10 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) @pytest.fixture(scope="module", params=EMBEDDING_MODELS) diff --git a/tests/models/language/pooling/test_scoring.py 
b/tests/models/language/pooling/test_scoring.py index e9527700c3ca..6b10aeffc4b7 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -1,6 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import math - import pytest import torch import torch.nn.functional as F @@ -45,7 +43,7 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): assert len(vllm_outputs) == 1 assert len(hf_outputs) == 1 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): @@ -64,8 +62,8 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): @@ -84,8 +82,8 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) @pytest.fixture(scope="module", params=EMBEDDING_MODELS) @@ -112,7 +110,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): assert len(vllm_outputs) == 1 assert len(hf_outputs) == 1 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): @@ -140,8 +138,8 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): @@ -169,5 +167,5 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): assert len(vllm_outputs) == 2 assert len(hf_outputs) == 2 - assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) - assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01) + assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) From 5a2c76cbe148b050b5d77b3c638f9584eba8eee9 Mon Sep 17 00:00:00 2001 From: Ning Xie <andy.xning@gmail.com> Date: Mon, 26 May 2025 18:23:35 +0800 Subject: [PATCH 161/192] [CI] fix dump_input for str type (#18697) Signed-off-by: Andy Xie <andy.xning@gmail.com> --- tests/test_logger.py | 38 +++++++++++++++++++++++++++++++- vllm/logging_utils/dump_input.py | 6 ++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/tests/test_logger.py b/tests/test_logger.py index 11deae309ac8..046f70504c89 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 - +import enum import json import logging import os import sys import 
tempfile +from dataclasses import dataclass from json.decoder import JSONDecodeError from tempfile import NamedTemporaryFile from typing import Any @@ -16,6 +17,7 @@ from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger, enable_trace_function_call, init_logger) from vllm.logging_utils import NewLineFormatter +from vllm.logging_utils.dump_input import prepare_object_to_dump def f1(x): @@ -216,3 +218,37 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): assert other_logger.handlers != root_logger.handlers assert other_logger.level != root_logger.level assert other_logger.propagate + + +def test_prepare_object_to_dump(): + str_obj = 'str' + assert prepare_object_to_dump(str_obj) == "'str'" + + list_obj = [1, 2, 3] + assert prepare_object_to_dump(list_obj) == '[1, 2, 3]' + + dict_obj = {'a': 1, 'b': 'b'} + assert prepare_object_to_dump(dict_obj) in [ + "{a: 1, b: 'b'}", "{b: 'b', a: 1}" + ] + + set_obj = {1, 2, 3} + assert prepare_object_to_dump(set_obj) == '[1, 2, 3]' + + tuple_obj = ('a', 'b', 'c') + assert prepare_object_to_dump(tuple_obj) == "['a', 'b', 'c']" + + class CustomEnum(enum.Enum): + A = enum.auto() + B = enum.auto() + C = enum.auto() + + assert prepare_object_to_dump(CustomEnum.A) == repr(CustomEnum.A) + + @dataclass + class CustomClass: + a: int + b: str + + assert (prepare_object_to_dump(CustomClass( + 1, 'b')) == "CustomClass(a=1, b='b')") diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py index 169e24794095..47ce0ab188bd 100644 --- a/vllm/logging_utils/dump_input.py +++ b/vllm/logging_utils/dump_input.py @@ -18,7 +18,7 @@ def prepare_object_to_dump(obj) -> str: if isinstance(obj, str): - return "'{obj}'" # Double quotes + return f"'{obj}'" # Double quotes elif isinstance(obj, dict): dict_str = ', '.join({f'{str(k)}: {prepare_object_to_dump(v)}' \ for k, v in obj.items()}) @@ -42,9 +42,9 @@ def prepare_object_to_dump(obj) -> str: return obj.anon_repr() elif hasattr(obj, '__dict__'): items = obj.__dict__.items() - dict_str = ','.join([f'{str(k)}={prepare_object_to_dump(v)}' \ + dict_str = ', '.join([f'{str(k)}={prepare_object_to_dump(v)}' \ for k, v in items]) - return (f"{type(obj).__name__}({dict_str})") + return f"{type(obj).__name__}({dict_str})" else: # Hacky way to make sure we can serialize the object in JSON format try: From 6d68030f1cac398987405712db894593a334dfee Mon Sep 17 00:00:00 2001 From: Naveassaf <55059536+Naveassaf@users.noreply.github.com> Date: Mon, 26 May 2025 13:31:49 +0300 Subject: [PATCH 162/192] [Model] Add support for YARN in NemotronNAS models (#18427) Signed-off-by: Nave Assaf <nassaf@nvidia.com> --- vllm/model_executor/models/llama.py | 35 +++++++++------- vllm/model_executor/models/nemotron_nas.py | 48 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c15c0213b520..6584980f6dc2 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -162,20 +162,9 @@ def __init__( prefix=f"{prefix}.o_proj", ) - is_neox_style = True - is_gguf = quant_config and quant_config.get_name() == "gguf" - if is_gguf and config.model_type == "llama": - is_neox_style = False - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=is_neox_style, - partial_rotary_factor=self.partial_rotary_factor, - ) + 
self._init_rotary_emb(config, + rope_scaling=rope_scaling, + quant_config=quant_config) if hasattr(config, "interleaved_sliding_window"): interleaved_sliding_window = config.interleaved_sliding_window @@ -214,6 +203,24 @@ def forward( output, _ = self.o_proj(attn_output) return output + def _init_rotary_emb(self, config: LlamaConfig, + rope_scaling: Optional[dict[str, Any]], + quant_config: Optional[QuantizationConfig]) -> None: + is_neox_style = True + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and self.config.model_type == "llama": + is_neox_style = False + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + partial_rotary_factor=self.partial_rotary_factor, + ) + class LlamaDecoderLayer(nn.Module): diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index f4d5a77f2086..9808fe05558e 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -23,18 +23,20 @@ # limitations under the License. """Inference-only deci model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Optional, Union +from typing import Any, Optional, Union import torch from torch import nn from transformers import LlamaConfig +from vllm.attention import AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -62,6 +64,48 @@ def _find_multiple(n: int, k: int) -> int: return n + k - (n % k) +class DeciLMAttention(LlamaAttention): + + def __init__( + self, + config: LlamaConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + bias_o_proj: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + ) -> None: + super().__init__(config, hidden_size, num_heads, num_kv_heads, + rope_theta, rope_scaling, max_position_embeddings, + quant_config, bias, bias_o_proj, cache_config, prefix, + attn_type) + + def _init_rotary_emb(self, config, rope_scaling: Optional[dict[str, Any]], + quant_config: Optional[QuantizationConfig]) -> None: + # Enables YARN for Mistral and LLaMA4 derivatives. 
+ is_neox_style = True + if hasattr(config, "position_embedding_type"): + is_neox_style = config.position_embedding_type not in [ + "mistral_yarn", "rope_llama4" + ] + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + partial_rotary_factor=self.partial_rotary_factor) + + class DeciLMDecoderLayer(nn.Module): def __init__( @@ -98,7 +142,7 @@ def __init__( if not self._is_no_op_attention: num_kv_heads = (config.num_attention_heads // block_config.attention.n_heads_in_group) - self.self_attn = LlamaAttention( + self.self_attn = DeciLMAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, From 08777500295422577fb7d88b2eb798a3d56af3ee Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Mon, 26 May 2025 19:00:08 +0800 Subject: [PATCH 163/192] [CI/Build] Split pooling and generation extended language models tests in CI (#18705) Signed-off-by: Isotr0py <2037008807@qq.com> --- .buildkite/test-pipeline.yaml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1f54b70f05dd..250140a7eeda 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -486,16 +486,25 @@ steps: - pip freeze | grep -E 'torch' - pytest -v -s models/language -m core_model -- label: Language Models Test (Extended) +- label: Language Models Test (Extended Generation) # 1hr20min mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ - - tests/models/language + - tests/models/language/generation commands: # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - - pytest -v -s models/language -m 'not core_model' + - pytest -v -s models/language/generation -m 'not core_model' + +- label: Language Models Test (Extended Pooling) # 36min + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' - label: Multi-Modal Models Test (Standard) mirror_hardwares: [amdexperimental] From e76be06550895b633d1d1aca90f0725152f92294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Durejko?= <lukasz.durejko@intel.com> Date: Mon, 26 May 2025 14:26:07 +0200 Subject: [PATCH 164/192] [Hardware][Intel-Gaudi] [CI/Build] Add tensor parallel size = 2 test to HPU CI (#18709) Signed-off-by: Lukasz Durejko <ldurejko@habana.ai> --- .buildkite/scripts/hardware_ci/run-hpu-test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index 95b6ac37f185..c3b78d471297 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -21,4 +21,6 @@ remove_docker_container # Run the image and launch offline inference docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2 + EXITCODE=$? From 0665e29998016488e0836ba1abfdb95943731f05 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Mon, 26 May 2025 21:56:18 +0800 Subject: [PATCH 165/192] [Misc] add AutoGen integration (#18712) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> --- docs/deployment/frameworks/autogen.md | 83 +++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 docs/deployment/frameworks/autogen.md diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md new file mode 100644 index 000000000000..ad8c167659ef --- /dev/null +++ b/docs/deployment/frameworks/autogen.md @@ -0,0 +1,83 @@ +--- +title: AutoGen +--- +[](){ #deployment-autogen } + +[AutoGen](https://github.com/microsoft/autogen) is a framework for creating multi-agent AI applications that can act autonomously or work alongside humans. + +## Prerequisites + +- Setup vLLM environment + +- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment + +```console +pip install vllm + +# Install AgentChat and OpenAI client from Extensions +# AutoGen requires Python 3.10 or later. +pip install -U "autogen-agentchat" "autogen-ext[openai]" +``` + +## Deploy + +- Start the vLLM server with the supported chat completion model, e.g. 
+ +```console +python -m vllm.entrypoints.openai.api_server \ + --model mistralai/Mistral-7B-Instruct-v0.2 +``` + +- Call it with AutoGen: + +```python +import asyncio +from autogen_core.models import UserMessage +from autogen_ext.models.openai import OpenAIChatCompletionClient +from autogen_core.models import ModelFamily + + +async def main() -> None: + # Create a model client + model_client = OpenAIChatCompletionClient( + model="mistralai/Mistral-7B-Instruct-v0.2", + base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1", + api_key="EMPTY", + model_info={ + "vision": False, + "function_calling": False, + "json_output": False, + "family": ModelFamily.MISTRAL, + "structured_output": True, + }, + ) + + messages = [UserMessage(content="Write a very short story about a dragon.", source="user")] + + # Create a stream. + stream = model_client.create_stream(messages=messages) + + # Iterate over the stream and print the responses. + print("Streamed responses:") + async for response in stream: + if isinstance(response, str): + # A partial response is a string. + print(response, flush=True, end="") + else: + # The last response is a CreateResult object with the complete message. + print("\n\n------------\n") + print("The complete response:", flush=True) + print(response.content, flush=True) + + # Close the client when done. + await model_client.close() + + +asyncio.run(main()) +``` + +For details, see the tutorial: + +- [Using vLLM in AutoGen](https://microsoft.github.io/autogen/0.2/docs/topics/non-openai-models/local-vllm/) + +- [OpenAI-compatible API examples](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.models.openai.html#autogen_ext.models.openai.OpenAIChatCompletionClient) From 243eb9199fa5962e8edf46e47170f68b3f6a0116 Mon Sep 17 00:00:00 2001 From: dylan <xuhao296@qq.com> Date: Mon, 26 May 2025 22:10:56 +0800 Subject: [PATCH 166/192] [Bugfix]: handle hf-xet CAS error when loading Qwen3 weights in vLLM (#18701) --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index f31824b55026..dd0175dbbef2 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -8,7 +8,7 @@ tqdm blake3 py-cpuinfo transformers >= 4.51.1 -huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. +huggingface-hub[hf_xet] >= 0.32.0 # Required for Xet downloads. tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
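As a brief aside on the requirement bump in the patch above (`huggingface-hub[hf_xet] >= 0.32.0`): the sketch below is not part of any patch in this series; it is only an illustrative way to confirm that a local environment already satisfies the new floor before picking up the change. It assumes `huggingface_hub` is installed (its `packaging` dependency comes with it); the message strings are illustrative.

```python
# Illustrative sanity check only: verify the installed huggingface_hub meets the
# new >= 0.32.0 floor from requirements/common.txt (needed for hf_xet downloads).
import huggingface_hub
from packaging.version import Version

installed = Version(huggingface_hub.__version__)
required = Version("0.32.0")

assert installed >= required, (
    f"huggingface_hub {installed} is older than the {required} floor "
    "required by this change for hf_xet (Xet) downloads"
)
print(f"huggingface_hub {installed} satisfies the >= {required} requirement")
```
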
From 9553fdb41e63a2864684242f29851beb4afe75c1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 22:33:34 +0800 Subject: [PATCH 167/192] [Doc] Improve API docs (#18713) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/.nav.yml | 11 ++++++----- docs/mkdocs/stylesheets/extra.css | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/.nav.yml b/docs/.nav.yml index 100841aecf61..4a870b40ed23 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -18,7 +18,7 @@ nav: - Roadmap: https://roadmap.vllm.ai - Releases: https://github.com/vllm-project/vllm/releases - User Guide: - - usage/README.md + - Summary: usage/README.md - General: - usage/* - Inference and Serving: @@ -44,7 +44,7 @@ nav: - features/* - features/quantization - Developer Guide: - - contributing/README.md + - Summary: contributing/README.md - General: - glob: contributing/* flatten_single_child_sections: true @@ -53,9 +53,10 @@ nav: - V0: design - V1: design/v1 - API Reference: - - api/README.md - - glob: api/vllm/* - preserve_directory_names: true + - Summary: api/README.md + - Contents: + - glob: api/vllm/* + preserve_directory_names: true - Community: - community/* - Blog: https://blog.vllm.ai diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css index 165f53efc085..dd7b3460a319 100644 --- a/docs/mkdocs/stylesheets/extra.css +++ b/docs/mkdocs/stylesheets/extra.css @@ -5,7 +5,7 @@ } /* https://christianoliff.com/blog/styling-external-links-with-an-icon-in-css/ */ -a:not(:has(svg)):not(.md-icon) { +a:not(:has(svg)):not(.md-icon):not(.autorefs-external) { align-items: center; &[href^="//"]::after, From 82e2339b0632a4c787915210b5b57da13de26bf6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 22:38:04 +0800 Subject: [PATCH 168/192] [Doc] Move examples and further reorganize user guide (#18666) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .buildkite/pyproject.toml | 5 ----- .buildkite/test-pipeline.yaml | 2 +- .gitignore | 2 +- benchmarks/pyproject.toml | 5 ----- docs/.nav.yml | 9 ++++----- docs/configuration/README.md | 9 +++++++-- docs/{usage => configuration}/env_vars.md | 0 docs/design/v1/metrics.md | 4 ++-- docs/mkdocs/hooks/generate_examples.py | 2 +- docs/models/extensions/tensorizer.md | 2 +- docs/training/rlhf.md | 6 +++--- examples/{ => others}/lmcache/README.md | 0 examples/{ => others}/lmcache/cpu_offload_lmcache.py | 0 .../{ => others}/lmcache/disagg_prefill_lmcache_v0.py | 0 .../configs/lmcache-decoder-config.yaml | 0 .../configs/lmcache-prefiller-config.yaml | 0 .../disagg_prefill_lmcache_v1/disagg_example_nixl.sh | 0 .../disagg_prefill_lmcache_v1/disagg_proxy_server.py | 0 .../disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh | 0 .../lmcache/kv_cache_sharing_lmcache_v1.py | 0 examples/{other => others}/logging_configuration.md | 0 examples/{other => others}/tensorize_vllm_model.py | 10 +++++----- pyproject.toml | 5 ----- requirements/common.txt | 2 +- tests/lora/test_llama_tp.py | 2 +- vllm/model_executor/model_loader/tensorizer.py | 4 ++-- vllm/model_executor/model_loader/tensorizer_loader.py | 4 ++-- 27 files changed, 31 insertions(+), 42 deletions(-) rename docs/{usage => configuration}/env_vars.md (100%) rename examples/{ => others}/lmcache/README.md (100%) rename examples/{ => others}/lmcache/cpu_offload_lmcache.py (100%) rename examples/{ => others}/lmcache/disagg_prefill_lmcache_v0.py (100%) rename examples/{ => 
others}/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml (100%) rename examples/{ => others}/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml (100%) rename examples/{ => others}/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh (100%) rename examples/{ => others}/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py (100%) rename examples/{ => others}/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh (100%) rename examples/{ => others}/lmcache/kv_cache_sharing_lmcache_v1.py (100%) rename examples/{other => others}/logging_configuration.md (100%) rename examples/{other => others}/tensorize_vllm_model.py (97%) diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml index 083bb795caf5..d5cad1c73c6f 100644 --- a/.buildkite/pyproject.toml +++ b/.buildkite/pyproject.toml @@ -6,11 +6,6 @@ [tool.ruff] line-length = 88 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 250140a7eeda..66e2e3312337 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -246,7 +246,7 @@ steps: - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_embedding.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py diff --git a/.gitignore b/.gitignore index 8d5af1bed92d..e49d1d6ba619 100644 --- a/.gitignore +++ b/.gitignore @@ -146,7 +146,7 @@ venv.bak/ # mkdocs documentation /site -docs/getting_started/examples +docs/examples # mypy .mypy_cache/ diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index f825cb203269..65b1e09a247e 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -6,11 +6,6 @@ [tool.ruff] line-length = 88 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] diff --git a/docs/.nav.yml b/docs/.nav.yml index 4a870b40ed23..42aba9775360 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -5,11 +5,9 @@ nav: - getting_started/quickstart.md - getting_started/installation - Examples: - - Offline Inference: getting_started/examples/offline_inference - - Online Serving: getting_started/examples/online_serving - - Others: - - LMCache: getting_started/examples/lmcache - - getting_started/examples/other/* + - Offline Inference: examples/offline_inference + - Online Serving: examples/online_serving + - Others: examples/others - Quick Links: - User Guide: usage/README.md - Developer Guide: 
contributing/README.md @@ -19,6 +17,7 @@ nav: - Releases: https://github.com/vllm-project/vllm/releases - User Guide: - Summary: usage/README.md + - usage/v1_guide.md - General: - usage/* - Inference and Serving: diff --git a/docs/configuration/README.md b/docs/configuration/README.md index 442a8d441430..6a8fbc79f4af 100644 --- a/docs/configuration/README.md +++ b/docs/configuration/README.md @@ -1,4 +1,9 @@ # Configuration Options -This section lists the most common options for running the vLLM engine. -For a full list, refer to the [configuration][configuration] page. +This section lists the most common options for running vLLM. + +There are three main levels of configuration, from highest priority to lowest priority: + +- [Request parameters][completions-api] and [input arguments][sampling-params] +- [Engine arguments](./engine_args.md) +- [Environment variables](./env_vars.md) diff --git a/docs/usage/env_vars.md b/docs/configuration/env_vars.md similarity index 100% rename from docs/usage/env_vars.md rename to docs/configuration/env_vars.md diff --git a/docs/design/v1/metrics.md b/docs/design/v1/metrics.md index 6080390ba0ed..7156ee9dd3ec 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../.. ### Grafana Dashboard -vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_started/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. +vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: @@ -673,7 +673,7 @@ v0 has support for OpenTelemetry tracing: - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/) - [User-facing - docs](https://docs.vllm.ai/en/latest/getting_started/examples/opentelemetry.html) + docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html) - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) - [IBM product diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index a2131c342e8c..c2f1f2d96f00 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -9,7 +9,7 @@ ROOT_DIR = Path(__file__).parent.parent.parent.parent ROOT_DIR_RELATIVE = '../../../../..' EXAMPLE_DIR = ROOT_DIR / "examples" -EXAMPLE_DOC_DIR = ROOT_DIR / "docs/getting_started/examples" +EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples" print(ROOT_DIR.resolve()) print(EXAMPLE_DIR.resolve()) print(EXAMPLE_DOC_DIR.resolve()) diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 36b49626d47d..b6feb405c6ca 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -10,7 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). 
For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html). +the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html). !!! note Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/training/rlhf.md b/docs/training/rlhf.md index 72e89c0c7478..4f75e4e01495 100644 --- a/docs/training/rlhf.md +++ b/docs/training/rlhf.md @@ -6,6 +6,6 @@ vLLM can be used to generate the completions for RLHF. The best way to do this i See the following basic examples to get started if you don't want to use an existing library: -- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf.html) -- [Training and inference processes are colocated on the same GPUs using Ray](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_colocate.html) -- [Utilities for performing RLHF with vLLM](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_utils.html) +- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md) +- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md) +- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md) diff --git a/examples/lmcache/README.md b/examples/others/lmcache/README.md similarity index 100% rename from examples/lmcache/README.md rename to examples/others/lmcache/README.md diff --git a/examples/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py similarity index 100% rename from examples/lmcache/cpu_offload_lmcache.py rename to examples/others/lmcache/cpu_offload_lmcache.py diff --git a/examples/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v0.py rename to examples/others/lmcache/disagg_prefill_lmcache_v0.py diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml b/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml rename to examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml b/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml rename to examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py similarity index 100% rename from 
examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh similarity index 100% rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh diff --git a/examples/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py similarity index 100% rename from examples/lmcache/kv_cache_sharing_lmcache_v1.py rename to examples/others/lmcache/kv_cache_sharing_lmcache_v1.py diff --git a/examples/other/logging_configuration.md b/examples/others/logging_configuration.md similarity index 100% rename from examples/other/logging_configuration.md rename to examples/others/logging_configuration.md diff --git a/examples/other/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py similarity index 97% rename from examples/other/tensorize_vllm_model.py rename to examples/others/tensorize_vllm_model.py index b1f2ce871bb4..38193b1c1002 100644 --- a/examples/other/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -28,7 +28,7 @@ To serialize a model, install vLLM from source, then run something like this from the root level of this repository: -python examples/other/tensorize_vllm_model.py \ +python examples/others/tensorize_vllm_model.py \ --model facebook/opt-125m \ serialize \ --serialized-directory s3://my-bucket \ @@ -48,7 +48,7 @@ To deserialize a model, you can run something like this from the root level of this repository: -python examples/other/tensorize_vllm_model.py \ +python examples/others/tensorize_vllm_model.py \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ @@ -66,11 +66,11 @@ model-rank-%03d.tensors For more information on the available arguments for serializing, run -`python -m examples.other.tensorize_vllm_model serialize --help`. +`python -m examples.others.tensorize_vllm_model serialize --help`. Or for deserializing: -`python examples/other/tensorize_vllm_model.py deserialize --help`. +`python examples/others/tensorize_vllm_model.py deserialize --help`. Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: @@ -91,7 +91,7 @@ In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: -`python examples/other/tensorize_vllm_model.py deserialize --help` +`python examples/others/tensorize_vllm_model.py deserialize --help` under the `tensorizer options` section. These can also be used for deserialization in this example script, although `--tensorizer-uri` and diff --git a/pyproject.toml b/pyproject.toml index 2e4242f6d5c8..c642aa048586 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,11 +62,6 @@ ignore_patterns = [ [tool.ruff] # Allow lines to be as long as 80. 
line-length = 80 -exclude = [ - # External file, leaving license intact - "examples/other/fp8/quantizer/quantize.py", - "vllm/vllm_flash_attn/flash_attn_interface.pyi" -] [tool.ruff.lint.per-file-ignores] "vllm/third_party/**" = ["ALL"] diff --git a/requirements/common.txt b/requirements/common.txt index dd0175dbbef2..625efc3366f4 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -41,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py watchfiles # required for http server to monitor the updates of TLS files -python-json-logger # Used by logging as per examples/other/logging_configuration.md +python-json-logger # Used by logging as per examples/others/logging_configuration.md scipy # Required for phi-4-multimodal-instruct ninja # Required for xgrammar, rocm, tpu, xpu opentelemetry-sdk>=1.26.0 # vllm.tracing diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 37bbc3cfa7d0..580992dea53d 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -207,7 +207,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, try: result = subprocess.run([ sys.executable, - f"{VLLM_PATH}/examples/other/tensorize_vllm_model.py", "--model", + f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model", MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size", str(tp_size), "serialize", "--serialized-directory", str(tmp_path), "--suffix", suffix diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 6f9408d892c3..4c4502284a6a 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -251,7 +251,7 @@ class TensorizerArgs: encryption_keyfile: File path to a binary file containing a binary key to use for decryption. `None` (the default) means no decryption. See the example script in - examples/other/tensorize_vllm_model.py. + examples/others/tensorize_vllm_model.py. s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable. s3_secret_access_key: The secret access key for the S3 bucket. Can also @@ -469,7 +469,7 @@ def tensorizer_weights_iterator( "loading on vLLM, as tensorizer is forced to load to CPU. " "Consider deserializing a vLLM model instead for faster " "load times. See the " - "examples/other/tensorize_vllm_model.py example script " + "examples/others/tensorize_vllm_model.py example script " "for serializing vLLM models.") deserializer_args = tensorizer_args.deserializer_params diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 26f8c0946b0a..2afe2b59e2f9 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -48,7 +48,7 @@ def _load_model_serialized_cpu( """Load a serialized model with tensorizer to the CPU. This is only necessary when the model isn't vLLM-tensorized (see - examples/other/tensorize_vllm_model.py) This should still + examples/others/tensorize_vllm_model.py) This should still be faster than default HuggingFace loading, but will be slower than loading a vLLM-tensorized model. """ @@ -68,7 +68,7 @@ def _load_model_serialized( """Load a serialized model with tensorizer. 
Expects a vLLM-tensorized model. See the - examples/other/tensorize_vllm_model.py example script + examples/others/tensorize_vllm_model.py example script for serializing vLLM models.""" device_config = vllm_config.device_config From a869baca73eb90ae7bd18402915dc4bfc36cf06b Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Mon, 26 May 2025 22:49:22 +0800 Subject: [PATCH 169/192] [Bugfix] Fix Llama GGUF initialization (#18717) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- vllm/model_executor/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 6584980f6dc2..d36b6466c0bb 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -208,7 +208,7 @@ def _init_rotary_emb(self, config: LlamaConfig, quant_config: Optional[QuantizationConfig]) -> None: is_neox_style = True is_gguf = quant_config and quant_config.get_name() == "gguf" - if is_gguf and self.config.model_type == "llama": + if is_gguf and config.model_type == "llama": is_neox_style = False self.rotary_emb = get_rope( From e7523c2e031bc96740723ab63833d1cf94229ab4 Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Mon, 26 May 2025 16:49:36 +0100 Subject: [PATCH 170/192] [V1][Sampler] Improve performance of FlashInfer sampling by sampling logits instead of probs (#18608) --- vllm/v1/sample/ops/topk_topp_sampler.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 5d8b3f423b02..4a5fbb10d408 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -89,18 +89,18 @@ def forward_cuda( p: Optional[torch.Tensor], ) -> torch.Tensor: """More optimized implementation for top-k and top-p sampling.""" - probs = logits.softmax(dim=-1, dtype=torch.float32) if k is None and p is None: # We prefer `random_sample` over `flashinfer_sample` when sorting is # not needed. This is because `random_sample` does not require # CPU-GPU synchronization while `flashinfer_sample` does. + probs = logits.softmax(dim=-1, dtype=torch.float32) return random_sample(probs, generators) if generators: logger.warning("FlashInfer 0.2.3+ does not support " "per-request generators. Falling back to " "PyTorch-native implementation.") return self.forward_native(logits, generators, k, p) - return flashinfer_sample(probs, k, p, generators) + return flashinfer_sample(logits, k, p, generators) def forward_tpu( self, @@ -254,17 +254,17 @@ def random_sample( def flashinfer_sample( - probs: torch.Tensor, + logits: torch.Tensor, k: Optional[torch.Tensor], p: Optional[torch.Tensor], generators: dict[int, torch.Generator], ) -> torch.Tensor: - """Sample from the probabilities using FlashInfer. + """Sample from the logits using FlashInfer. Statistically, this function is equivalent to the `random_sample` function. However, this function is faster because it avoids sorting the logits tensor via rejection sampling. - + NOTE: The outputs of this function do not necessarily match the outputs of the `random_sample` function. It only guarantees that the outputs are statistically equivalent. @@ -274,18 +274,19 @@ def flashinfer_sample( the synchronization overhead. """ assert not (k is None and p is None) - if k is None: # Top-p only. 
+ probs = logits.softmax(dim=-1, dtype=torch.float32) next_token_ids = flashinfer.sampling.top_p_sampling_from_probs( probs, p, deterministic=True) elif p is None: # Top-k only. + probs = logits.softmax(dim=-1, dtype=torch.float32) next_token_ids = flashinfer.sampling.top_k_sampling_from_probs( probs, k, deterministic=True) else: # Both top-k and top-p. - next_token_ids = (flashinfer.sampling.top_k_top_p_sampling_from_probs( - probs, k, p, deterministic=True)) + next_token_ids = flashinfer.sampling.top_k_top_p_sampling_from_logits( + logits, k, p, deterministic=True) return next_token_ids.view(-1) From 27bebcd89792d5c4b08af7a65095759526f2f9e1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 26 May 2025 17:57:54 +0100 Subject: [PATCH 171/192] Convert `examples` to `ruff-format` (#18400) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- examples/offline_inference/audio_language.py | 150 +++--- .../automatic_prefix_caching.py | 16 +- examples/offline_inference/basic/chat.py | 18 +- examples/offline_inference/basic/classify.py | 15 +- examples/offline_inference/basic/embed.py | 14 +- examples/offline_inference/basic/score.py | 6 +- .../offline_inference/batch_llm_inference.py | 22 +- examples/offline_inference/chat_with_tools.py | 110 ++--- examples/offline_inference/data_parallel.py | 111 +++-- .../decode_example.py | 23 +- .../prefill_example.py | 19 +- .../disaggregated_prefill.py | 45 +- examples/offline_inference/eagle.py | 50 +- .../embed_jina_embeddings_v3.py | 19 +- .../offline_inference/embed_matryoshka_fy.py | 15 +- examples/offline_inference/encoder_decoder.py | 40 +- .../encoder_decoder_multimodal.py | 62 +-- .../offline_inference/llm_engine_example.py | 34 +- .../offline_inference/load_sharded_state.py | 38 +- .../lora_with_quantization_inference.py | 116 ++--- examples/offline_inference/mistral-small.py | 64 +-- examples/offline_inference/mlpspeculator.py | 11 +- .../offline_inference/multilora_inference.py | 86 ++-- examples/offline_inference/neuron.py | 3 +- examples/offline_inference/neuron_eagle.py | 6 +- .../neuron_int8_quantization.py | 9 +- .../offline_inference/neuron_speculation.py | 10 +- examples/offline_inference/prefix_caching.py | 21 +- .../prithvi_geospatial_mae.py | 179 ++++---- examples/offline_inference/profiling.py | 225 +++++---- .../profiling_tpu/profiling.py | 69 +-- .../prompt_embed_inference.py | 65 ++- .../qwen2_5_omni/only_thinker.py | 127 ++--- examples/offline_inference/qwen_1m.py | 30 +- examples/offline_inference/rlhf.py | 21 +- examples/offline_inference/rlhf_colocate.py | 23 +- examples/offline_inference/rlhf_utils.py | 32 +- .../offline_inference/save_sharded_state.py | 41 +- .../offline_inference/structured_outputs.py | 43 +- .../offline_inference/torchrun_example.py | 3 +- examples/offline_inference/tpu.py | 10 +- examples/offline_inference/vision_language.py | 433 +++++++++--------- .../vision_language_embedding.py | 71 +-- .../vision_language_multi_image.py | 425 +++++++++-------- examples/online_serving/api_client.py | 22 +- .../online_serving/cohere_rerank_client.py | 17 +- .../disagg_proxy_demo.py | 188 ++++---- .../gradio_openai_chatbot_webserver.py | 82 ++-- examples/online_serving/gradio_webserver.py | 32 +- .../online_serving/jinaai_rerank_client.py | 12 +- .../online_serving/kv_events_subscriber.py | 16 +- .../openai_chat_completion_client.py | 28 +- ...i_chat_completion_client_for_multimodal.py | 257 +++++------ 
...penai_chat_completion_client_with_tools.py | 147 +++--- ...t_completion_client_with_tools_required.py | 61 +-- ...enai_chat_completion_structured_outputs.py | 80 ++-- ...etion_structured_outputs_structural_tag.py | 44 +- ...etion_structured_outputs_with_reasoning.py | 59 ++- ...at_completion_tool_calls_with_reasoning.py | 158 +++---- .../openai_chat_completion_with_reasoning.py | 12 +- ...hat_completion_with_reasoning_streaming.py | 4 +- ...ai_chat_embedding_client_for_multimodal.py | 127 +++-- .../openai_classification_client.py | 4 +- .../openai_completion_client.py | 9 +- .../openai_cross_encoder_score.py | 13 +- .../online_serving/openai_embedding_client.py | 2 +- .../online_serving/openai_pooling_client.py | 21 +- .../openai_transcription_client.py | 30 +- .../opentelemetry/dummy_client.py | 9 +- ...ompt_embed_inference_with_openai_client.py | 20 +- examples/online_serving/ray_serve_deepseek.py | 4 +- ...val_augmented_generation_with_langchain.py | 113 ++--- ...al_augmented_generation_with_llamaindex.py | 97 ++-- .../streamlit_openai_chatbot_webserver.py | 55 +-- examples/online_serving/utils.py | 6 +- .../others/lmcache/cpu_offload_lmcache.py | 24 +- .../lmcache/disagg_prefill_lmcache_v0.py | 58 ++- .../disagg_proxy_server.py | 100 ++-- .../lmcache/kv_cache_sharing_lmcache_v1.py | 40 +- examples/others/tensorize_vllm_model.py | 7 +- examples/pyproject.toml | 54 +++ pyproject.toml | 2 + 83 files changed, 2535 insertions(+), 2411 deletions(-) create mode 100644 examples/pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5ee909c3b8ca..d0fa4e8f64cc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - id: ruff args: [--output-format, github, --fix] - id: ruff-format - files: ^(.buildkite|benchmarks)/.* + files: ^(.buildkite|benchmarks|examples)/.* - repo: https://github.com/codespell-project/codespell rev: v2.4.1 hooks: diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index bab41c915c32..56cdd6861baa 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference +This example shows how to use vLLM for running offline inference with the correct prompt format on audio language models. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ + import os from dataclasses import asdict from typing import NamedTuple, Optional @@ -22,7 +23,7 @@ question_per_audio_count = { 0: "What is 1+1?", 1: "What is recited in the audio?", - 2: "What sport and what nursery rhyme are referenced?" 
+ 2: "What sport and what nursery rhyme are referenced?", } @@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: # MiniCPM-O def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model_name = "openbmb/MiniCPM-o-2_6" - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) engine_args = EngineArgs( model=model_name, trust_remote_code=True, @@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: limit_mm_per_prompt={"audio": audio_count}, ) - stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_tokens = ["<|im_end|>", "<|endoftext|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] audio_placeholder = "(<audio>./</audio>)" * audio_count audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}" # noqa: E501 - messages = [{ - 'role': 'user', - 'content': f'{audio_placeholder}\n{question}' - }] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True, - chat_template=audio_chat_template) + messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}] + prompt = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + chat_template=audio_chat_template, + ) return ModelRequestData( engine_args=engine_args, @@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: # Since the vision-lora and speech-lora co-exist with the base model, # we have to manually specify the path of the lora weights. speech_lora_path = os.path.join(model_path, "speech-lora") - placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) + placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)]) prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" @@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: limit_mm_per_prompt={"audio": audio_count}, ) - audio_in_prompt = "".join([ - f"Audio {idx+1}: " - f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) - ]) + audio_in_prompt = "".join( + [ + f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" + for idx in range(audio_count) + ] + ) - prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - "<|im_start|>user\n" - f"{audio_in_prompt}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) return ModelRequestData( engine_args=engine_args, @@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int): limit_mm_per_prompt={"audio": audio_count}, ) - audio_in_prompt = "".join([ - "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) - ]) + audio_in_prompt = "".join( + ["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)] + ) default_system = ( "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " "Group, capable of perceiving auditory and visual inputs, as well as " - "generating text and speech.") + "generating text and speech." 
+ ) - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n" - f"{audio_in_prompt}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) return ModelRequestData( engine_args=engine_args, prompt=prompt, @@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData: model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [{ - 'role': 'user', - 'content': "<|audio|>\n" * audio_count + question - }] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}] + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) engine_args = EngineArgs( model=model_name, @@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData: # Whisper def run_whisper(question: str, audio_count: int) -> ModelRequestData: - assert audio_count == 1, ( - "Whisper only support single audio input per prompt") + assert audio_count == 1, "Whisper only support single audio input per prompt" model_name = "openai/whisper-large-v3-turbo" prompt = "<|startoftranscript|>" @@ -252,27 +254,33 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData: def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'audio language models') - parser.add_argument('--model-type', - '-m', - type=str, - default="ultravox", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument("--num-audios", - type=int, - default=1, - choices=[0, 1, 2], - help="Number of audio items per prompt.") - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "audio language models" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="ultravox", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--num-prompts", type=int, default=1, help="Number of prompts to run." 
+ ) + parser.add_argument( + "--num-audios", + type=int, + default=1, + choices=[0, 1, 2], + help="Number of audio items per prompt.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() @@ -283,29 +291,30 @@ def main(args): raise ValueError(f"Model type {model} is not supported.") audio_count = args.num_audios - req_data = model_example_map[model](question_per_audio_count[audio_count], - audio_count) + req_data = model_example_map[model]( + question_per_audio_count[audio_count], audio_count + ) # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. - sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids + ) mm_data = {} if audio_count > 0: mm_data = { "audio": [ - asset.audio_and_sample_rate - for asset in audio_assets[:audio_count] + asset.audio_and_sample_rate for asset in audio_assets[:audio_count] ] } @@ -315,8 +324,9 @@ def main(args): # Batch inference inputs = [inputs] * args.num_prompts # Add LoRA request if applicable - lora_request = (req_data.lora_requests * - args.num_prompts if req_data.lora_requests else None) + lora_request = ( + req_data.lora_requests * args.num_prompts if req_data.lora_requests else None + ) outputs = llm.generate( inputs, diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/offline_inference/automatic_prefix_caching.py index 6d05d0b99d80..0d8c73304237 100644 --- a/examples/offline_inference/automatic_prefix_caching.py +++ b/examples/offline_inference/automatic_prefix_caching.py @@ -16,13 +16,16 @@ Run: python examples/offline_inference/automatic_prefix_caching.py """ + import time from vllm import LLM, SamplingParams # ruff: noqa: E501 # A prompt containing a large markdown table. The table is randomly generated by GPT-4. -LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ +LONG_PROMPT = ( + "You are a helpful assistant in recognizes the content of tables in markdown format. 
Here is a table as follows.\n# Table\n" + + """ | ID | Name | Age | Occupation | Country | Email | Phone Number | Address | |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | @@ -56,6 +59,7 @@ | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | """ +) def get_generation_time(llm, sampling_params, prompts): @@ -72,7 +76,7 @@ def get_generation_time(llm, sampling_params, prompts): def main(): # set enable_prefix_caching=True to enable APC - llm = LLM(model='lmsys/longchat-13b-16k', enable_prefix_caching=True) + llm = LLM(model="lmsys/longchat-13b-16k", enable_prefix_caching=True) sampling_params = SamplingParams(temperature=0, max_tokens=100) @@ -80,8 +84,8 @@ def main(): get_generation_time( llm, sampling_params, - LONG_PROMPT + - "Question: what is the age of John Doe? Your answer: The age of John Doe is ", + LONG_PROMPT + + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", ) # Querying the age of Zack Blue @@ -89,8 +93,8 @@ def main(): get_generation_time( llm, sampling_params, - LONG_PROMPT + - "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", + LONG_PROMPT + + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", ) diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index 8e6f78ed7de2..b0bb5aa71b8a 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -56,22 +56,12 @@ def print_outputs(outputs): # In this script, we demonstrate how to pass input to the chat method: conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! 
How can I assist you today?"}, { "role": "user", - "content": - "Write an essay about the importance of higher education.", + "content": "Write an essay about the importance of higher education.", }, ] outputs = llm.chat(conversation, sampling_params, use_tqdm=False) diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 5b6dcb41eee1..40ccb1294e42 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach", - task="classify", - enforce_eager=True) + parser.set_defaults( + model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True + ) return parser.parse_args() @@ -36,10 +36,11 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): probs = output.outputs.probs - probs_trimmed = ((str(probs[:16])[:-1] + - ", ...]") if len(probs) > 16 else probs) - print(f"Prompt: {prompt!r} \n" - f"Class Probabilities: {probs_trimmed} (size={len(probs)})") + probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs + print( + f"Prompt: {prompt!r} \n" + f"Class Probabilities: {probs_trimmed} (size={len(probs)})" + ) print("-" * 60) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index cb5f923ffb69..38a73ccca251 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="intfloat/e5-mistral-7b-instruct", - task="embed", - enforce_eager=True) + parser.set_defaults( + model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True + ) return parser.parse_args() @@ -36,10 +36,10 @@ def main(args: Namespace): print("\nGenerated Outputs:\n" + "-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} \n" - f"Embeddings: {embeds_trimmed} (size={len(embeds)})") + embeds_trimmed = ( + (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds + ) + print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})") print("-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index d2bda8b3180c..3da73c6c407d 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="BAAI/bge-reranker-v2-m3", - task="score", - enforce_eager=True) + parser.set_defaults( + model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True + ) return parser.parse_args() diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py index 6548857b6d11..c1edfb52ff70 100644 --- a/examples/offline_inference/batch_llm_inference.py +++ b/examples/offline_inference/batch_llm_inference.py @@ -17,12 +17,14 @@ Learn more about Ray Data's LLM integration: 
https://docs.ray.io/en/latest/data/working-with-llms.html """ + import ray from packaging.version import Version from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig -assert Version(ray.__version__) >= Version( - "2.44.1"), "Ray version must be at least 2.44.1" +assert Version(ray.__version__) >= Version("2.44.1"), ( + "Ray version must be at least 2.44.1" +) # Uncomment to reduce clutter in stdout # ray.init(log_to_driver=False) @@ -53,20 +55,18 @@ vllm_processor = build_llm_processor( config, preprocess=lambda row: dict( - messages=[{ - "role": "system", - "content": "You are a bot that responds with haikus." - }, { - "role": "user", - "content": row["text"] - }], + messages=[ + {"role": "system", "content": "You are a bot that responds with haikus."}, + {"role": "user", "content": row["text"]}, + ], sampling_params=dict( temperature=0.3, max_tokens=250, - )), + ), + ), postprocess=lambda row: dict( answer=row["generated_text"], - **row # This will return all the original columns in the dataset. + **row, # This will return all the original columns in the dataset. ), ) diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index b532bf42adfb..61230d895584 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -50,87 +50,93 @@ # or any other mistral model with function calling ability sampling_params = SamplingParams(max_tokens=8192, temperature=0.0) -llm = LLM(model=model_name, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") +llm = LLM( + model=model_name, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral", +) def generate_random_id(length=9): characters = string.ascii_letters + string.digits - random_id = ''.join(random.choice(characters) for _ in range(length)) + random_id = "".join(random.choice(characters) for _ in range(length)) return random_id # simulate an API that can be called -def get_current_weather(city: str, state: str, unit: 'str'): - return (f"The weather in {city}, {state} is 85 degrees {unit}. It is " - "partly cloudly, with highs in the 90's.") +def get_current_weather(city: str, state: str, unit: "str"): + return ( + f"The weather in {city}, {state} is 85 degrees {unit}. It is " + "partly cloudly, with highs in the 90's." + ) tool_functions = {"get_current_weather": get_current_weather} -tools = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 
'CA' which would mean 'California'" - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, } -}] +] -messages = [{ - "role": - "user", - "content": - "Can you tell me what the temperate will be in Dallas, in fahrenheit?" -}] +messages = [ + { + "role": "user", + "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?", + } +] outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools) output = outputs[0].outputs[0].text.strip() # append the assistant message -messages.append({ - "role": "assistant", - "content": output, -}) +messages.append( + { + "role": "assistant", + "content": output, + } +) # let's now actually parse and execute the model's output simulating an API call by using the # above defined function tool_calls = json.loads(output) tool_answers = [ - tool_functions[call['name']](**call['arguments']) for call in tool_calls + tool_functions[call["name"]](**call["arguments"]) for call in tool_calls ] # append the answer as a tool message and let the LLM give you an answer -messages.append({ - "role": "tool", - "content": "\n\n".join(tool_answers), - "tool_call_id": generate_random_id(), -}) +messages.append( + { + "role": "tool", + "content": "\n\n".join(tool_answers), + "tool_call_id": generate_random_id(), + } +) outputs = llm.chat(messages, sampling_params, tools=tools) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index f636a08c0b09..bf60d883c410 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -27,6 +27,7 @@ --master-addr=10.99.48.128 \ --master-port=13345 """ + import os from time import sleep @@ -36,46 +37,46 @@ def parse_args(): import argparse + parser = argparse.ArgumentParser(description="Data Parallel Inference") - parser.add_argument("--model", - type=str, - default="ibm-research/PowerMoE-3b", - help="Model name or path") - parser.add_argument("--dp-size", - type=int, - default=2, - help="Data parallel size") - parser.add_argument("--tp-size", - type=int, - default=2, - help="Tensor parallel size") - parser.add_argument("--node-size", - type=int, - default=1, - help="Total number of nodes") - parser.add_argument("--node-rank", - type=int, - default=0, - help="Rank of the current node") - parser.add_argument("--master-addr", - type=str, - default="", - help="Master node IP address") - parser.add_argument("--master-port", - type=int, - default=0, - help="Master node port") - parser.add_argument("--enforce-eager", - action='store_true', - help="Enforce eager mode execution.") - parser.add_argument("--trust-remote-code", - action='store_true', - help="Trust remote code.") + parser.add_argument( + "--model", + type=str, + default="ibm-research/PowerMoE-3b", + help="Model name or path", + ) + parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size") + parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size") + parser.add_argument( + "--node-size", type=int, default=1, help="Total number of nodes" + ) + parser.add_argument( + "--node-rank", type=int, default=0, help="Rank of the current node" + ) + parser.add_argument( + "--master-addr", type=str, default="", help="Master node IP address" + ) + parser.add_argument("--master-port", type=int, default=0, help="Master node port") + parser.add_argument( + 
"--enforce-eager", action="store_true", help="Enforce eager mode execution." + ) + parser.add_argument( + "--trust-remote-code", action="store_true", help="Trust remote code." + ) return parser.parse_args() -def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, - dp_master_port, GPUs_per_dp_rank, enforce_eager, trust_remote_code): +def main( + model, + dp_size, + local_dp_rank, + global_dp_rank, + dp_master_ip, + dp_master_port, + GPUs_per_dp_rank, + enforce_eager, + trust_remote_code, +): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) @@ -110,9 +111,9 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, # since we are doing data parallel, every rank can have different # sampling params. here we set different max_tokens for different # ranks for demonstration. - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=[16, 20][global_dp_rank % 2]) + sampling_params = SamplingParams( + temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2] + ) # Create an LLM. llm = LLM( @@ -130,15 +131,16 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, break prompt = output.prompt generated_text = output.outputs[0].text - print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") + print( + f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}" + ) # Give engines time to pause their processing loops before exiting. sleep(1) if __name__ == "__main__": - args = parse_args() dp_size = args.dp_size @@ -160,20 +162,29 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, procs = [] for local_dp_rank, global_dp_rank in enumerate( - range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): - proc = Process(target=main, - args=(args.model, dp_size, local_dp_rank, - global_dp_rank, dp_master_ip, dp_master_port, - tp_size, args.enforce_eager, - args.trust_remote_code)) + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node) + ): + proc = Process( + target=main, + args=( + args.model, + dp_size, + local_dp_rank, + global_dp_rank, + dp_master_ip, + dp_master_port, + tp_size, + args.enforce_eager, + args.trust_remote_code, + ), + ) proc.start() procs.append(proc) exit_code = 0 for proc in procs: proc.join(timeout=300) if proc.exitcode is None: - print(f"Killing process {proc.pid} that " - f"didn't stop within 5 minutes.") + print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill() exit_code = 1 elif proc.exitcode: diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py index 531c96f176a3..4ae5d3310e0b 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -22,17 +22,18 @@ def main(): prompts = read_prompts() sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - max_num_batched_tokens=64, - max_num_seqs=16, - kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", - kv_role="kv_both", - kv_connector_extra_config={ - "shared_storage_path": "local_storage" - })) #, max_model_len=2048, max_num_batched_tokens=2048) + llm = LLM( + 
model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + max_num_batched_tokens=64, + max_num_seqs=16, + kv_transfer_config=KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ), + ) # , max_model_len=2048, max_num_batched_tokens=2048) # 1ST generation (prefill instance) outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py index 24b7b1d8fdbe..5757a8a84b86 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -20,15 +20,16 @@ def main(): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", - enforce_eager=True, - gpu_memory_utilization=0.8, - kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", - kv_role="kv_both", - kv_connector_extra_config={ - "shared_storage_path": "local_storage" - })) #, max_model_len=2048, max_num_batched_tokens=2048) + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + gpu_memory_utilization=0.8, + kv_transfer_config=KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ), + ) # , max_model_len=2048, max_num_batched_tokens=2048) # 1ST generation (prefill instance) outputs = llm.generate( diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py index bb6fdd48f79e..3ccab0dcd6d3 100644 --- a/examples/offline_inference/disaggregated_prefill.py +++ b/examples/offline_inference/disaggregated_prefill.py @@ -4,6 +4,7 @@ We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), and then transfer the KV cache between them. """ + import os import time from multiprocessing import Event, Process @@ -32,17 +33,21 @@ def run_prefill(prefill_done): # This instance is the prefill node (kv_producer, rank 0). # The number of parallel instances for KV cache transfer is set to 2, # as required for PyNcclConnector. - ktc = KVTransferConfig(kv_connector="PyNcclConnector", - kv_role="kv_producer", - kv_rank=0, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="PyNcclConnector", + kv_role="kv_producer", + kv_rank=0, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB # memory. You may need to adjust the value to fit your GPU. - llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", - kv_transfer_config=ktc, - max_model_len=2000, - gpu_memory_utilization=0.8) + llm = LLM( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8, + ) llm.generate(prompts, sampling_params) print("Prefill node is finished.") @@ -72,17 +77,21 @@ def run_decode(prefill_done): # This instance is the decode node (kv_consumer, rank 1). # The number of parallel instances for KV cache transfer is set to 2, # as required for PyNcclConnector. 
- ktc = KVTransferConfig(kv_connector="PyNcclConnector", - kv_role="kv_consumer", - kv_rank=1, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="PyNcclConnector", + kv_role="kv_consumer", + kv_rank=1, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB # memory. You may need to adjust the value to fit your GPU. - llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", - kv_transfer_config=ktc, - max_model_len=2000, - gpu_memory_utilization=0.8) + llm = LLM( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8, + ) # Wait for the producer to start the pipe print("Waiting for prefill node to finish...") @@ -99,8 +108,8 @@ def run_decode(prefill_done): def main(): prefill_done = Event() - prefill_process = Process(target=run_prefill, args=(prefill_done, )) - decode_process = Process(target=run_decode, args=(prefill_done, )) + prefill_process = Process(target=run_prefill, args=(prefill_done,)) + decode_process = Process(target=run_decode, args=(prefill_done,)) # Start prefill node prefill_process.start() diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index 615f67e9f8d8..3dd9e5464641 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -20,9 +20,7 @@ def load_prompts(dataset_path, num_prompts): print(f"Error reading dataset: {e}") return [] else: - prompts = [ - "The future of AI is", "The president of the United States is" - ] + prompts = ["The future of AI is", "The president of the United States is"] return prompts[:num_prompts] @@ -33,34 +31,32 @@ def parse_args(): "--dataset", type=str, default="./examples/data/gsm8k.jsonl", - help="downloaded from the eagle repo " \ - "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/" + help="downloaded from the eagle repo " + "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/", + ) + parser.add_argument( + "--method", type=str, default="eagle", choices=["eagle", "eagle3"] ) - parser.add_argument("--method", - type=str, - default='eagle', - choices=['eagle', 'eagle3']) parser.add_argument("--max_num_seqs", type=int, default=8) parser.add_argument("--num_prompts", type=int, default=80) parser.add_argument("--num_spec_tokens", type=int, default=2) parser.add_argument("--tp", type=int, default=1) parser.add_argument("--draft_tp", type=int, default=1) - parser.add_argument("--enforce_eager", action='store_true') - parser.add_argument("--enable_chunked_prefill", action='store_true') + parser.add_argument("--enforce_eager", action="store_true") + parser.add_argument("--enable_chunked_prefill", action="store_true") parser.add_argument("--max_num_batched_tokens", type=int, default=2048) parser.add_argument("--temp", type=float, default=0) return parser.parse_args() def main(): - args = parse_args() model_dir = "meta-llama/Llama-3.1-8B-Instruct" - if args.method == 'eagle': + if args.method == "eagle": eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" - elif args.method == 'eagle3': + elif args.method == "eagle3": eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" else: raise ValueError(f"unknown method: {args.method}") @@ -72,11 +68,9 @@ def main(): prompts = load_prompts(args.dataset, args.num_prompts) prompt_ids = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - add_generation_prompt=True) + tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], add_generation_prompt=True + ) for prompt in 
prompts ] @@ -102,8 +96,7 @@ def main(): sampling_params = SamplingParams(temperature=args.temp, max_tokens=256) - outputs = llm.generate(prompt_token_ids=prompt_ids, - sampling_params=sampling_params) + outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params) # print the generated text for output in outputs: @@ -120,19 +113,22 @@ def main(): # accepted acceptance_counts = [0] * (args.num_spec_tokens + 1) for output in outputs: - for step, count in enumerate( - output.metrics.spec_token_acceptance_counts): + for step, count in enumerate(output.metrics.spec_token_acceptance_counts): acceptance_counts[step] += count print("-" * 50) - print(f"mean acceptance length (including bonus tokens): \ - {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}") + print( + f"mean acceptance length (including bonus tokens): \ + {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}" + ) print("-" * 50) # print acceptance at each token position for i in range(len(acceptance_counts)): - print(f"acceptance at token {i}:" - f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}") + print( + f"acceptance at token {i}:" + f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}" + ) if __name__ == "__main__": diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index b347ddbf3197..23f60c431fc2 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) + parser.set_defaults( + model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True + ) return parser.parse_args() @@ -41,11 +41,14 @@ def main(args: Namespace): print("-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} \n" - f"Embeddings for text matching: {embeds_trimmed} " - f"(size={len(embeds)})") + embeds_trimmed = ( + (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds + ) + print( + f"Prompt: {prompt!r} \n" + f"Embeddings for text matching: {embeds_trimmed} " + f"(size={len(embeds)})" + ) print("-" * 60) diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 7a6cb02556d9..59c0592ae9e2 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -10,9 +10,9 @@ def parse_args(): parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) # Set example specific arguments - parser.set_defaults(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) + parser.set_defaults( + model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True + ) return parser.parse_args() @@ -39,11 +39,10 @@ def main(args: Namespace): print("-" * 60) for prompt, output in zip(prompts, outputs): embeds = output.outputs.embedding - embeds_trimmed = ((str(embeds[:16])[:-1] + - ", ...]") if len(embeds) > 16 else embeds) - print(f"Prompt: {prompt!r} \n" - f"Embeddings: {embeds_trimmed} " - f"(size={len(embeds)})") + embeds_trimmed = ( + (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds + ) + print(f"Prompt: 
{prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})") print("-" * 60) diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index c4916e00f473..83dd1f667eb5 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -1,12 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 -''' +""" Demonstrate prompting of text-to-text encoder/decoder models, specifically BART -''' +""" from vllm import LLM, SamplingParams -from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - TokensPrompt, zip_enc_dec_prompts) +from vllm.inputs import ( + ExplicitEncoderDecoderPrompt, + TextPrompt, + TokensPrompt, + zip_enc_dec_prompts, +) def create_prompts(tokenizer): @@ -18,8 +22,9 @@ def create_prompts(tokenizer): # - Helpers for building prompts text_prompt_raw = "Hello, my name is" text_prompt = TextPrompt(prompt="The president of the United States is") - tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode( - prompt="The capital of France is")) + tokens_prompt = TokensPrompt( + prompt_token_ids=tokenizer.encode(prompt="The capital of France is") + ) # - Pass a single prompt to encoder/decoder model # (implicitly encoder input prompt); # decoder input prompt is assumed to be None @@ -57,14 +62,19 @@ def create_prompts(tokenizer): # decoder prompts together into a list of ExplicitEncoderDecoderPrompt # instances zipped_prompt_list = zip_enc_dec_prompts( - ['An encoder prompt', 'Another encoder prompt'], - ['A decoder prompt', 'Another decoder prompt']) + ["An encoder prompt", "Another encoder prompt"], + ["A decoder prompt", "Another decoder prompt"], + ) # - Let's put all of the above example prompts together into one list # which we will pass to the encoder/decoder LLM. return [ - single_text_prompt_raw, single_text_prompt, single_tokens_prompt, - enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3 + single_text_prompt_raw, + single_text_prompt, + single_tokens_prompt, + enc_dec_prompt1, + enc_dec_prompt2, + enc_dec_prompt3, ] + zipped_prompt_list @@ -85,10 +95,12 @@ def print_outputs(outputs): prompt = output.prompt encoder_prompt = output.encoder_prompt generated_text = output.outputs[0].text - print(f"Output {i+1}:") - print(f"Encoder prompt: {encoder_prompt!r}\n" - f"Decoder prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}") + print(f"Output {i + 1}:") + print( + f"Encoder prompt: {encoder_prompt!r}\n" + f"Decoder prompt: {prompt!r}\n" + f"Generated text: {generated_text!r}" + ) print("-" * 50) diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index 2883c37ca236..ae3737e37594 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -3,6 +3,7 @@ This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. 
""" + import time from collections.abc import Sequence from dataclasses import asdict @@ -30,18 +31,14 @@ def run_florence2(): ) prompts = [ - { # implicit prompt with task token + { # implicit prompt with task token "prompt": "<DETAILED_CAPTION>", - "multi_modal_data": { - "image": ImageAsset("stop_sign").pil_image - }, + "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image}, }, - { # explicit encoder/decoder prompt + { # explicit encoder/decoder prompt "encoder_prompt": { "prompt": "Describe in detail what is shown in the image.", - "multi_modal_data": { - "image": ImageAsset("cherry_blossom").pil_image - }, + "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image}, }, "decoder_prompt": "", }, @@ -63,20 +60,20 @@ def run_mllama(): ) prompts = [ - { # Implicit prompt - "prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501 + { # Implicit prompt + "prompt": "<|image|><|begin_of_text|>What is the content of this image?", # noqa: E501 "multi_modal_data": { "image": ImageAsset("stop_sign").pil_image, }, }, - { # Explicit prompt + { # Explicit prompt "encoder_prompt": { "prompt": "<|image|>", "multi_modal_data": { "image": ImageAsset("stop_sign").pil_image, }, }, - "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 + "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 }, ] @@ -96,13 +93,13 @@ def run_whisper(): ) prompts = [ - { # Test implicit prompt + { # Test implicit prompt "prompt": "<|startoftranscript|>", "multi_modal_data": { "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, }, }, - { # Test explicit encoder/decoder prompt + { # Test explicit encoder/decoder prompt "encoder_prompt": { "prompt": "", "multi_modal_data": { @@ -110,7 +107,7 @@ def run_whisper(): }, }, "decoder_prompt": "<|startoftranscript|>", - } + }, ] return ModelRequestData( @@ -128,18 +125,23 @@ def run_whisper(): def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for text generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="mllama", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models for text generation" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="mllama", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() @@ -153,7 +155,8 @@ def main(args): # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) @@ -179,8 +182,7 @@ def main(args): for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Decoder prompt: {prompt!r}, " - f"Generated text: {generated_text!r}") + print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}") duration = time.time() - start diff --git 
a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index d84cd9ee9f52..5d5e55a83d22 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -3,6 +3,7 @@ This file demonstrates using the `LLMEngine` for processing prompts with various sampling parameters. """ + import argparse from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -12,24 +13,26 @@ def create_test_prompts() -> list[tuple[str, SamplingParams]]: """Create a list of test prompts with their sampling parameters.""" return [ - ("A robot may not injure a human being", - SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), - ("To be or not to be,", - SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), - ("What is the meaning of life?", - SamplingParams(n=2, - temperature=0.8, - top_p=0.95, - frequency_penalty=0.1)), + ( + "A robot may not injure a human being", + SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1), + ), + ( + "To be or not to be,", + SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2), + ), + ( + "What is the meaning of life?", + SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1), + ), ] -def process_requests(engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams]]): +def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 - print('-' * 50) + print("-" * 50) while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params = test_prompts.pop(0) @@ -41,7 +44,7 @@ def process_requests(engine: LLMEngine, for request_output in request_outputs: if request_output.finished: print(request_output) - print('-' * 50) + print("-" * 50) def initialize_engine(args: argparse.Namespace) -> LLMEngine: @@ -52,7 +55,8 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using the LLMEngine class directly') + description="Demo on using the LLMEngine class directly" + ) parser = EngineArgs.add_cli_args(parser) return parser.parse_args() @@ -64,6 +68,6 @@ def main(args: argparse.Namespace): process_requests(engine, test_prompts) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py index 7e90d5d25e29..5bb2327a3f83 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/offline_inference/load_sharded_state.py @@ -36,22 +36,21 @@ def parse_args(): parser.set_defaults(load_format="sharded_state") # Add validation arguments - parser.add_argument("--prompt", - type=str, - default="Hello, world!", - help="Prompt for validation") - parser.add_argument("--max-tokens", - type=int, - default=100, - help="Maximum number of tokens to generate") - parser.add_argument("--temperature", - type=float, - default=0.7, - help="Sampling temperature") - parser.add_argument("--top-p", - type=float, - default=1.0, - help="Top-p sampling parameter") + parser.add_argument( + "--prompt", type=str, default="Hello, world!", help="Prompt for validation" + ) + parser.add_argument( + "--max-tokens", + type=int, + default=100, + help="Maximum number of tokens to generate", + ) + parser.add_argument( + "--temperature", type=float, default=0.7, help="Sampling 
temperature" + ) + parser.add_argument( + "--top-p", type=float, default=1.0, help="Top-p sampling parameter" + ) return parser.parse_args() @@ -60,8 +59,9 @@ def main(): args = parse_args() engine_args = EngineArgs.from_cli_args(args) - print(f"Loading model from {engine_args.model} " - f"using format {engine_args.load_format}") + print( + f"Loading model from {engine_args.model} using format {engine_args.load_format}" + ) print(f"Tensor parallel size: {engine_args.tensor_parallel_size}") # Load the model using engine args @@ -90,4 +90,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index b6608ec6e958..33c660015ba7 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -17,50 +17,55 @@ def create_test_prompts( - lora_path: str + lora_path: str, ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: return [ # this is an example of using quantization without LoRA - ("My name is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), None), + ( + "My name is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + None, + ), # the next three examples use quantization with LoRA - ("my name is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), - LoRARequest("lora-test-1", 1, lora_path)), - ("The capital of USA is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), - LoRARequest("lora-test-2", 1, lora_path)), - ("The capital of France is", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), - LoRARequest("lora-test-3", 1, lora_path)), + ( + "my name is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-1", 1, lora_path), + ), + ( + "The capital of USA is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-2", 1, lora_path), + ), + ( + "The capital of France is", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + LoRARequest("lora-test-3", 1, lora_path), + ), ] -def process_requests(engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, - Optional[LoRARequest]]]): +def process_requests( + engine: LLMEngine, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], +): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params, lora_request = test_prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - lora_request=lora_request) + engine.add_request( + str(request_id), prompt, sampling_params, lora_request=lora_request + ) request_id += 1 request_outputs: list[RequestOutput] = engine.step() @@ -71,15 +76,18 @@ def process_requests(engine: LLMEngine, print(f"Output: {request_output.outputs[0].text}") -def initialize_engine(model: str, quantization: str, - lora_repo: Optional[str]) -> LLMEngine: +def initialize_engine( + model: str, quantization: str, lora_repo: Optional[str] +) -> LLMEngine: """Initialize the LLMEngine.""" - engine_args = EngineArgs(model=model, - quantization=quantization, - 
enable_lora=True, - max_lora_rank=64, - max_loras=4) + engine_args = EngineArgs( + model=model, + quantization=quantization, + enable_lora=True, + max_lora_rank=64, + max_loras=4, + ) return LLMEngine.from_engine_args(engine_args) @@ -90,32 +98,30 @@ def main(): # QLoRA (https://arxiv.org/abs/2305.14314) { "name": "qlora_inference_example", - 'model': "huggyllama/llama-7b", - 'quantization': "bitsandbytes", - 'lora_repo': 'timdettmers/qlora-flan-7b' + "model": "huggyllama/llama-7b", + "quantization": "bitsandbytes", + "lora_repo": "timdettmers/qlora-flan-7b", }, { "name": "AWQ_inference_with_lora_example", - 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ', - 'quantization': "awq", - 'lora_repo': 'jashing/tinyllama-colorist-lora' + "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "quantization": "awq", + "lora_repo": "jashing/tinyllama-colorist-lora", }, { "name": "GPTQ_inference_with_lora_example", - 'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ', - 'quantization': "gptq", - 'lora_repo': 'jashing/tinyllama-colorist-lora' - } + "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + "quantization": "gptq", + "lora_repo": "jashing/tinyllama-colorist-lora", + }, ] for test_config in test_configs: - print( - f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~" + print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~") + engine = initialize_engine( + test_config["model"], test_config["quantization"], test_config["lora_repo"] ) - engine = initialize_engine(test_config['model'], - test_config['quantization'], - test_config['lora_repo']) - lora_path = snapshot_download(repo_id=test_config['lora_repo']) + lora_path = snapshot_download(repo_id=test_config["lora_repo"]) test_prompts = create_test_prompts(lora_path) process_requests(engine, test_prompts) @@ -125,5 +131,5 @@ def main(): torch.cuda.empty_cache() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 37c3181dc5fa..98fef2648f6b 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -74,19 +74,10 @@ def run_simple_demo(args: argparse.Namespace): messages = [ { - "role": - "user", + "role": "user", "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": image_url}}, ], }, ] @@ -121,25 +112,11 @@ def run_advanced_demo(args: argparse.Namespace): messages = [ { - "role": - "user", + "role": "user", "content": [ - { - "type": "text", - "text": prompt - }, - { - "type": "image_url", - "image_url": { - "url": url_1 - } - }, - { - "type": "image_url", - "image_url": { - "url": url_2 - } - }, + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": url_1}}, + {"type": "image_url", "image_url": {"url": url_2}}, ], }, { @@ -153,12 +130,7 @@ def run_advanced_demo(args: argparse.Namespace): { "role": "user", "content": [ - { - "type": "image_url", - "image_url": { - "url": url_3 - } - }, + {"type": "image_url", "image_url": {"url": url_3}}, ], }, ] @@ -171,7 +143,8 @@ def run_advanced_demo(args: argparse.Namespace): def parse_args(): parser = argparse.ArgumentParser( - description="Run a demo in simple or advanced mode.") + description="Run a demo in simple or advanced mode." 
+ ) parser.add_argument( "mode", @@ -179,15 +152,18 @@ def parse_args(): help="Specify the demo mode: 'simple' or 'advanced'", ) - parser.add_argument('--format', - choices=["mistral", "hf"], - default="mistral", - help='Specify the format of the model to load.') + parser.add_argument( + "--format", + choices=["mistral", "hf"], + default="mistral", + help="Specify the format of the model to load.", + ) parser.add_argument( - '--disable-mm-preprocessor-cache', - action='store_true', - help='If True, disables caching of multi-modal preprocessor/mapper.') + "--disable-mm-preprocessor-cache", + action="store_true", + help="If True, disables caching of multi-modal preprocessor/mapper.", + ) return parser.parse_args() diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 53c58a76d9dc..b750397f45b8 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -13,8 +13,9 @@ from vllm import LLM, SamplingParams -def time_generation(llm: LLM, prompts: list[str], - sampling_params: SamplingParams, title: str): +def time_generation( + llm: LLM, prompts: list[str], sampling_params: SamplingParams, title: str +): # Generate texts from the prompts. The output is a list of RequestOutput # objects that contain the prompt, generated text, and other information. # Warmup first @@ -25,8 +26,7 @@ def time_generation(llm: LLM, prompts: list[str], end = time.time() print("-" * 50) print(title) - print("time: ", - (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs)) + print("time: ", (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs)) # Print the outputs. for output in outputs: generated_text = output.outputs[0].text @@ -38,7 +38,8 @@ def main(): template = ( "Below is an instruction that describes a task. Write a response " "that appropriately completes the request.\n\n### Instruction:\n{}" - "\n\n### Response:\n") + "\n\n### Response:\n" + ) # Sample prompts. prompts = [ diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index de409740292a..1fa2f16f82a8 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -15,7 +15,7 @@ def create_test_prompts( - lora_path: str + lora_path: str, ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: """Create a list of test prompts with their sampling parameters. @@ -26,38 +26,49 @@ def create_test_prompts( first adapter have finished. 
""" return [ - ("A robot may not injure a human being", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), None), - ("To be or not to be,", - SamplingParams(temperature=0.8, - top_k=5, - presence_penalty=0.2, - max_tokens=128), None), + ( + "A robot may not injure a human being", + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128 + ), + None, + ), + ( + "To be or not to be,", + SamplingParams( + temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128 + ), + None, + ), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), + SamplingParams( + temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003], + ), + LoRARequest("sql-lora", 1, lora_path), + ), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora2", 2, lora_path)), + SamplingParams( + temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003], + ), + LoRARequest("sql-lora2", 2, lora_path), + ), ] -def process_requests(engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, - Optional[LoRARequest]]]): +def process_requests( + engine: LLMEngine, + test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], +): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -65,10 +76,9 @@ def process_requests(engine: LLMEngine, while test_prompts or engine.has_unfinished_requests(): if test_prompts: prompt, sampling_params, lora_request = test_prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - lora_request=lora_request) + engine.add_request( + str(request_id), prompt, sampling_params, lora_request=lora_request + ) request_id += 1 request_outputs: list[RequestOutput] = engine.step() @@ -88,12 +98,14 @@ def initialize_engine() -> LLMEngine: # numbers will cause higher memory usage. If you know that all LoRAs will # use the same rank, it is recommended to set this as low as possible. # max_cpu_loras: controls the size of the CPU LoRA cache. 
- engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", - enable_lora=True, - max_loras=1, - max_lora_rank=8, - max_cpu_loras=2, - max_num_seqs=256) + engine_args = EngineArgs( + model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_loras=1, + max_lora_rank=8, + max_cpu_loras=2, + max_num_seqs=256, + ) return LLMEngine.from_engine_args(engine_args) @@ -105,5 +117,5 @@ def main(): process_requests(engine, test_prompts) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index 5906c7b2c6b3..f2d7698f22d7 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -30,7 +30,8 @@ def main(): # The device argument can be either unspecified for automated detection, # or explicitly assigned. device="neuron", - tensor_parallel_size=2) + tensor_parallel_size=2, + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py index 4f63f1a2fb3c..a51caa2aec8b 100644 --- a/examples/offline_inference/neuron_eagle.py +++ b/examples/offline_inference/neuron_eagle.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to run offline inference with an EAGLE speculative +This example shows how to run offline inference with an EAGLE speculative decoding model on neuron. To use EAGLE speculative decoding, you must use a draft model that is specifically fine-tuned for EAGLE speculation. Additionally, to use EAGLE with NxD Inference, the draft model must include @@ -24,7 +24,7 @@ speculative_config={ "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft", "num_speculative_tokens": 5, - "max_model_len": 2048 + "max_model_len": 2048, }, max_num_seqs=4, # The max_model_len and block_size arguments are required to be same as @@ -40,7 +40,7 @@ tensor_parallel_size=32, override_neuron_config={ "enable_eagle_speculation": True, - "enable_fused_speculation": True + "enable_fused_speculation": True, }, ) diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index af21274a3a5b..ec38525b9daf 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -5,12 +5,12 @@ from vllm import LLM, SamplingParams # creates XLA hlo graphs for all the context length buckets. -os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" +os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" # creates XLA hlo graphs for all the token gen buckets. -os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" +os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" # Quantizes neuron model weight to int8 , # The default config for quantization is int8 dtype. -os.environ['NEURON_QUANT_DTYPE'] = "s8" +os.environ["NEURON_QUANT_DTYPE"] = "s8" # Sample prompts. prompts = [ @@ -44,7 +44,8 @@ def main(): override_neuron_config={ "cast_logits_dtype": "bfloat16", }, - tensor_parallel_size=2) + tensor_parallel_size=2, + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index bef434bae5ba..ecacbab771c2 100644 --- a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to run offline inference with a speculative +This example shows how to run offline inference with a speculative decoding model on neuron. """ @@ -19,9 +19,9 @@ def config_buckets(): """Configure context length and token gen buckets.""" # creates XLA hlo graphs for all the context length buckets. - os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" + os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048" # creates XLA hlo graphs for all the token gen buckets. - os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" + os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" def initialize_model(): @@ -31,7 +31,7 @@ def initialize_model(): speculative_config={ "model": "openlm-research/open_llama_3b", "num_speculative_tokens": 4, - "max_model_len": 2048 + "max_model_len": 2048, }, max_num_seqs=4, max_model_len=2048, @@ -60,5 +60,5 @@ def main(): process_requests(model, sampling_params) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index f0bec387d3a9..d3dad24956a6 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -16,7 +16,8 @@ "teaching role. They have 5 years of previous teaching experience " "as an assistant teacher at a co-ed, public school with experience " "in middle school math teaching. Based on these information, fulfill " - "the following paragraph: ") + "the following paragraph: " +) # Sample prompts. prompts = [ @@ -58,9 +59,11 @@ def main(): cleanup_dist_env_and_memory() # Create an LLM with prefix caching enabled. - prefix_cached_llm = LLM(model="facebook/opt-125m", - enable_prefix_caching=True, - gpu_memory_utilization=0.4) + prefix_cached_llm = LLM( + model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4, + ) # Warmup so that the shared prompt's KV cache is computed. prefix_cached_llm.generate(generating_prompts[0], sampling_params) @@ -81,10 +84,12 @@ def main(): print("-" * 50) # Compare the results and display the speedup - generated_same = all([ - regular_generated_texts[i] == cached_generated_texts[i] - for i in range(len(prompts)) - ]) + generated_same = all( + [ + regular_generated_texts[i] == cached_generated_texts[i] + for i in range(len(prompts)) + ] + ) print(f"Generated answers are the same: {generated_same}") diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index e3cc606db7a9..21f7668adc86 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -16,7 +16,8 @@ Run the example: python prithvi_geospatial_mae.py -""" # noqa: E501 +""" # noqa: E501 + import argparse import datetime import os @@ -110,77 +111,67 @@ # Temporarily creating the "config.json" for the model. 
# This is going to disappear once the correct config.json is available on HF -with open(os.path.join(os.path.dirname(__file__), "./model/config.json"), - 'w') as config_file: +with open( + os.path.join(os.path.dirname(__file__), "./model/config.json"), "w" +) as config_file: config_file.write(model_config) datamodule_config = { - 'bands': ['BLUE', 'GREEN', 'RED', 'NIR_NARROW', 'SWIR_1', 'SWIR_2'], - 'batch_size': - 16, - 'constant_scale': - 0.0001, - 'data_root': - '/dccstor/geofm-finetuning/datasets/sen1floods11', - 'drop_last': - True, - 'no_data_replace': - 0.0, - 'no_label_replace': - -1, - 'num_workers': - 8, - 'test_transform': [ - albumentations.Resize(always_apply=False, - height=448, - interpolation=1, - p=1, - width=448), - albumentations.pytorch.ToTensorV2(transpose_mask=False, - always_apply=True, - p=1.0) + "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"], + "batch_size": 16, + "constant_scale": 0.0001, + "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11", + "drop_last": True, + "no_data_replace": 0.0, + "no_label_replace": -1, + "num_workers": 8, + "test_transform": [ + albumentations.Resize( + always_apply=False, height=448, interpolation=1, p=1, width=448 + ), + albumentations.pytorch.ToTensorV2( + transpose_mask=False, always_apply=True, p=1.0 + ), ], } class PrithviMAE: - def __init__(self): print("Initializing PrithviMAE model") - self.model = LLM(model=os.path.join(os.path.dirname(__file__), - "./model"), - skip_tokenizer_init=True, - dtype="float32") + self.model = LLM( + model=os.path.join(os.path.dirname(__file__), "./model"), + skip_tokenizer_init=True, + dtype="float32", + ) def run(self, input_data, location_coords): print("################ Running inference on vLLM ##############") # merge the inputs into one data structure mm_data = { - "pixel_values": - torch.empty(0) if input_data is None else input_data, - "location_coords": - torch.empty(0) if location_coords is None else location_coords + "pixel_values": torch.empty(0) if input_data is None else input_data, + "location_coords": torch.empty(0) + if location_coords is None + else location_coords, } prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} outputs = self.model.encode(prompt, use_tqdm=False) - print( - "################ Inference done (it took seconds) ##############" - ) + print("################ Inference done (it took seconds) ##############") return outputs[0].outputs.data def generate_datamodule(): datamodule = Sen1Floods11NonGeoDataModule( - data_root=datamodule_config['data_root'], + data_root=datamodule_config["data_root"], batch_size=datamodule_config["batch_size"], num_workers=datamodule_config["num_workers"], bands=datamodule_config["bands"], drop_last=datamodule_config["drop_last"], - test_transform=datamodule_config["test_transform" - ""]) + test_transform=datamodule_config["test_transform"], + ) return datamodule @@ -204,8 +195,7 @@ def process_channel_group(orig_img, channels): max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE)) min_value = OFFSET - orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, - 1) + orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1) # No data as zeros orig_img[~valid_mask] = 0 @@ -300,18 +290,21 @@ def load_example( location_coords.append(coords) try: - match = re.search(r'(\d{7,8}T\d{6})', file) + match = re.search(r"(\d{7,8}T\d{6})", file) if match: year = int(match.group(1)[:4]) - julian_day = match.group(1).split('T')[0][4:] + julian_day = 
match.group(1).split("T")[0][4:] if len(julian_day) == 3: julian_day = int(julian_day) else: - julian_day = datetime.datetime.strptime( - julian_day, '%m%d').timetuple().tm_yday + julian_day = ( + datetime.datetime.strptime(julian_day, "%m%d") + .timetuple() + .tm_yday + ) temporal_coords.append([year, julian_day]) except Exception as e: - print(f'Could not extract timestamp for {file} ({e})') + print(f"Could not extract timestamp for {file} ({e})") imgs = np.stack(imgs, axis=0) # num_frames, H, W, C imgs = np.moveaxis(imgs, -1, 0).astype("float32") @@ -320,50 +313,44 @@ def load_example( return imgs, temporal_coords, location_coords, metas -def run_model(input_data, - temporal_coords, - location_coords, - model, - datamodule, - img_size, - lightning_model=None): +def run_model( + input_data, + temporal_coords, + location_coords, + model, + datamodule, + img_size, + lightning_model=None, +): # Reflect pad if not divisible by img_size original_h, original_w = input_data.shape[-2:] pad_h = (img_size - (original_h % img_size)) % img_size pad_w = (img_size - (original_w % img_size)) % img_size - input_data = np.pad(input_data, - ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), - mode="reflect") + input_data = np.pad( + input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect" + ) # Build sliding window batch_size = 1 batch = torch.tensor(input_data, device="cpu") - windows = (batch.unfold(3, img_size, - img_size).unfold(4, img_size, img_size)) + windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size) h1, w1 = windows.shape[3:5] - windows = rearrange(windows, - "b c t h1 w1 h w -> (b h1 w1) c t h w", - h=img_size, - w=img_size) + windows = rearrange( + windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size + ) # Split into batches if number of windows > batch_size - num_batches = windows.shape[0] // batch_size if windows.shape[ - 0] > batch_size else 1 + num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1 windows = torch.tensor_split(windows, num_batches, dim=0) - if torch.cuda.is_available(): - device = torch.device('cuda') - else: - device = torch.device('cpu') + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") if temporal_coords: - temporal_coords = torch.tensor(temporal_coords, - device=device).unsqueeze(0) + temporal_coords = torch.tensor(temporal_coords, device=device).unsqueeze(0) else: temporal_coords = None if location_coords: - location_coords = torch.tensor(location_coords[0], - device=device).unsqueeze(0) + location_coords = torch.tensor(location_coords[0], device=device).unsqueeze(0) else: location_coords = None @@ -371,26 +358,24 @@ def run_model(input_data, pred_imgs = [] for x in windows: # Apply standardization - x = datamodule.test_transform( - image=x.squeeze().numpy().transpose(1, 2, 0)) - x = datamodule.aug(x)['image'] + x = datamodule.test_transform(image=x.squeeze().numpy().transpose(1, 2, 0)) + x = datamodule.aug(x)["image"] with torch.no_grad(): x = x.to(device) pred = model.run(x, location_coords=location_coords) if lightning_model: pred_lightning = lightning_model( - x, - temporal_coords=temporal_coords, - location_coords=location_coords) + x, temporal_coords=temporal_coords, location_coords=location_coords + ) pred_lightning = pred_lightning.output.detach().cpu() if not torch.equal(pred, pred_lightning): print("Inference output is not equal") y_hat = pred.argmax(dim=1) - y_hat = torch.nn.functional.interpolate(y_hat.unsqueeze(1).float(), - 
size=img_size, - mode="nearest") + y_hat = torch.nn.functional.interpolate( + y_hat.unsqueeze(1).float(), size=img_size, mode="nearest" + ) pred_imgs.append(y_hat) @@ -437,8 +422,7 @@ def parse_args(): default=[1, 2, 3, 8, 11, 12], type=int, nargs="+", - help= - "0-based indices of the six Prithvi channels to be selected from the " + help="0-based indices of the six Prithvi channels to be selected from the " "input. By default selects [1,2,3,8,11,12] for S2L1C data.", ) parser.add_argument( @@ -478,17 +462,18 @@ def main( # Running model ------------------------------------------------------------ channels = [ - datamodule_config['bands'].index(b) for b in ["RED", "GREEN", "BLUE"] + datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"] ] # BGR -> RGB - pred = run_model(input_data, temporal_coords, location_coords, model_obj, - datamodule, img_size) + pred = run_model( + input_data, temporal_coords, location_coords, model_obj, datamodule, img_size + ) # Save pred meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0) pred_file = os.path.join( - output_dir, - f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + output_dir, f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff" + ) save_geotiff(_convert_np_uint8(pred), pred_file, meta_data) # Save image + pred @@ -502,13 +487,13 @@ def main( channels=channels, ) - pred[pred == 0.] = np.nan + pred[pred == 0.0] = np.nan img_pred = rgb_orig * 0.7 + pred * 0.3 img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()] img_pred_file = os.path.join( - output_dir, - f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + output_dir, f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff" + ) save_geotiff( image=_convert_np_uint8(img_pred), output_path=img_pred_file, @@ -518,8 +503,9 @@ def main( # Save image rgb if rgb_outputs: rgb_file = os.path.join( - output_dir, "original_rgb_" - f"{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + output_dir, + f"original_rgb_{os.path.splitext(os.path.basename(data_file))[0]}.tiff", + ) save_geotiff( image=_convert_np_uint8(rgb_orig), output_path=rgb_file, @@ -528,7 +514,6 @@ def main( if __name__ == "__main__": - args = parse_args() main(**vars(args)) diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 3cf0c340d670..244a64b891c9 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -44,14 +44,17 @@ def get_dtype(dtype: str): OutputLen_NumReqs_Map: TypeAlias = dict[int, int] -def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \ - -> OutputLen_NumReqs_Map: + + +def compute_request_output_lengths( + batch_size: int, step_requests: list[int] +) -> OutputLen_NumReqs_Map: """ Given the number of requests, batch_size, and the number of requests that each engine-step should process, step_requests, determine the output lengths of the requests such that step_request is honoured. - Example: + Example: if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1] then return, {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning, @@ -100,17 +103,19 @@ def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \ output_length -= 1 # sanity checks. 
- assert sum(ol_nr.values()) == batch_size, \ - ("Number of requests in output-length assignment does not match " - f"batch-size.\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}") + assert sum(ol_nr.values()) == batch_size, ( + "Number of requests in output-length assignment does not match " + f"batch-size.\n batch size {batch_size} - " + f"step requests {step_requests} - assignments {ol_nr}" + ) # Check that the output-length is in [1, num-steps]. Output length must be # at least 1 as all requests must participate in the prefill-step. - assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), \ - ("Output lengths of requests should be in range " - f"[1, num-engine-steps].\n batch size {batch_size} - " - f"step requests {step_requests} - assignments {ol_nr}") + assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), ( + "Output lengths of requests should be in range " + f"[1, num-engine-steps].\n batch size {batch_size} - " + f"step requests {step_requests} - assignments {ol_nr}" + ) return ol_nr @@ -131,7 +136,7 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]: context: ProfileContext object. Returns: - list[int]: Number of requests to process for all engine-steps. + list[int]: Number of requests to process for all engine-steps. output[i], contains the number of requests that the ith step should process. """ @@ -140,10 +145,13 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]: # that their output lengths must be equal to num_engine_steps. return [context.batch_size] * context.num_steps - assert context.complete_num_requests_per_step and \ - context.complete_num_requests_per_step > 0, \ - (f"Expected a positive complete_num_requests_per_step argument." - f"Instead got {context.complete_num_requests_per_step}") + assert ( + context.complete_num_requests_per_step + and context.complete_num_requests_per_step > 0 + ), ( + f"Expected a positive complete_num_requests_per_step argument." + f"Instead got {context.complete_num_requests_per_step}" + ) # We start dropping after the first decode step. step_requests = [ @@ -165,8 +173,9 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]: return step_requests -def run_profile(context: ProfileContext, csv_output: Optional[str], - json_output: Optional[str]): +def run_profile( + context: ProfileContext, csv_output: Optional[str], json_output: Optional[str] +): print("Run profile with:") for key, value in asdict(context).items(): print(f" {key} = {value}") @@ -174,7 +183,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], requests_per_step: list[int] = determine_requests_per_step(context) ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths( - context.batch_size, requests_per_step) + context.batch_size, requests_per_step + ) num_steps_to_profile: int = len(requests_per_step) max_output_len: int = max(ol_nr.keys()) @@ -186,7 +196,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], top_p=0.95, # max_tokens is set on a per-request basis. 
max_tokens=None, - ignore_eos=True) + ignore_eos=True, + ) # Create LLM llm = LLM(**asdict(context.engine_args)) @@ -199,31 +210,37 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], max_num_seqs = scheduler_config.max_num_seqs if batch_size * prompt_len > max_num_batched_tokens: - print(f"ERROR: chosen batch_size * prompt_len " - f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " - f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " - f"and therefore cannot be run in a single profile step, please " - f"choose a smaller batch size or prompt length, or increase " - f"--max-num-batched-tokens") + print( + f"ERROR: chosen batch_size * prompt_len " + f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " + f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " + f"and therefore cannot be run in a single profile step, please " + f"choose a smaller batch size or prompt length, or increase " + f"--max-num-batched-tokens" + ) sys.exit(-1) if batch_size > max_num_seqs: print( f"ERROR: chosen batch_size ({batch_size}) is larger than " f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a " - f"single profile step, please choose a smaller batch size") + f"single profile step, please choose a smaller batch size" + ) sys.exit(-1) - print("llm.llm_engine.model_config.max_model_len: ", - llm.llm_engine.model_config.max_model_len) + print( + "llm.llm_engine.model_config.max_model_len: ", + llm.llm_engine.model_config.max_model_len, + ) if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len: - print(f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + " - f"{max_output_len} = {prompt_len + max_output_len}) is larger " - f"than the model's max_model_len ({max_model_len}), please " - f"choose a smaller prompt_len or max_output_len, or increase " - f"--max-model-len") + print( + f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + " + f"{max_output_len} = {prompt_len + max_output_len}) is larger " + f"than the model's max_model_len ({max_model_len}), please " + f"choose a smaller prompt_len or max_output_len, or increase " + f"--max-model-len" + ) sys.exit(-1) def add_requests(): - def get_output_len_generator() -> Generator[int, Any, Any]: for output_len, num_reqs in ol_nr.items(): for _ in range(num_reqs): @@ -234,13 +251,15 @@ def get_output_len_generator() -> Generator[int, Any, Any]: sampling_params.max_tokens = next(output_len_generator) assert isinstance(sampling_params.max_tokens, int) - prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size, - size=(prompt_len, )).tolist() + prompt_token_ids = torch.randint( + llm.get_tokenizer().vocab_size, size=(prompt_len,) + ).tolist() llm.llm_engine.add_request( request_id=f"seq{i}", - prompt={'prompt_token_ids': prompt_token_ids}, - params=sampling_params) + prompt={"prompt_token_ids": prompt_token_ids}, + params=sampling_params, + ) def abort_requests(): for i in range(batch_size): @@ -261,10 +280,8 @@ def abort_requests(): decode_profs = [] for _ in tqdm.tqdm(range(num_steps_to_profile - 1)): - num_running_seqs = llm.llm_engine.scheduler[ - 0].get_num_unfinished_seq_groups() - with layerwise_profile( - num_running_seqs=num_running_seqs) as decode_prof: + num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups() + with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof: llm.llm_engine.step() decode_profs.append(decode_prof) @@ -274,8 +291,7 @@ def abort_requests(): LINE_WIDTH = 80 print("=" * 
LINE_WIDTH) - print(f"= Prefill Model Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})") print("=" * LINE_WIDTH) print() prefill_results.print_model_table() @@ -283,16 +299,17 @@ def abort_requests(): if has_decode: print() print("=" * LINE_WIDTH) - print(f"= First Decode Step Model Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print( + f"= First Decode Step Model Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})" + ) print("=" * LINE_WIDTH) print() decode_results_list[0].print_model_table() print() print("=" * LINE_WIDTH) - print(f"= Prefill Summary Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})") print("=" * LINE_WIDTH) print() prefill_results.print_summary_table() @@ -300,25 +317,32 @@ def abort_requests(): if has_decode: print() print("=" * LINE_WIDTH) - print(f"= First Decode Step Summary Table " - f"(prompt_len={prompt_len}, batch_size={batch_size})") + print( + f"= First Decode Step Summary Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})" + ) print("=" * LINE_WIDTH) print() decode_results_list[0].print_summary_table() if csv_output: - csv_filename_base = csv_output[:-4] \ - if csv_output.endswith('.csv') else csv_output + csv_filename_base = ( + csv_output[:-4] if csv_output.endswith(".csv") else csv_output + ) prefill_results.export_model_stats_table_csv( - csv_filename_base + "_prefill_model_table.csv") + csv_filename_base + "_prefill_model_table.csv" + ) prefill_results.export_summary_stats_table_csv( - csv_filename_base + "_prefill_summary_table.csv") + csv_filename_base + "_prefill_summary_table.csv" + ) if has_decode: - decode_results_list[0].export_model_stats_table_csv(\ - csv_filename_base + "_decode_model_table.csv") + decode_results_list[0].export_model_stats_table_csv( + csv_filename_base + "_decode_model_table.csv" + ) decode_results_list[0].export_summary_stats_table_csv( - csv_filename_base + "_decode_summary_table.csv") + csv_filename_base + "_decode_summary_table.csv" + ) if json_output: cuda_devices = [ @@ -332,7 +356,7 @@ def abort_requests(): "torch_version": f"{torch.__version__}", "torch_cuda_version": f"{torch.version.cuda}", "cuda_devices": f"{cuda_devices}", - **asdict(context) + **asdict(context), }, "prefill": prefill_results.convert_stats_to_dict(), } @@ -342,8 +366,9 @@ def abort_requests(): json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() # Add .json to json_output filename if it doesn't exist already. - json_output_file = json_output if json_output.endswith( - '.json') else json_output + '.json' + json_output_file = ( + json_output if json_output.endswith(".json") else json_output + ".json" + ) with open(json_output_file, "w+") as f: json.dump(json_dict, f, indent=2) pass @@ -351,16 +376,21 @@ def abort_requests(): if context.save_chrome_traces_folder is not None: os.makedirs(context.save_chrome_traces_folder, exist_ok=True) prefill_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + "/prefill.json") + context.save_chrome_traces_folder + "/prefill.json" + ) for idx, decode_prof in enumerate(decode_profs): decode_prof.profiler.export_chrome_trace( - context.save_chrome_traces_folder + f"/decode_{idx + 1}.json") - print("Traces saved as prefill.json and decode_1.json, etc." 
- f" in folder {context.save_chrome_traces_folder}") + context.save_chrome_traces_folder + f"/decode_{idx + 1}.json" + ) + print( + "Traces saved as prefill.json and decode_1.json, etc." + f" in folder {context.save_chrome_traces_folder}" + ) def parse_args(): - parser = FlexibleArgumentParser(description=""" + parser = FlexibleArgumentParser( + description=""" Profile a model example: @@ -384,7 +414,8 @@ def parse_args(): --output-directory profile_breakdown --plot-metric pct_cuda_time ``` """, - formatter_class=RawTextHelpFormatter) + formatter_class=RawTextHelpFormatter, + ) parser.add_argument( "--csv", type=str, @@ -393,59 +424,68 @@ def parse_args(): "filename, will create <filename>_prefill_model_table.csv, " "<filename>_prefill_summary_table.csv, " "<filename>_decode_model_table.csv, and " - "<filename>_decode_summary_table.csv") + "<filename>_decode_summary_table.csv", + ) parser.add_argument( "--json", type=str, default=None, - help="Export the results as a json file. This should be the filename") - parser.add_argument("--save-chrome-traces-folder", - type=str, - help="Save chrome traces for the prefill and decode " - "will save traces as prefill.json and decode_1.json, " - "etc. inside this folder") + help="Export the results as a json file. This should be the filename", + ) + parser.add_argument( + "--save-chrome-traces-folder", + type=str, + help="Save chrome traces for the prefill and decode " + "will save traces as prefill.json and decode_1.json, " + "etc. inside this folder", + ) parser.add_argument( "--prompt-len", type=int, default=PROMPT_LEN_DEFAULT, help=f"Length of the random prompt to use when profiling, all batched " - f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}") - parser.add_argument("--batch-size", - type=int, - default=BATCH_SIZE_DEFAULT, - help=f"Number of requests to run as a single batch, " - f"default={BATCH_SIZE_DEFAULT}") + f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}", + ) + parser.add_argument( + "--batch-size", + type=int, + default=BATCH_SIZE_DEFAULT, + help=f"Number of requests to run as a single batch, " + f"default={BATCH_SIZE_DEFAULT}", + ) subparsers = parser.add_subparsers(dest="cmd") run_num_steps_parser = subparsers.add_parser( - "run_num_steps", - help="This variation profiles n engine.step() invocations.") + "run_num_steps", help="This variation profiles n engine.step() invocations." + ) run_num_steps_parser.add_argument( - '-n', - '--num-steps', + "-n", + "--num-steps", type=int, help="Number of engine steps to profile.\n" "Setting it to 1, profiles only the prefill step.\n" "Setting it to 2, profiles the prefill and first decode step\n" "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n" - "and so on ...") + "and so on ...", + ) run_to_completion_parser = subparsers.add_parser( "run_to_completion", help="This variation profiles all the engine.step() invocations" - "until the engine exhausts all submitted requests.") + "until the engine exhausts all submitted requests.", + ) run_to_completion_parser.add_argument( - '-n', - '--complete-num-requests-per-step', + "-n", + "--complete-num-requests-per-step", type=int, - help= - "Complete complete_num_requests_per_step requests every decode step." + help="Complete complete_num_requests_per_step requests every decode step." 
"For e.g., with batch_size 128 and complete_num_requests_per_step 32," "the profiler is run for 6 engine steps, with the steps processing, " "128, 128, 96, 64, 32, 1 requests respectively.\n" "Note that we tack-on a one-request step at the end as it is often " - "useful.") + "useful.", + ) EngineArgs.add_cli_args(parser) @@ -459,7 +499,8 @@ def main(args): k: v for k, v in vars(args).items() if k in inspect.signature(ProfileContext).parameters - }) + }, + ) run_profile(context, csv_output=args.csv, json_output=args.json) diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index 61da4705e18e..82737d538df4 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -31,18 +31,16 @@ def main(args: argparse.Namespace): max_tokens=args.output_len, ) print(sampling_params) - dummy_prompt_token_ids = np.random.randint(10000, - size=(args.batch_size, - args.input_len)) - dummy_prompts: list[PromptType] = [{ - "prompt_token_ids": batch - } for batch in dummy_prompt_token_ids.tolist()] + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] def run_to_completion(): start_time = time.perf_counter() - llm.generate(dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) end_time = time.perf_counter() latency = end_time - start_time return latency @@ -58,10 +56,9 @@ def run_to_completion(): profile_dir = args.profile_result_dir print(f"Profiling (results will be saved to '{profile_dir}')...") # Enable tracing on server - xp.trace_detached("localhost:9012", - profile_dir, - delay_ms=DELAY_MS, - duration_ms=DURATION_MS) + xp.trace_detached( + "localhost:9012", profile_dir, delay_ms=DELAY_MS, duration_ms=DURATION_MS + ) if DELAY_MS == 0: time.sleep(1.0) profile_latencies = [] @@ -72,30 +69,36 @@ def run_to_completion(): return -if __name__ == '__main__': +if __name__ == "__main__": parser = FlexibleArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--num-iters-warmup', - type=int, - default=5, - help='Number of iterations to run for warmup.') - parser.add_argument('--num-iters', - type=int, - default=1, - help='Number of iterations to run for profiling.') + description="Benchmark the latency of processing a single batch of " + "requests till completion." + ) + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--num-iters-warmup", + type=int, + default=5, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", + type=int, + default=1, + help="Number of iterations to run for profiling.", + ) parser.add_argument( - '--profile-result-dir', + "--profile-result-dir", type=str, default="profiles", - help= - ('path to save the pytorch profiler output. 
Can be visualized ' - 'with ui.perfetto.dev or Tensorboard ' - '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).' - )) + help=( + "path to save the pytorch profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard " + "(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)." + ), + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py index 99c5a682fb27..9f6a602233f8 100644 --- a/examples/offline_inference/prompt_embed_inference.py +++ b/examples/offline_inference/prompt_embed_inference.py @@ -18,8 +18,7 @@ """ import torch -from transformers import (AutoModelForCausalLM, AutoTokenizer, - PreTrainedTokenizer) +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer from vllm import LLM @@ -32,27 +31,29 @@ def init_tokenizer_and_llm(model_name: str): return tokenizer, embedding_layer, llm -def get_prompt_embeds(chat: list[dict[str, - str]], tokenizer: PreTrainedTokenizer, - embedding_layer: torch.nn.Module): - token_ids = tokenizer.apply_chat_template(chat, - add_generation_prompt=True, - return_tensors='pt') +def get_prompt_embeds( + chat: list[dict[str, str]], + tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module, +): + token_ids = tokenizer.apply_chat_template( + chat, add_generation_prompt=True, return_tensors="pt" + ) prompt_embeds = embedding_layer(token_ids).squeeze(0) return prompt_embeds -def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, - embedding_layer: torch.nn.Module): - chat = [{ - "role": "user", - "content": "Please tell me about the capital of France." - }] +def single_prompt_inference( + llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module +): + chat = [{"role": "user", "content": "Please tell me about the capital of France."}] prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer) - outputs = llm.generate({ - "prompt_embeds": prompt_embeds, - }) + outputs = llm.generate( + { + "prompt_embeds": prompt_embeds, + } + ) print("\n[Single Inference Output]") print("-" * 30) @@ -61,34 +62,26 @@ def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, print("-" * 30) -def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, - embedding_layer: torch.nn.Module): - chats = [[{ - "role": "user", - "content": "Please tell me about the capital of France." - }], - [{ - "role": "user", - "content": "When is the day longest during the year?" - }], - [{ - "role": "user", - "content": "Where is bigger, the moon or the sun?" 
- }]] +def batch_prompt_inference( + llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module +): + chats = [ + [{"role": "user", "content": "Please tell me about the capital of France."}], + [{"role": "user", "content": "When is the day longest during the year?"}], + [{"role": "user", "content": "Where is bigger, the moon or the sun?"}], + ] prompt_embeds_list = [ get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats ] - outputs = llm.generate([{ - "prompt_embeds": embeds - } for embeds in prompt_embeds_list]) + outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list]) print("\n[Batch Inference Outputs]") print("-" * 30) for i, o in enumerate(outputs): - print(f"Q{i+1}: {chats[i][0]['content']}") - print(f"A{i+1}: {o.outputs[0].text}\n") + print(f"Q{i + 1}: {chats[i][0]['content']}") + print(f"A{i + 1}: {o.outputs[0].text}\n") print("-" * 30) diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index deb6f580a447..6482490d1a93 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference +This example shows how to use vLLM for running offline inference with the correct prompt format on Qwen2.5-Omni (thinker only). """ @@ -27,51 +27,55 @@ class QueryResult(NamedTuple): default_system = ( "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " "Group, capable of perceiving auditory and visual inputs, as well as " - "generating text and speech.") + "generating text and speech." +) def get_mixed_modalities_query() -> QueryResult: - question = ("What is recited in the audio? " - "What is the content of this image? Why is this video funny?") - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" - "<|vision_bos|><|IMAGE|><|vision_eos|>" - "<|vision_bos|><|VIDEO|><|vision_eos|>" - f"{question}<|im_end|>\n" - f"<|im_start|>assistant\n") + question = ( + "What is recited in the audio? " + "What is the content of this image? Why is this video funny?" 
+ ) + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|vision_bos|><|IMAGE|><|vision_eos|>" + "<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) return QueryResult( inputs={ "prompt": prompt, "multi_modal_data": { - "audio": - AudioAsset("mary_had_lamb").audio_and_sample_rate, - "image": - convert_image_mode( - ImageAsset("cherry_blossom").pil_image, "RGB"), - "video": - VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + "image": convert_image_mode( + ImageAsset("cherry_blossom").pil_image, "RGB" + ), + "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, }, }, - limit_mm_per_prompt={ - "audio": 1, - "image": 1, - "video": 1 - }, + limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1}, ) def get_use_audio_in_video_query() -> QueryResult: - question = ("Describe the content of the video, " - "then convert what the baby say into text.") - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" - f"{question}<|im_end|>\n" - f"<|im_start|>assistant\n") + question = ( + "Describe the content of the video, then convert what the baby say into text." + ) + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) asset = VideoAsset(name="baby_reading", num_frames=16) audio = asset.get_audio(sampling_rate=16000) - assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. " - "Please launch this example with " - "`VLLM_USE_V1=0`.") + assert not envs.VLLM_USE_V1, ( + "V1 does not support use_audio_in_video. " + "Please launch this example with " + "`VLLM_USE_V1=0`." + ) return QueryResult( inputs={ "prompt": prompt, @@ -83,20 +87,19 @@ def get_use_audio_in_video_query() -> QueryResult: "use_audio_in_video": True, }, }, - limit_mm_per_prompt={ - "audio": 1, - "video": 1 - }, + limit_mm_per_prompt={"audio": 1, "video": 1}, ) def get_multi_audios_query() -> QueryResult: question = "Are these two audio clips the same?" - prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n" - "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" - "<|audio_bos|><|AUDIO|><|audio_eos|>" - f"{question}<|im_end|>\n" - f"<|im_start|>assistant\n") + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + "<|audio_bos|><|AUDIO|><|audio_eos|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) return QueryResult( inputs={ "prompt": prompt, @@ -124,18 +127,19 @@ def main(args): model_name = "Qwen/Qwen2.5-Omni-7B" query_result = query_map[args.query_type]() - llm = LLM(model=model_name, - max_model_len=5632, - max_num_seqs=5, - limit_mm_per_prompt=query_result.limit_mm_per_prompt, - seed=args.seed) + llm = LLM( + model=model_name, + max_model_len=5632, + max_num_seqs=5, + limit_mm_per_prompt=query_result.limit_mm_per_prompt, + seed=args.seed, + ) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
sampling_params = SamplingParams(temperature=0.2, max_tokens=64) - outputs = llm.generate(query_result.inputs, - sampling_params=sampling_params) + outputs = llm.generate(query_result.inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text @@ -144,18 +148,23 @@ def main(args): def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'audio language models') - parser.add_argument('--query-type', - '-q', - type=str, - default="mixed_modalities", - choices=query_map.keys(), - help='Query type.') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "audio language models" + ) + parser.add_argument( + "--query-type", + "-q", + type=str, + default="mixed_modalities", + choices=query_map.keys(), + help="Query type.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py index 64a1f4c54b67..856a35b0e59b 100644 --- a/examples/offline_inference/qwen_1m.py +++ b/examples/offline_inference/qwen_1m.py @@ -17,10 +17,10 @@ def load_prompt() -> str: # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt with urlopen( - "https://qianwen-res.oss-cn-beijing.aliyuncs.com" - "/Qwen2.5-1M/test-data/600k.txt", - timeout=5) as response: - prompt = response.read().decode('utf-8') + "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt", + timeout=5, + ) as response: + prompt = response.read().decode("utf-8") return prompt @@ -41,18 +41,22 @@ def process_requests(llm: LLM, prompts: list[str]) -> None: for output in outputs: prompt_token_ids = output.prompt_token_ids generated_text = output.outputs[0].text - print(f"Prompt length: {len(prompt_token_ids)}, " - f"Generated text: {generated_text!r}") + print( + f"Prompt length: {len(prompt_token_ids)}, " + f"Generated text: {generated_text!r}" + ) # Create an LLM. def initialize_engine() -> LLM: - llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-1M", - max_model_len=1048576, - tensor_parallel_size=4, - enforce_eager=True, - enable_chunked_prefill=True, - max_num_batched_tokens=131072) + llm = LLM( + model="Qwen/Qwen2.5-7B-Instruct-1M", + max_model_len=1048576, + tensor_parallel_size=4, + enforce_eager=True, + enable_chunked_prefill=True, + max_num_batched_tokens=131072, + ) return llm @@ -62,5 +66,5 @@ def main(): process_requests(llm, [prompt]) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index e0ed0ac49754..a8f6977e29a4 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -12,6 +12,7 @@ and multiple inference instances. For the full implementation, please refer to the OpenRLHF framework. """ + import os import ray @@ -26,7 +27,6 @@ class MyLLM(LLM): - def __init__(self, *args, **kwargs): # a hack to make the script work. 
# stop ray from manipulating CUDA_VISIBLE_DEVICES @@ -89,8 +89,7 @@ def __init__(self, *args, **kwargs): for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print("-" * 50) # set up the communication between the training process @@ -98,11 +97,13 @@ def __init__(self, *args, **kwargs): master_address = get_ip() master_port = get_open_port() -handle = llm.collective_rpc.remote("init_weight_update_group", - args=(master_address, master_port, 1, 3)) +handle = llm.collective_rpc.remote( + "init_weight_update_group", args=(master_address, master_port, 1, 3) +) -model_update_group = stateless_init_process_group(master_address, master_port, - 0, 3, torch.device("cuda:0")) +model_update_group = stateless_init_process_group( + master_address, master_port, 0, 3, torch.device("cuda:0") +) ray.get(handle) # simulate training, modify the weights of the model. @@ -111,8 +112,7 @@ def __init__(self, *args, **kwargs): # sync weight from the training process to the inference engine. for name, p in train_model.named_parameters(): - handle = llm.collective_rpc.remote("update_weight", - args=(name, p.dtype, p.shape)) + handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape)) model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) ray.get(handle) @@ -126,6 +126,5 @@ def __init__(self, *args, **kwargs): for output in outputs_updated: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print("-" * 50) diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py index 3ceac0fa2e20..76eafdca1f6c 100644 --- a/examples/offline_inference/rlhf_colocate.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -9,6 +9,7 @@ - Use cuda-ipc to pass tensors, since NCCL does not work when we have multiple processes on the same GPU. """ + import os import ray @@ -20,7 +21,6 @@ class MyLLM(LLM): - def __init__(self, *args, bundle_indices: list, **kwargs): # a hack to make the script work. # stop ray from manipulating CUDA_VISIBLE_DEVICES @@ -29,17 +29,16 @@ def __init__(self, *args, bundle_indices: list, **kwargs): # every worker will use 0.4 GPU, so that we can schedule # 2 instances on the same GPUs. os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" - os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join( - map(str, bundle_indices)) + os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices)) print(f"creating LLM with bundle_indices={bundle_indices}") super().__init__(*args, **kwargs) class RayTrainingActor: - def __init__(self): # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs from transformers import AutoModelForCausalLM + self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") self.model.to("cuda:0") for name, p in self.model.named_parameters(): @@ -48,6 +47,7 @@ def __init__(self): # the argument for get_device_uuid is the index # of the GPU in the visible devices. 
from vllm.platforms import current_platform + self.device_uuid = current_platform.get_device_uuid(0) def report_device_id(self) -> str: @@ -55,6 +55,7 @@ def report_device_id(self) -> str: def get_weight_ipc_handles(self): from torch.multiprocessing.reductions import reduce_tensor + data = {} for name, p in self.model.named_parameters(): # the training actor might only have a subset of the weights @@ -101,7 +102,7 @@ def get_weight_ipc_handles(self): print(f"training actor {bundle_index} is on {device_id}") training_actor_device_ids.append(device_id) -for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]): +for i, bundle_indices in enumerate([[0, 1], [2, 3]]): # IMPORTANT: when creating vLLM instances, we need to # make sure there are no GPU activities on the target GPUs, # otherwise, they will interfere with the vLLM memory profiling, @@ -128,7 +129,8 @@ def get_weight_ipc_handles(self): for i, llm in enumerate(inference_engines): inference_engine_device_ids.append( - ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))) + ray.get(llm.collective_rpc.remote("report_device_id", args=tuple())) + ) print(f"inference engine {i} is on {inference_engine_device_ids[-1]}") # check the placement @@ -147,9 +149,10 @@ def get_weight_ipc_handles(self): print("update the weights of the inference engines") for llm in inference_engines: ray.get( - llm.collective_rpc.remote("update_weights_from_ipc_handles", - args=(ipc_handles, ))) + llm.collective_rpc.remote( + "update_weights_from_ipc_handles", args=(ipc_handles,) + ) + ) print("check if the weights are updated") for llm in inference_engines: - assert ray.get( - llm.collective_rpc.remote("check_weights_changed", args=tuple())) + assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple())) diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index 11b73b7c4a0a..3461af707eba 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -2,21 +2,20 @@ import torch -def stateless_init_process_group(master_address, master_port, rank, world_size, - device): +def stateless_init_process_group(master_address, master_port, rank, world_size, device): """ vLLM provides `StatelessProcessGroup` to create a process group without considering the global process group in torch.distributed. It is recommended to create `StatelessProcessGroup`, and then initialize - the data-plane communication (NCCL) between external (train processes) + the data-plane communication (NCCL) between external (train processes) and vLLM workers. """ from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.utils import StatelessProcessGroup - pg = StatelessProcessGroup.create(host=master_address, - port=master_port, - rank=rank, - world_size=world_size) + + pg = StatelessProcessGroup.create( + host=master_address, port=master_port, rank=rank, world_size=world_size + ) pynccl = PyNcclCommunicator(pg, device=device) return pynccl @@ -31,9 +30,11 @@ class WorkerExtension: should pass the full qualified name as `worker_extension_cls` argument. 
""" - def init_weight_update_group(self, master_address, master_port, - rank_offset, world_size): + def init_weight_update_group( + self, master_address, master_port, rank_offset, world_size + ): from vllm.distributed.parallel_state import get_world_group + rank = get_world_group().rank + rank_offset self.model_update_group = stateless_init_process_group( master_address, @@ -45,9 +46,9 @@ def init_weight_update_group(self, master_address, master_port, def update_weight(self, name, dtype, shape): weight = torch.empty(shape, dtype=dtype, device="cuda") - self.model_update_group.broadcast(weight, - src=0, - stream=torch.cuda.current_stream()) + self.model_update_group.broadcast( + weight, src=0, stream=torch.cuda.current_stream() + ) self.model_runner.model.load_weights(weights=[(name, weight)]) @@ -59,8 +60,7 @@ def check_weights_changed(self): """ weights_updated = True for name, p in self.model_runner.model.named_parameters(): - weights_updated = weights_updated and torch.allclose( - p, torch.zeros_like(p)) + weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p)) return weights_updated @@ -76,6 +76,7 @@ class ColocateWorkerExtension: def report_device_id(self) -> str: from vllm.platforms import current_platform + self.device_uuid = current_platform.get_device_uuid(self.device.index) return self.device_uuid @@ -100,6 +101,5 @@ def check_weights_changed(self): """ weights_updated = True for name, p in self.model_runner.model.named_parameters(): - weights_updated = weights_updated and torch.allclose( - p, torch.zeros_like(p)) + weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p)) return weights_updated diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 338380cc9684..860fe2b5fe06 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -21,6 +21,7 @@ tensor_parallel_size=8, ) """ + import dataclasses import os import shutil @@ -33,18 +34,18 @@ def parse_args(): parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) - parser.add_argument("--output", - "-o", - required=True, - type=str, - help="path to output checkpoint") - parser.add_argument("--file-pattern", - type=str, - help="string pattern of saved filenames") - parser.add_argument("--max-file-size", - type=str, - default=5 * 1024**3, - help="max size (in bytes) of each safetensors file") + parser.add_argument( + "--output", "-o", required=True, type=str, help="path to output checkpoint" + ) + parser.add_argument( + "--file-pattern", type=str, help="string pattern of saved filenames" + ) + parser.add_argument( + "--max-file-size", + type=str, + default=5 * 1024**3, + help="max size (in bytes) of each safetensors file", + ) return parser.parse_args() @@ -68,23 +69,23 @@ def main(args): # For V1 engine, we need to use engine_core.save_sharded_state print("Using V1 engine save path") llm.llm_engine.engine_core.save_sharded_state( - path=args.output, - pattern=args.file_pattern, - max_size=args.max_file_size) + path=args.output, pattern=args.file_pattern, max_size=args.max_file_size + ) else: # For V0 engine print("Using V0 engine save path") model_executor = llm.llm_engine.model_executor - model_executor.save_sharded_state(path=args.output, - pattern=args.file_pattern, - max_size=args.max_file_size) + model_executor.save_sharded_state( + path=args.output, pattern=args.file_pattern, max_size=args.max_file_size + ) # Copy metadata files to output directory for 
file in os.listdir(model_path): if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"): if os.path.isdir(os.path.join(model_path, file)): - shutil.copytree(os.path.join(model_path, file), - os.path.join(args.output, file)) + shutil.copytree( + os.path.join(model_path, file), os.path.join(args.output, file) + ) else: shutil.copy(os.path.join(model_path, file), args.output) diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 363b500e0adf..9ed7299606b7 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """ -This file demonstrates the example usage of guided decoding -to generate structured outputs using vLLM. It shows how to apply -different guided decoding techniques such as Choice, Regex, JSON schema, -and Grammar to produce structured and formatted results +This file demonstrates the example usage of guided decoding +to generate structured outputs using vLLM. It shows how to apply +different guided decoding techniques such as Choice, Regex, JSON schema, +and Grammar to produce structured and formatted results based on specific prompts. """ @@ -15,20 +15,20 @@ from vllm.sampling_params import GuidedDecodingParams # Guided decoding by Choice (list of possible options) -guided_decoding_params_choice = GuidedDecodingParams( - choice=["Positive", "Negative"]) -sampling_params_choice = SamplingParams( - guided_decoding=guided_decoding_params_choice) +guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"]) +sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice) prompt_choice = "Classify this sentiment: vLLM is wonderful!" # Guided decoding by Regex guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") sampling_params_regex = SamplingParams( - guided_decoding=guided_decoding_params_regex, stop=["\n"]) + guided_decoding=guided_decoding_params_regex, stop=["\n"] +) prompt_regex = ( "Generate an email address for Alan Turing, who works in Enigma." "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n") + "alan.turing@enigma.com\n" +) # Guided decoding by JSON using Pydantic schema @@ -47,10 +47,11 @@ class CarDescription(BaseModel): json_schema = CarDescription.model_json_schema() guided_decoding_params_json = GuidedDecodingParams(json=json_schema) -sampling_params_json = SamplingParams( - guided_decoding=guided_decoding_params_json) -prompt_json = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") +sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json) +prompt_json = ( + "Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's" +) # Guided decoding by Grammar simplified_sql_grammar = """ @@ -61,12 +62,11 @@ class CarDescription(BaseModel): condition ::= column "= " number number ::= "1 " | "2 " """ -guided_decoding_params_grammar = GuidedDecodingParams( - grammar=simplified_sql_grammar) -sampling_params_grammar = SamplingParams( - guided_decoding=guided_decoding_params_grammar) -prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") +guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar) +sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar) +prompt_grammar = ( + "Generate an SQL query to show the 'username' and 'email'from the 'users' table." +) def format_output(title: str, output: str): @@ -90,8 +90,7 @@ def main(): json_output = generate_output(prompt_json, sampling_params_json, llm) format_output("Guided decoding by JSON", json_output) - grammar_output = generate_output(prompt_grammar, sampling_params_grammar, - llm) + grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm) format_output("Guided decoding by Grammar", grammar_output) diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index bb61a0a29e32..2fa49c0835e3 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -45,8 +45,7 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}\n") + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n") print("-" * 50) """ Further tips: diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 71cd88f2788a..e4a75b3f9380 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -20,10 +20,12 @@ def main(): # Set `enforce_eager=True` to avoid ahead-of-time compilation. # In real workloads, `enforace_eager` should be `False`. - llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", - max_num_batched_tokens=64, - max_num_seqs=4, - max_model_len=128) + llm = LLM( + model="Qwen/Qwen2-1.5B-Instruct", + max_num_batched_tokens=64, + max_num_seqs=4, + max_model_len=128, + ) outputs = llm.generate(prompts, sampling_params) print("-" * 50) for output, answer in zip(outputs, answers): diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index c8b1e9aba5dc..f0504501639d 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -6,6 +6,7 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" + import os import random from contextlib import contextmanager @@ -49,9 +50,13 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" - "<|im_end|>\n<|im_start|>assistant\n") - for question in questions] + prompts = [ + ( + f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" + "<|im_end|>\n<|im_start|>assistant\n" + ) + for question in questions + ] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] @@ -135,8 +140,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: ) prompts = [ - f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" - for question in questions + f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" for question in questions ] return ModelRequestData( @@ -198,9 +202,14 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - prompts = [("<bos><start_of_turn>user\n" - f"<start_of_image>{question}<end_of_turn>\n" - "<start_of_turn>model\n") for question in questions] + prompts = [ + ( + "<bos><start_of_turn>user\n" + f"<start_of_image>{question}<end_of_turn>\n" + "<start_of_turn>model\n" + ) + for question in questions + ] return ModelRequestData( engine_args=engine_args, @@ -225,7 +234,8 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: prompts = [ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ - {question}<|assistant|>" for question in questions + {question}<|assistant|>" + for question in questions ] stop_token_ids = [151329, 151336, 151338] @@ -250,15 +260,13 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m @@ -284,15 +292,14 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: # if you are running out of memory, you can reduce the "longest_edge". 
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations mm_processor_kwargs={ - "size": { - "longest_edge": 3 * 364 - }, + "size": {"longest_edge": 3 * 364}, }, limit_mm_per_prompt={modality: 1}, ) - prompts = [( - f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:" - ) for question in questions] + prompts = [ + (f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:") + for question in questions + ] return ModelRequestData( engine_args=engine_args, @@ -311,9 +318,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: max_num_seqs=2, enforce_eager=True, mm_processor_kwargs={ - "max_image_size": { - "longest_edge": 384 - }, + "max_image_size": {"longest_edge": 384}, }, limit_mm_per_prompt={modality: 1}, ) @@ -330,7 +335,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "OpenGVLab/InternVL3-2B" engine_args = EngineArgs( @@ -345,15 +349,14 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: elif modality == "video": placeholder = "<video>" - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"{placeholder}\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for InternVL # models variants may have different stop tokens @@ -361,9 +364,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - stop_token_ids = [ - token_id for token_id in stop_token_ids if token_id is not None - ] + stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] return ModelRequestData( engine_args=engine_args, @@ -379,7 +380,8 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: prompts = [ "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>" f"<|media_pad|><|media_end|>{question}<|im_end|>" - "<|im_assistant|>assistant<|im_middle|>" for question in questions + "<|im_assistant|>assistant<|im_middle|>" + for question in questions ] engine_args = EngineArgs( @@ -399,9 +401,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - prompts = [ - f"USER: <image>\n{question}\nASSISTANT:" for question in questions - ] + prompts = [f"USER: <image>\n{question}\nASSISTANT:" for question in questions] engine_args = EngineArgs( model="llava-hf/llava-1.5-7b-hf", @@ -434,13 +434,10 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(questions: list[str], - modality: str) -> ModelRequestData: +def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestData: assert modality 
== "video" - prompts = [ - f"USER: <video>\n{question} ASSISTANT:" for question in questions - ] + prompts = [f"USER: <video>\n{question} ASSISTANT:" for question in questions] engine_args = EngineArgs( model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192, @@ -455,19 +452,19 @@ def run_llava_next_video(questions: list[str], # LLaVA-OneVision -def run_llava_onevision(questions: list[str], - modality: str) -> ModelRequestData: - +def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData: if modality == "video": prompts = [ f"<|im_start|>user <video>\n{question}<|im_end|> \ - <|im_start|>assistant\n" for question in questions + <|im_start|>assistant\n" + for question in questions ] elif modality == "image": prompts = [ f"<|im_start|>user <image>\n{question}<|im_end|> \ - <|im_start|>assistant\n" for question in questions + <|im_start|>assistant\n" + for question in questions ] engine_args = EngineArgs( @@ -486,11 +483,8 @@ def run_llava_onevision(questions: list[str], def run_mantis(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 - prompts = [ - llama3_template.format(f"{question}\n<image>") - for question in questions - ] + llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" # noqa: E501 + prompts = [llama3_template.format(f"{question}\n<image>") for question in questions] engine_args = EngineArgs( model="TIGER-Lab/Mantis-8B-siglip-llama3", @@ -530,8 +524,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): # 2.6: image, video # o2.6: image, video, audio # model_name = "openbmb/MiniCPM-o-2_6" - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) engine_args = EngineArgs( model=model_name, max_model_len=4096, @@ -547,7 +540,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] # 2.6 / o2.6 - stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_tokens = ["<|im_end|>", "<|endoftext|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] modality_placeholder = { @@ -557,12 +550,16 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): prompts = [ tokenizer.apply_chat_template( - [{ - 'role': 'user', - 'content': f"{modality_placeholder[modality]}\n{question}" - }], + [ + { + "role": "user", + "content": f"{modality_placeholder[modality]}\n{question}", + } + ], tokenize=False, - add_generation_prompt=True) for question in questions + add_generation_prompt=True, + ) + for question in questions ] return ModelRequestData( @@ -622,19 +619,18 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ) tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [[{ - "role": - "user", - "content": [{ - "type": "image" - }, { - "type": "text", - "text": question - }] - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - add_generation_prompt=True, - tokenize=False) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": question}], + } + ] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) return 
ModelRequestData( engine_args=engine_args, @@ -657,19 +653,18 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData: ) tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [[{ - "role": - "user", - "content": [{ - "type": "image" - }, { - "type": "text", - "text": f"{question}" - }] - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - add_generation_prompt=True, - tokenize=False) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], + } + ] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) stop_token_ids = None return ModelRequestData( engine_args=engine_args, @@ -693,7 +688,8 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: prompts = [ f"<|im_start|>user <image>\n{question}<|im_end|> \ - <|im_start|>assistant\n" for question in questions + <|im_start|>assistant\n" + for question in questions ] return ModelRequestData( @@ -717,15 +713,13 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -748,15 +742,13 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -847,8 +839,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: # we have to manually specify the path of the lora weights. 
vision_lora_path = os.path.join(model_path, "vision-lora") prompts = [ - f"<|user|><|image_1|>{question}<|end|><|assistant|>" - for question in questions + f"<|user|><|image_1|>{question}<|end|><|assistant|>" for question in questions ] engine_args = EngineArgs( model=model_path, @@ -915,7 +906,6 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: # Qwen2-VL def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "Qwen/Qwen2-VL-7B-Instruct" engine_args = EngineArgs( @@ -936,10 +926,13 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: placeholder = "<|video_pad|>" prompts = [ - ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions ] return ModelRequestData( @@ -950,7 +943,6 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: # Qwen2.5-VL def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "Qwen/Qwen2.5-VL-3B-Instruct" engine_args = EngineArgs( @@ -971,10 +963,13 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: placeholder = "<|video_pad|>" prompts = [ - ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" - f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions + ( + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions ] return ModelRequestData( @@ -1007,12 +1002,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str): default_system = ( "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " "Group, capable of perceiving auditory and visual inputs, as well as " - "generating text and speech.") + "generating text and speech." 
+ ) - prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n" - f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" - f"{question}<|im_end|>\n" - "<|im_start|>assistant\n") for question in questions] + prompts = [ + ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions + ] return ModelRequestData( engine_args=engine_args, prompts=prompts, @@ -1032,15 +1033,13 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: limit_mm_per_prompt={modality: 1}, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - messages = [[{ - 'role': 'user', - 'content': f"<image>\n{question}" - }] for question in questions] - prompts = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"<image>\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for SkyworkR1V # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py @@ -1104,8 +1103,7 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = convert_image_mode( - ImageAsset("cherry_blossom").pil_image, "RGB") + image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") img_questions = [ "What is the content of this image?", "Describe the content of this image in detail.", @@ -1120,8 +1118,7 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question - video = VideoAsset(name="baby_reading", - num_frames=args.num_frames).np_ndarrays + video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays vid_questions = ["Why is this video funny?"] return { @@ -1133,12 +1130,13 @@ def get_multi_modal_input(args): raise ValueError(msg) -def apply_image_repeat(image_repeat_prob, num_prompts, data, - prompts: list[str], modality): - """Repeats images with provided probability of "image_repeat_prob". +def apply_image_repeat( + image_repeat_prob, num_prompts, data, prompts: list[str], modality +): + """Repeats images with provided probability of "image_repeat_prob". Used to simulate hit/miss for the MM preprocessor cache. 
""" - assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + assert image_repeat_prob <= 1.0 and image_repeat_prob >= 0 no_yes = [0, 1] probs = [1.0 - image_repeat_prob, image_repeat_prob] @@ -1153,12 +1151,12 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, new_val = (i // 256 // 256, i // 256, i % 256) cur_image.putpixel((0, 0), new_val) - inputs.append({ - "prompt": prompts[i % len(prompts)], - "multi_modal_data": { - modality: cur_image + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: cur_image}, } - }) + ) return inputs @@ -1167,6 +1165,7 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, def time_counter(enable: bool): if enable: import time + start_time = time.time() yield elapsed_time = time.time() - start_time @@ -1179,54 +1178,65 @@ def time_counter(enable: bool): def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for text generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="llava", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument('--num-prompts', - type=int, - default=4, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - choices=['image', 'video'], - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models for text generation" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--num-prompts", type=int, default=4, help="Number of prompts to run." 
+ ) + parser.add_argument( + "--modality", + type=str, + default="image", + choices=["image", "video"], + help="Modality of the input.", + ) + parser.add_argument( + "--num-frames", + type=int, + default=16, + help="Number of frames to extract from the video.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) parser.add_argument( - '--image-repeat-prob', + "--image-repeat-prob", type=float, default=None, - help='Simulates the hit-ratio for multi-modal preprocessor cache' - ' (if enabled)') + help="Simulates the hit-ratio for multi-modal preprocessor cache (if enabled)", + ) parser.add_argument( - '--disable-mm-preprocessor-cache', - action='store_true', - help='If True, disables caching of multi-modal preprocessor/mapper.') + "--disable-mm-preprocessor-cache", + action="store_true", + help="If True, disables caching of multi-modal preprocessor/mapper.", + ) parser.add_argument( - '--time-generate', - action='store_true', - help='If True, then print the total generate() call time') + "--time-generate", + action="store_true", + help="If True, then print the total generate() call time", + ) parser.add_argument( - '--use-different-prompt-per-request', - action='store_true', - help='If True, then use different prompt (with the same multi-modal ' - 'data) for each request.') + "--use-different-prompt-per-request", + action="store_true", + help="If True, then use different prompt (with the same multi-modal " + "data) for each request.", + ) return parser.parse_args() @@ -1245,7 +1255,8 @@ def main(args): # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | { "seed": args.seed, @@ -1254,44 +1265,46 @@ def main(args): llm = LLM(**engine_args) # Don't want to check the flag multiple times, so just hijack `prompts`. - prompts = req_data.prompts if args.use_different_prompt_per_request else [ - req_data.prompts[0] - ] + prompts = ( + req_data.prompts + if args.use_different_prompt_per_request + else [req_data.prompts[0]] + ) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(temperature=0.2, - max_tokens=64, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids + ) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { "prompt": prompts[0], - "multi_modal_data": { - modality: data - }, + "multi_modal_data": {modality: data}, } else: # Batch inference if args.image_repeat_prob is not None: # Repeat images with specified probability of "image_repeat_prob" - inputs = apply_image_repeat(args.image_repeat_prob, - args.num_prompts, data, prompts, - modality) + inputs = apply_image_repeat( + args.image_repeat_prob, args.num_prompts, data, prompts, modality + ) else: # Use the same image for all prompts - inputs = [{ - "prompt": prompts[i % len(prompts)], - "multi_modal_data": { - modality: data - }, - } for i in range(args.num_prompts)] + inputs = [ + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: data}, + } + for i in range(args.num_prompts) + ] # Add LoRA request if applicable - lora_request = (req_data.lora_requests * - args.num_prompts if req_data.lora_requests else None) + lora_request = ( + req_data.lora_requests * args.num_prompts if req_data.lora_requests else None + ) with time_counter(args.time_generate): outputs = llm.generate( diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index 2637949551a1..cee02d06c607 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -6,6 +6,7 @@ For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" + from argparse import Namespace from dataclasses import asdict from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args @@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple): def run_e5_v(query: Query) -> ModelRequestData: - llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501 if query["modality"] == "text": text = query["text"] - prompt = llama3_template.format( - f"{text}\nSummary above sentence in one word: ") + prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ") image = None elif query["modality"] == "image": - prompt = llama3_template.format( - "<image>\nSummary above image in one word: ") + prompt = llama3_template.format("<image>\nSummary above image in one word: ") image = query["image"] else: - modality = query['modality'] + modality = query["modality"] raise ValueError(f"Unsupported query modality: '{modality}'") engine_args = EngineArgs( @@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData: image = query["image"] elif query["modality"] == "text+image": text = query["text"] - prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + prompt = ( + f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + ) image = query["image"] else: - modality = query['modality'] + modality = query["modality"] raise ValueError(f"Unsupported query modality: '{modality}'") engine_args = EngineArgs( @@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) @@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): if req_data.image is not None: mm_data["image"] = req_data.image - outputs = llm.embed({ - "prompt": req_data.prompt, - "multi_modal_data": mm_data, - }) + outputs = llm.embed( + { + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + } + ) print("-" * 50) for output in outputs: @@ -164,23 +168,30 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for multimodal embedding') - parser.add_argument('--model-name', - '-m', - type=str, - default="vlm2vec", - choices=model_example_map.keys(), - help='The name of the embedding model.') - parser.add_argument('--modality', - type=str, - default="image", - choices=get_args(QueryModality), - help='Modality of the input.') - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models for multimodal embedding" + ) + parser.add_argument( + "--model-name", + "-m", + type=str, + default="vlm2vec", + choices=model_example_map.keys(), + help="The name of the embedding model.", + ) + parser.add_argument( + "--modality", + type=str, + default="image", + 
choices=get_args(QueryModality), + help="Modality of the input.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 20a8e635e322..e776ff7fe6ae 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -4,6 +4,7 @@ multi-image input on vision language models for text generation, using the chat template defined by the model. """ + import os from argparse import Namespace from dataclasses import asdict @@ -59,8 +60,9 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, ) placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls) - prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" - "<|im_start|>assistant\n") + prompt = ( + f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n" + ) stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] return ModelRequestData( @@ -81,23 +83,21 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -106,8 +106,7 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_deepseek_vl2(question: str, - image_urls: list[str]) -> ModelRequestData: +def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "deepseek-ai/deepseek-vl2-tiny" engine_args = EngineArgs( @@ -118,8 +117,9 @@ def load_deepseek_vl2(question: str, limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholder = "".join(f"image_{i}:<image>\n" - for i, _ in enumerate(image_urls, start=1)) + placeholder = "".join( + f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1) + ) prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" return ModelRequestData( @@ -140,23 +140,21 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -176,15 +174,15 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: mm_processor_kwargs={"max_dynamic_patch": 4}, ) - placeholders = 
"\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m @@ -211,14 +209,13 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: # if you are running out of memory, you can reduce the "longest_edge". # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations mm_processor_kwargs={ - "size": { - "longest_edge": 2 * 364 - }, + "size": {"longest_edge": 2 * 364}, }, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 return ModelRequestData( engine_args=engine_args, @@ -238,15 +235,16 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: enforce_eager=True, limit_mm_per_prompt={"image": len(image_urls)}, mm_processor_kwargs={ - "max_image_size": { - "longest_edge": 384 - }, + "max_image_size": {"longest_edge": 384}, }, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + prompt = ( + f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 + ) return ModelRequestData( engine_args=engine_args, prompt=prompt, @@ -265,15 +263,15 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: mm_processor_kwargs={"max_dynamic_patch": 4}, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Stop tokens for InternVL # models variants may have different stop tokens @@ -301,23 +299,21 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + 
{"type": "text", "text": question}, + ], + } + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -338,24 +334,21 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] - processor = AutoProcessor.from_pretrained(model_name, - trust_remote_code=True) + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -419,15 +412,15 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: mm_processor_kwargs={"max_dynamic_patch": 4}, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -449,15 +442,15 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholders = "\n".join(f"Image-{i}: <image>\n" - for i, _ in enumerate(image_urls, start=1)) - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + placeholders = "\n".join( + f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) + ) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) return ModelRequestData( engine_args=engine_args, @@ -509,8 +502,9 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, mm_processor_kwargs={"num_crops": 4}, ) - placeholders = "\n".join(f"<|image_{i}|>" - for i, _ in enumerate(image_urls, start=1)) + placeholders = "\n".join( + f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1) + ) prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" return ModelRequestData( @@ -542,8 +536,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: 
mm_processor_kwargs={"dynamic_hd": 4}, ) - placeholders = "".join(f"<|image_{i}|>" - for i, _ in enumerate(image_urls, start=1)) + placeholders = "".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)) prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>" return ModelRequestData( @@ -554,8 +547,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_qwen_vl_chat(question: str, - image_urls: list[str]) -> ModelRequestData: +def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" engine_args = EngineArgs( model=model_name, @@ -565,24 +557,26 @@ def load_qwen_vl_chat(question: str, hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, limit_mm_per_prompt={"image": len(image_urls)}, ) - placeholders = "".join(f"Picture {i}: <img></img>\n" - for i, _ in enumerate(image_urls, start=1)) + placeholders = "".join( + f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1) + ) # This model does not have a chat_template attribute on its tokenizer, # so we need to explicitly pass it. We use ChatML since it's used in the # generation utils of the model: # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265 - tokenizer = AutoTokenizer.from_pretrained(model_name, - trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501 - messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True, - chat_template=chat_template) + messages = [{"role": "user", "content": f"{placeholders}\n{question}"}] + prompt = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + chat_template=chat_template, + ) stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] @@ -600,9 +594,11 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: - print('WARNING: `qwen-vl-utils` not installed, input images will not ' - 'be automatically resized. You can enable this functionality by ' - '`pip install qwen-vl-utils`.') + print( + "WARNING: `qwen-vl-utils` not installed, input images will not " + "be automatically resized. You can enable this functionality by " + "`pip install qwen-vl-utils`." + ) process_vision_info = None model_name = "Qwen/Qwen2-VL-7B-Instruct" @@ -616,26 +612,22 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": "system", - "content": "You are a helpful assistant." 
- }, { - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) if process_vision_info is None: image_data = [fetch_image(url) for url in image_urls] @@ -653,9 +645,11 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: from qwen_vl_utils import process_vision_info except ModuleNotFoundError: - print('WARNING: `qwen-vl-utils` not installed, input images will not ' - 'be automatically resized. You can enable this functionality by ' - '`pip install qwen-vl-utils`.') + print( + "WARNING: `qwen-vl-utils` not installed, input images will not " + "be automatically resized. You can enable this functionality by " + "`pip install qwen-vl-utils`." + ) process_vision_info = None model_name = "Qwen/Qwen2.5-VL-3B-Instruct" @@ -668,32 +662,27 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": - "user", - "content": [ - *placeholders, - { - "type": "text", - "text": question - }, - ], - }] + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] processor = AutoProcessor.from_pretrained(model_name) - prompt = processor.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) if process_vision_info is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages, - return_video_kwargs=False) + image_data, _ = process_vision_info(messages, return_video_kwargs=False) return ModelRequestData( engine_args=engine_args, @@ -726,23 +715,20 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: } -def run_generate(model, question: str, image_urls: list[str], - seed: Optional[int]): +def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]): req_data = model_example_map[model](question, image_urls) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} llm = LLM(**engine_args) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=256, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids + ) outputs = llm.generate( { "prompt": req_data.prompt, - "multi_modal_data": { - "image": req_data.image_data - }, + "multi_modal_data": {"image": req_data.image_data}, }, sampling_params=sampling_params, lora_request=req_data.lora_requests, @@ -755,38 +741,40 @@ def run_generate(model, question: str, image_urls: list[str], print("-" * 50) -def run_chat(model: str, question: str, image_urls: list[str], - seed: Optional[int]): +def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]): req_data = 
model_example_map[model](question, image_urls) # Disable other modalities to save memory default_limits = {"image": 0, "video": 0, "audio": 0} req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {}) + req_data.engine_args.limit_mm_per_prompt or {} + ) engine_args = asdict(req_data.engine_args) | {"seed": seed} llm = LLM(**engine_args) - sampling_params = SamplingParams(temperature=0.0, - max_tokens=256, - stop_token_ids=req_data.stop_token_ids) + sampling_params = SamplingParams( + temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids + ) outputs = llm.chat( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": question, - }, - *({ - "type": "image_url", - "image_url": { - "url": image_url + [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": question, }, - } for image_url in image_urls), - ], - }], + *( + { + "type": "image_url", + "image_url": {"url": image_url}, + } + for image_url in image_urls + ), + ], + } + ], sampling_params=sampling_params, chat_template=req_data.chat_template, lora_request=req_data.lora_requests, @@ -801,32 +789,39 @@ def run_chat(model: str, question: str, image_urls: list[str], def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input for text ' - 'generation') - parser.add_argument('--model-type', - '-m', - type=str, - default="phi3_v", - choices=model_example_map.keys(), - help='Huggingface "model_type".') - parser.add_argument("--method", - type=str, - default="generate", - choices=["generate", "chat"], - help="The method to run in `vllm.LLM`.") - parser.add_argument("--seed", - type=int, - default=None, - help="Set the seed when initializing `vllm.LLM`.") + description="Demo on using vLLM for offline inference with " + "vision language models that support multi-image input for text " + "generation" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="phi3_v", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--method", + type=str, + default="generate", + choices=["generate", "chat"], + help="The method to run in `vllm.LLM`.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) parser.add_argument( "--num-images", "-n", type=int, - choices=list(range(1, - len(IMAGE_URLS) + 1)), # the max number of images + choices=list(range(1, len(IMAGE_URLS) + 1)), # the max number of images default=2, - help="Number of images to use for the demo.") + help="Number of images to use for the demo.", + ) return parser.parse_args() @@ -835,7 +830,7 @@ def main(args: Namespace): method = args.method seed = args.seed - image_urls = IMAGE_URLS[:args.num_images] + image_urls = IMAGE_URLS[: args.num_images] if method == "generate": run_generate(model, QUESTION, image_urls, seed) diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index 36079ff11d07..cc190e91c141 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -17,16 +17,15 @@ def clear_line(n: int = 1) -> None: - LINE_UP = '\033[1A' - LINE_CLEAR = '\x1b[2K' + LINE_UP = "\033[1A" + LINE_CLEAR = "\x1b[2K" for _ in range(n): print(LINE_UP, end=LINE_CLEAR, flush=True) -def post_http_request(prompt: str, - api_url: str, - n: int = 1, - stream: bool = False) -> requests.Response: 
+def post_http_request( + prompt: str, api_url: str, n: int = 1, stream: bool = False +) -> requests.Response: headers = {"User-Agent": "Test Client"} pload = { "prompt": prompt, @@ -35,17 +34,14 @@ def post_http_request(prompt: str, "max_tokens": 16, "stream": stream, } - response = requests.post(api_url, - headers=headers, - json=pload, - stream=stream) + response = requests.post(api_url, headers=headers, json=pload, stream=stream) return response def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: - for chunk in response.iter_lines(chunk_size=8192, - decode_unicode=False, - delimiter=b"\n"): + for chunk in response.iter_lines( + chunk_size=8192, decode_unicode=False, delimiter=b"\n" + ): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"] diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index c2d4ef08ddbb..e57b94e8805f 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ b/examples/online_serving/cohere_rerank_client.py @@ -6,6 +6,7 @@ run: vllm serve BAAI/bge-reranker-base """ + from typing import Union import cohere @@ -16,28 +17,28 @@ query = "What is the capital of France?" documents = [ - "The capital of France is Paris", "Reranking is fun!", - "vLLM is an open-source framework for fast AI serving" + "The capital of France is Paris", + "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving", ] -def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str, - documents: list[str]) -> dict: +def cohere_rerank( + client: Union[Client, ClientV2], model: str, query: str, documents: list[str] +) -> dict: return client.rerank(model=model, query=query, documents=documents) def main(): # cohere v1 client - cohere_v1 = cohere.Client(base_url="http://localhost:8000", - api_key="sk-fake-key") + cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents) print("-" * 50) print("rerank_v1_result:\n", rerank_v1_result) print("-" * 50) # or the v2 - cohere_v2 = cohere.ClientV2("sk-fake-key", - base_url="http://localhost:8000") + cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents) print("rerank_v2_result:\n", rerank_v2_result) print("-" * 50) diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index c6d26778ee49..2ffba4a7ed3f 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -13,6 +13,7 @@ Note: This demo will be removed once the PDController implemented in PR 15343 (https://github.com/vllm-project/vllm/pull/15343) supports XpYd. 
""" + import argparse import ipaddress import itertools @@ -26,8 +27,7 @@ import aiohttp import requests import uvicorn -from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException, - Request, status) +from fastapi import APIRouter, Depends, FastAPI, Header, HTTPException, Request, status from fastapi.responses import JSONResponse, StreamingResponse AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -36,24 +36,24 @@ class SchedulingPolicy(ABC): - @abstractmethod def schedule(self, cycler: itertools.cycle): raise NotImplementedError("Scheduling Proxy is not set.") class Proxy: - def __init__( self, prefill_instances: list[str], decode_instances: list[str], model: str, scheduling_policy: SchedulingPolicy, - custom_create_completion: Optional[Callable[[Request], - StreamingResponse]] = None, - custom_create_chat_completion: Optional[Callable[ - [Request], StreamingResponse]] = None, + custom_create_completion: Optional[ + Callable[[Request], StreamingResponse] + ] = None, + custom_create_chat_completion: Optional[ + Callable[[Request], StreamingResponse] + ] = None, ): self.prefill_instances = prefill_instances self.decode_instances = decode_instances @@ -68,30 +68,30 @@ def __init__( def setup_routes(self): self.router.post( - "/v1/completions", - dependencies=[ - Depends(self.validate_json_request) - ])(self.custom_create_completion if self. - custom_create_completion else self.create_completion) + "/v1/completions", dependencies=[Depends(self.validate_json_request)] + )( + self.custom_create_completion + if self.custom_create_completion + else self.create_completion + ) self.router.post( - "/v1/chat/completions", - dependencies=[ - Depends(self.validate_json_request) - ])(self.custom_create_chat_completion if self. - custom_create_chat_completion else self.create_chat_completion) - self.router.get("/status", - response_class=JSONResponse)(self.get_status) - self.router.post("/instances/add", - dependencies=[Depends(self.api_key_authenticate) - ])(self.add_instance_endpoint) + "/v1/chat/completions", dependencies=[Depends(self.validate_json_request)] + )( + self.custom_create_chat_completion + if self.custom_create_chat_completion + else self.create_chat_completion + ) + self.router.get("/status", response_class=JSONResponse)(self.get_status) + self.router.post( + "/instances/add", dependencies=[Depends(self.api_key_authenticate)] + )(self.add_instance_endpoint) async def validate_json_request(self, raw_request: Request): content_type = raw_request.headers.get("content-type", "").lower() if content_type != "application/json": raise HTTPException( status_code=415, - detail= - "Unsupported Media Type: Only 'application/json' is allowed", + detail="Unsupported Media Type: Only 'application/json' is allowed", ) def api_key_authenticate(self, x_api_key: str = Header(...)): @@ -103,8 +103,7 @@ def api_key_authenticate(self, x_api_key: str = Header(...)): detail="Server configuration error.", ) if x_api_key != expected_api_key: - logger.warning("Unauthorized access attempt with API Key: %s", - x_api_key) + logger.warning("Unauthorized access attempt with API Key: %s", x_api_key) raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, detail="Forbidden: Invalid API Key.", @@ -113,8 +112,7 @@ def api_key_authenticate(self, x_api_key: str = Header(...)): async def validate_instance(self, instance: str) -> bool: url = f"http://{instance}/v1/models" try: - async with aiohttp.ClientSession( - timeout=AIOHTTP_TIMEOUT) as client: + async with 
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as client: logger.info("Verifying %s ...", instance) async with client.get(url) as response: if response.status == 200: @@ -122,12 +120,15 @@ async def validate_instance(self, instance: str) -> bool: if "data" in data and len(data["data"]) > 0: model_cur = data["data"][0].get("id", "") if model_cur == self.model: - logger.info("Instance: %s could be added.", - instance) + logger.info("Instance: %s could be added.", instance) return True else: - logger.warning("Mismatch model %s : %s != %s", - instance, model_cur, self.model) + logger.warning( + "Mismatch model %s : %s != %s", + instance, + model_cur, + self.model, + ) return False else: return False @@ -147,48 +148,47 @@ async def add_instance_endpoint(self, request: Request): instance_type = data.get("type") instance = data.get("instance") if instance_type not in ["prefill", "decode"]: - raise HTTPException(status_code=400, - detail="Invalid instance type.") + raise HTTPException(status_code=400, detail="Invalid instance type.") if not instance or ":" not in instance: - raise HTTPException(status_code=400, - detail="Invalid instance format.") + raise HTTPException(status_code=400, detail="Invalid instance format.") host, port_str = instance.split(":") try: if host != "localhost": ipaddress.ip_address(host) port = int(port_str) if not (0 < port < 65536): - raise HTTPException(status_code=400, - detail="Invalid port number.") + raise HTTPException(status_code=400, detail="Invalid port number.") except Exception as e: - raise HTTPException(status_code=400, - detail="Invalid instance address.") from e + raise HTTPException( + status_code=400, detail="Invalid instance address." + ) from e is_valid = await self.validate_instance(instance) if not is_valid: - raise HTTPException(status_code=400, - detail="Instance validation failed.") + raise HTTPException( + status_code=400, detail="Instance validation failed." + ) if instance_type == "prefill": if instance not in self.prefill_instances: self.prefill_instances.append(instance) - self.prefill_cycler = itertools.cycle( - self.prefill_instances) + self.prefill_cycler = itertools.cycle(self.prefill_instances) else: - raise HTTPException(status_code=400, - detail="Instance already exists.") + raise HTTPException( + status_code=400, detail="Instance already exists." + ) else: if instance not in self.decode_instances: self.decode_instances.append(instance) self.decode_cycler = itertools.cycle(self.decode_instances) else: - raise HTTPException(status_code=400, - detail="Instance already exists.") + raise HTTPException( + status_code=400, detail="Instance already exists." + ) - return JSONResponse(content={ - "message": - f"Added {instance} to {instance_type}_instances." 
- }) + return JSONResponse( + content={"message": f"Added {instance} to {instance_type}_instances."} + ) except HTTPException as http_exc: raise http_exc except Exception as e: @@ -197,16 +197,16 @@ async def add_instance_endpoint(self, request: Request): async def forward_request(self, url, data, use_chunked=True): async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" - } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} try: - async with session.post(url=url, json=data, - headers=headers) as response: + async with session.post( + url=url, json=data, headers=headers + ) as response: if 200 <= response.status < 300 or 400 <= response.status < 500: # noqa: E501 if use_chunked: async for chunk_bytes in response.content.iter_chunked( # noqa: E501 - 1024): + 1024 + ): yield chunk_bytes else: content = await response.read() @@ -217,20 +217,21 @@ async def forward_request(self, url, data, use_chunked=True): error_content = json.loads(error_content) except json.JSONDecodeError: error_content = error_content - logger.error("Request failed with status %s: %s", - response.status, error_content) + logger.error( + "Request failed with status %s: %s", + response.status, + error_content, + ) raise HTTPException( status_code=response.status, - detail= - f"Request failed with status {response.status}: " + detail=f"Request failed with status {response.status}: " f"{error_content}", ) except aiohttp.ClientError as e: logger.error("ClientError occurred: %s", str(e)) raise HTTPException( status_code=502, - detail= - "Bad Gateway: Error communicating with upstream server.", + detail="Bad Gateway: Error communicating with upstream server.", ) from e except Exception as e: logger.error("Unexpected error: %s", str(e)) @@ -258,8 +259,8 @@ async def create_completion(self, raw_request: Request): prefill_instance = self.schedule(self.prefill_cycler) try: async for _ in self.forward_request( - f"http://{prefill_instance}/v1/completions", - kv_prepare_request): + f"http://{prefill_instance}/v1/completions", kv_prepare_request + ): continue except HTTPException as http_exc: self.remove_instance_endpoint("prefill", prefill_instance) @@ -270,7 +271,8 @@ async def create_completion(self, raw_request: Request): try: generator = self.forward_request( - f"http://{decode_instance}/v1/completions", request) + f"http://{decode_instance}/v1/completions", request + ) except HTTPException as http_exc: self.remove_instance_endpoint("decode", decode_instance) raise http_exc @@ -295,8 +297,8 @@ async def create_chat_completion(self, raw_request: Request): prefill_instance = self.schedule(self.prefill_cycler) try: async for _ in self.forward_request( - f"http://{prefill_instance}/v1/chat/completions", - kv_prepare_request): + f"http://{prefill_instance}/v1/chat/completions", kv_prepare_request + ): continue except HTTPException as http_exc: self.remove_instance_endpoint("prefill", prefill_instance) @@ -306,8 +308,8 @@ async def create_chat_completion(self, raw_request: Request): try: generator = self.forward_request( - "http://" + decode_instance + "/v1/chat/completions", - request) + "http://" + decode_instance + "/v1/chat/completions", request + ) except HTTPException as http_exc: self.remove_instance_endpoint("decode", decode_instance) raise http_exc @@ -318,20 +320,20 @@ async def create_chat_completion(self, raw_request: Request): error_messages = [str(e) for e in exc_info if e] print("Error occurred in disagg 
proxy server") print(error_messages) - return StreamingResponse(content=iter(error_messages), - media_type="text/event-stream") + return StreamingResponse( + content=iter(error_messages), media_type="text/event-stream" + ) def remove_instance_endpoint(self, instance_type, instance): - if (instance_type == "decode" and instance in self.decode_instances): + if instance_type == "decode" and instance in self.decode_instances: self.decode_instances.remove(instance) self.decode_cycler = itertools.cycle(self.decode_instances) - if (instance_type == "prefill" and instance in self.decode_instances): + if instance_type == "prefill" and instance in self.decode_instances: self.prefill_instances.remove(instance) self.prefill_cycler = itertools.cycle(self.decode_instances) class RoundRobinSchedulingPolicy(SchedulingPolicy): - def __init__(self): super().__init__() @@ -340,15 +342,12 @@ def schedule(self, cycler: itertools.cycle) -> str: class ProxyServer: - def __init__( self, args: argparse.Namespace, scheduling_policy: Optional[SchedulingPolicy] = None, - create_completion: Optional[Callable[[Request], - StreamingResponse]] = None, - create_chat_completion: Optional[Callable[[Request], - StreamingResponse]] = None, + create_completion: Optional[Callable[[Request], StreamingResponse]] = None, + create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None, ): self.validate_parsed_serve_args(args) self.port = args.port @@ -356,8 +355,11 @@ def __init__( prefill_instances=[] if args.prefill is None else args.prefill, decode_instances=[] if args.decode is None else args.decode, model=args.model, - scheduling_policy=(scheduling_policy if scheduling_policy - is not None else RoundRobinSchedulingPolicy()), + scheduling_policy=( + scheduling_policy + if scheduling_policy is not None + else RoundRobinSchedulingPolicy() + ), custom_create_completion=create_completion, custom_create_chat_completion=create_chat_completion, ) @@ -382,11 +384,9 @@ def validate_instances(self, instances: list): ipaddress.ip_address(host) port = int(port) if not (0 < port < 65536): - raise ValueError( - f"Invalid port number in instance: {instance}") + raise ValueError(f"Invalid port number in instance: {instance}") except Exception as e: - raise ValueError( - f"Invalid instance {instance}: {str(e)}") from e + raise ValueError(f"Invalid instance {instance}: {str(e)}") from e def verify_model_config(self, instances: list, model: str) -> None: model_suffix = model.split("/")[-1] @@ -399,12 +399,14 @@ def verify_model_config(self, instances: list, model: str) -> None: if model_cur_suffix != model_suffix: raise ValueError( f"{instance} serves a different model: " - f"{model_cur} != {model}") + f"{model_cur} != {model}" + ) else: raise ValueError(f"Cannot get model id from {instance}!") except requests.RequestException as e: raise ValueError( - f"Error communicating with {instance}: {str(e)}") from e + f"Error communicating with {instance}: {str(e)}" + ) from e def run_server(self): app = FastAPI() @@ -417,11 +419,7 @@ def run_server(self): def parse_args(): # Todo: allow more config parser = argparse.ArgumentParser("vLLM disaggregated proxy server.") - parser.add_argument("--model", - "-m", - type=str, - required=True, - help="Model name") + parser.add_argument("--model", "-m", type=str, required=True, help="Model name") parser.add_argument( "--prefill", diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 314f1c5b7395..3f2a3d01b456 
100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -17,6 +17,7 @@ 2. Rename the downloaded file to: frpc_linux_amd64_v0.3 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc """ + import argparse import gradio as gr @@ -24,16 +25,12 @@ def format_history_to_openai(history): - history_openai_format = [{ - "role": "system", - "content": "You are a great AI assistant." - }] + history_openai_format = [ + {"role": "system", "content": "You are a great AI assistant."} + ] for human, assistant in history: history_openai_format.append({"role": "user", "content": human}) - history_openai_format.append({ - "role": "assistant", - "content": assistant - }) + history_openai_format.append({"role": "assistant", "content": assistant}) return history_openai_format @@ -49,17 +46,17 @@ def predict(message, history, client, model_name, temp, stop_token_ids): temperature=temp, stream=True, extra_body={ - 'repetition_penalty': - 1, - 'stop_token_ids': - [int(id.strip()) - for id in stop_token_ids.split(',')] if stop_token_ids else [] - }) + "repetition_penalty": 1, + "stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")] + if stop_token_ids + else [], + }, + ) # Collect all chunks and concatenate them into a full message full_message = "" for chunk in stream: - full_message += (chunk.choices[0].delta.content or "") + full_message += chunk.choices[0].delta.content or "" # Return the full message as a single response return full_message @@ -67,38 +64,34 @@ def predict(message, history, client, model_name, temp, stop_token_ids): def parse_args(): parser = argparse.ArgumentParser( - description='Chatbot Interface with Customizable Parameters') - parser.add_argument('--model-url', - type=str, - default='http://localhost:8000/v1', - help='Model URL') - parser.add_argument('-m', - '--model', - type=str, - required=True, - help='Model name for the chatbot') - parser.add_argument('--temp', - type=float, - default=0.8, - help='Temperature for text generation') - parser.add_argument('--stop-token-ids', - type=str, - default='', - help='Comma-separated stop token IDs') + description="Chatbot Interface with Customizable Parameters" + ) + parser.add_argument( + "--model-url", type=str, default="http://localhost:8000/v1", help="Model URL" + ) + parser.add_argument( + "-m", "--model", type=str, required=True, help="Model name for the chatbot" + ) + parser.add_argument( + "--temp", type=float, default=0.8, help="Temperature for text generation" + ) + parser.add_argument( + "--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs" + ) parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8001) return parser.parse_args() def build_gradio_interface(client, model_name, temp, stop_token_ids): - def chat_predict(message, history): - return predict(message, history, client, model_name, temp, - stop_token_ids) + return predict(message, history, client, model_name, temp, stop_token_ids) - return gr.ChatInterface(fn=chat_predict, - title="Chatbot Interface", - description="A simple chatbot powered by vLLM") + return gr.ChatInterface( + fn=chat_predict, + title="Chatbot Interface", + description="A simple chatbot powered by vLLM", + ) def main(): @@ -113,12 +106,13 @@ def main(): client = OpenAI(api_key=openai_api_key, base_url=openai_api_base) # Define the Gradio chatbot interface using the predict function - gradio_interface = 
build_gradio_interface(client, args.model, args.temp, - args.stop_token_ids) + gradio_interface = build_gradio_interface( + client, args.model, args.temp, args.stop_token_ids + ) - gradio_interface.queue().launch(server_name=args.host, - server_port=args.port, - share=True) + gradio_interface.queue().launch( + server_name=args.host, server_port=args.port, share=True + ) if __name__ == "__main__": diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index 2e7c2a0c5838..fd341ff493b5 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -17,6 +17,7 @@ 2. Rename the downloaded file to: frpc_linux_amd64_v0.3 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc """ + import argparse import json @@ -31,14 +32,11 @@ def http_bot(prompt): "stream": True, "max_tokens": 128, } - response = requests.post(args.model_url, - headers=headers, - json=pload, - stream=True) - - for chunk in response.iter_lines(chunk_size=8192, - decode_unicode=False, - delimiter=b"\n"): + response = requests.post(args.model_url, headers=headers, json=pload, stream=True) + + for chunk in response.iter_lines( + chunk_size=8192, decode_unicode=False, delimiter=b"\n" + ): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"][0] @@ -48,10 +46,10 @@ def http_bot(prompt): def build_demo(): with gr.Blocks() as demo: gr.Markdown("# vLLM text completion demo\n") - inputbox = gr.Textbox(label="Input", - placeholder="Enter text and press ENTER") - outputbox = gr.Textbox(label="Output", - placeholder="Generated result from the model") + inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER") + outputbox = gr.Textbox( + label="Output", placeholder="Generated result from the model" + ) inputbox.submit(http_bot, [inputbox], [outputbox]) return demo @@ -60,17 +58,15 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default=None) parser.add_argument("--port", type=int, default=8001) - parser.add_argument("--model-url", - type=str, - default="http://localhost:8000/generate") + parser.add_argument( + "--model-url", type=str, default="http://localhost:8000/generate" + ) return parser.parse_args() def main(args): demo = build_demo() - demo.queue().launch(server_name=args.host, - server_port=args.port, - share=True) + demo.queue().launch(server_name=args.host, server_port=args.port, share=True) if __name__ == "__main__": diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index 3076bba765ce..7eb3d2193f41 100644 --- a/examples/online_serving/jinaai_rerank_client.py +++ b/examples/online_serving/jinaai_rerank_client.py @@ -5,6 +5,7 @@ run: vllm serve BAAI/bge-reranker-base """ + import json import requests @@ -14,14 +15,13 @@ headers = {"accept": "application/json", "Content-Type": "application/json"} data = { - "model": - "BAAI/bge-reranker-base", - "query": - "What is the capital of France?", + "model": "BAAI/bge-reranker-base", + "query": "What is the capital of France?", "documents": [ "The capital of Brazil is Brasilia.", - "The capital of France is Paris.", "Horses and cows are both animals" - ] + "The capital of France is Paris.", + "Horses and cows are both animals", + ], } diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index 88bbbebd7478..65d74dccab80 100644 --- 
a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -9,17 +9,14 @@ # # Types copied from vllm.distributed.kv_events # -class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, - gc=False): +class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False): ts: float events: list[Any] -class KVCacheEvent(msgspec.Struct, - array_like=True, - omit_defaults=True, - gc=False, - tag=True): +class KVCacheEvent( + msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True +): """Base class for all KV cache-related events""" @@ -77,8 +74,9 @@ def main(): if last_seq >= 0 and seq > last_seq + 1: missed = seq - last_seq - 1 - print(f"Missed {missed} messages" - f" (last: {last_seq}, current: {seq})") + print( + f"Missed {missed} messages (last: {last_seq}, current: {seq})" + ) replay.send((last_seq + 1).to_bytes(8, "big")) diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index bf99777d5697..2856e3be3e2d 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -12,26 +12,22 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -messages = [{ - "role": "system", - "content": "You are a helpful assistant." -}, { - "role": "user", - "content": "Who won the world series in 2020?" -}, { - "role": "assistant", - "content": "The Los Angeles Dodgers won the World Series in 2020." -}, { - "role": "user", - "content": "Where was it played?" -}] +messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, +] def parse_args(): parser = argparse.ArgumentParser(description="Client for vLLM API server") - parser.add_argument("--stream", - action="store_true", - help="Enable streaming response") + parser.add_argument( + "--stream", action="store_true", help="Enable streaming response" + ) return parser.parse_args() diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 2707d46f46e2..8c3c6ecdd4b0 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -43,7 +43,7 @@ def encode_base64_content_from_url(content_url: str) -> str: with requests.get(content_url) as response: response.raise_for_status() - result = base64.b64encode(response.content).decode('utf-8') + result = base64.b64encode(response.content).decode("utf-8") return result @@ -51,10 +51,7 @@ def encode_base64_content_from_url(content_url: str) -> str: # Text-only inference def run_text_only(model: str) -> None: chat_completion = client.chat.completions.create( - messages=[{ - "role": "user", - "content": "What's the capital of France?" 
- }], + messages=[{"role": "user", "content": "What's the capital of France?"}], model=model, max_completion_tokens=64, ) @@ -65,26 +62,21 @@ def run_text_only(model: str) -> None: # Single-image input inference def run_single_image(model: str) -> None: - ## Use image url in the payload image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": image_url + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": {"url": image_url}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -95,22 +87,18 @@ def run_single_image(model: str) -> None: ## Use base64 encoded image in the payload image_base64 = encode_base64_content_from_url(image_url) chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -124,28 +112,22 @@ def run_multi_image(model: str) -> None: image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What are the animals in these images?" - }, - { - "type": "image_url", - "image_url": { - "url": image_url_duck + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + { + "type": "image_url", + "image_url": {"url": image_url_duck}, }, - }, - { - "type": "image_url", - "image_url": { - "url": image_url_lion + { + "type": "image_url", + "image_url": {"url": image_url_lion}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -161,22 +143,18 @@ def run_video(model: str) -> None: ## Use video url in the payload chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" - }, - { - "type": "video_url", - "video_url": { - "url": video_url + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this video?"}, + { + "type": "video_url", + "video_url": {"url": video_url}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -186,22 +164,18 @@ def run_video(model: str) -> None: ## Use base64 encoded video in the payload chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this video?" 
- }, - { - "type": "video_url", - "video_url": { - "url": f"data:video/mp4;base64,{video_base64}" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this video?"}, + { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{video_base64}"}, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -219,24 +193,22 @@ def run_audio(model: str) -> None: # OpenAI-compatible schema (`input_audio`) chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "input_audio", - "input_audio": { - # Any format supported by librosa is supported - "data": audio_base64, - "format": "wav" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this audio?"}, + { + "type": "input_audio", + "input_audio": { + # Any format supported by librosa is supported + "data": audio_base64, + "format": "wav", + }, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -246,23 +218,21 @@ def run_audio(model: str) -> None: # HTTP URL chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - # Any format supported by librosa is supported - "url": audio_url + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this audio?"}, + { + "type": "audio_url", + "audio_url": { + # Any format supported by librosa is supported + "url": audio_url + }, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -272,23 +242,21 @@ def run_audio(model: str) -> None: # base64 URL chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - # Any format supported by librosa is supported - "url": f"data:audio/ogg;base64,{audio_base64}" + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this audio?"}, + { + "type": "audio_url", + "audio_url": { + # Any format supported by librosa is supported + "url": f"data:audio/ogg;base64,{audio_base64}" + }, }, - }, - ], - }], + ], + } + ], model=model, max_completion_tokens=64, ) @@ -308,14 +276,17 @@ def run_audio(model: str) -> None: def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using OpenAI client for online serving with ' - 'multimodal language models served with vLLM.') - parser.add_argument('--chat-type', - '-c', - type=str, - default="single-image", - choices=list(example_function_map.keys()), - help='Conversation type with multimodal data.') + description="Demo on using OpenAI client for online serving with " + "multimodal language models served with vLLM." 
+ ) + parser.add_argument( + "--chat-type", + "-c", + type=str, + default="single-image", + choices=list(example_function_map.keys()), + help="Conversation type with multimodal data.", + ) return parser.parse_args() diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 94f9c1570586..a0d7841f644f 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -16,6 +16,7 @@ --chat-template examples/tool_chat_template_hermes.jinja \ --enable-auto-tool-choice --tool-call-parser hermes """ + import json from typing import Any @@ -25,55 +26,55 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -tools = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" - }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 'CA' which would mean 'California'" - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } +properties = { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, +} + +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": properties, + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, } -}] - -messages = [{ - "role": "user", - "content": "Hi! How are you doing today?" -}, { - "role": "assistant", - "content": "I'm doing well! How can I help you?" -}, { - "role": - "user", - "content": - "Can you tell me what the temperate will be in Dallas, in fahrenheit?" -}] - - -def get_current_weather(city: str, state: str, unit: 'str'): - return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is " - "partly cloudly, with highs in the 90's.") +] + +messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, + { + "role": "user", + "content": ( + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" + ), + }, +] + + +def get_current_weather(city: str, state: str, unit: "str"): + return ( + "The weather in Dallas, Texas is 85 degrees fahrenheit. It is " + "partly cloudly, with highs in the 90's." 
+ ) def handle_tool_calls_stream( @@ -82,10 +83,9 @@ def handle_tool_calls_stream( model: str, tools: list[dict[str, Any]], ) -> list[Any]: - tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) + tool_calls_stream = client.chat.completions.create( + messages=messages, model=model, tools=tools, stream=True + ) chunks = [] print("chunks: ") for chunk in tool_calls_stream: @@ -106,8 +106,7 @@ def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]: tool_call = chunk.choices[0].delta.tool_calls[0] if tool_call.index != tool_call_idx: if tool_call_idx >= 0: - print(f"streamed tool call arguments: " - f"{arguments[tool_call_idx]}") + print(f"streamed tool call arguments: {arguments[tool_call_idx]}") tool_call_idx = chunk.choices[0].delta.tool_calls[0].index arguments.append("") if tool_call.id: @@ -115,8 +114,7 @@ def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]: if tool_call.function: if tool_call.function.name: - print( - f"streamed tool call name: {tool_call.function.name}") + print(f"streamed tool call name: {tool_call.function.name}") if tool_call.function.arguments: arguments[tool_call_idx] += tool_call.function.arguments @@ -136,9 +134,9 @@ def main(): models = client.models.list() model = models.data[0].id - chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools) + chat_completion = client.chat.completions.create( + messages=messages, model=model, tools=tools + ) print("-" * 70) print("Chat completion results:") @@ -158,10 +156,12 @@ def main(): print("-" * 70) # Add tool call results to the conversation - messages.append({ - "role": "assistant", - "tool_calls": chat_completion.choices[0].message.tool_calls - }) + messages.append( + { + "role": "assistant", + "tool_calls": chat_completion.choices[0].message.tool_calls, + } + ) # Now, simulate a tool call available_tools = {"get_current_weather": get_current_weather} @@ -172,17 +172,18 @@ def main(): args = json.loads(call.function.arguments) result = tool_to_call(**args) print("tool_to_call result: ", result) - messages.append({ - "role": "tool", - "content": result, - "tool_call_id": call.id, - "name": call.function.name - }) - - chat_completion_2 = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=False) + messages.append( + { + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name, + } + ) + + chat_completion_2 = client.chat.completions.create( + messages=messages, model=model, tools=tools, stream=False + ) print("Chat completion2 results:") print(chat_completion_2) print("-" * 70) diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py index 97d900bb75f1..45c4232fe1de 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -28,18 +28,16 @@ "type": "object", "properties": { "city": { - "type": - "string", - "description": - "The city to find the weather for" + "type": "string", + "description": "The city to find the weather for" ", e.g. 'San Francisco'", }, "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the " - "city is in, e.g. 
'CA' which would mean 'California'", + "type": "string", + "description": ( + "the two-letter abbreviation for the state that the " + "city is in, e.g. 'CA' which would mean 'California'" + ), }, "unit": { "type": "string", @@ -60,22 +58,20 @@ "type": "object", "properties": { "city": { - "type": - "string", - "description": - "The city to get the forecast for, e.g. 'New York'", + "type": "string", + "description": ( + "The city to get the forecast for, e.g. 'New York'" + ), }, "state": { - "type": - "string", - "description": - "The two-letter abbreviation for the state, e.g. 'NY'", + "type": "string", + "description": ( + "The two-letter abbreviation for the state, e.g. 'NY'" + ), }, "days": { - "type": - "integer", - "description": - "Number of days to get the forecast for (1-7)", + "type": "integer", + "description": "Number of days to get the forecast for (1-7)", }, "unit": { "type": "string", @@ -90,19 +86,11 @@ ] messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, { "role": "user", - "content": "Hi! How are you doing today?" - }, - { - "role": "assistant", - "content": "I'm doing well! How can I help you?" - }, - { - "role": - "user", - "content": - "Can you tell me what the current weather is in Dallas \ + "content": "Can you tell me what the current weather is in Dallas \ and the forecast for the next 5 days, in fahrenheit?", }, ] @@ -123,17 +111,16 @@ def main(): model=model, tools=tools, tool_choice="required", - stream=True # Enable streaming response + stream=True, # Enable streaming response ) for chunk in chat_completion: if chunk.choices and chunk.choices[0].delta.tool_calls: print(chunk.choices[0].delta.tool_calls) - chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice="required") + chat_completion = client.chat.completions.create( + messages=messages, model=model, tools=tools, tool_choice="required" + ) print(chat_completion.choices[0].message.tool_calls) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 722d747a69bf..a4134ea43c4b 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -20,10 +20,9 @@ def guided_choice_completion(client: OpenAI, model: str): completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": "Classify this sentiment: vLLM is wonderful!" - }], + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], extra_body={"guided_choice": ["positive", "negative"]}, ) return completion.choices[0].message.content @@ -31,20 +30,21 @@ def guided_choice_completion(client: OpenAI, model: str): # Guided decoding by Regex def guided_regex_completion(client: OpenAI, model: str): - prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. Example result:" - "alan.turing@enigma.com\n") + prompt = ( + "Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. 
Example result:" + "alan.turing@enigma.com\n" + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={ - "guided_regex": r"\w+@\w+\.com\n", - "stop": ["\n"] - }, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, ) return completion.choices[0].message.content @@ -66,14 +66,18 @@ class CarDescription(BaseModel): def guided_json_completion(client: OpenAI, model: str): json_schema = CarDescription.model_json_schema() - prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") + prompt = ( + "Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's" + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_json": json_schema}, ) return completion.choices[0].message.content @@ -95,14 +99,18 @@ def guided_grammar_completion(client: OpenAI, model: str): number ::= "1 " | "2 " """ - prompt = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") + prompt = ( + "Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table." + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_grammar": simplified_sql_grammar}, ) return completion.choices[0].message.content @@ -110,19 +118,23 @@ def guided_grammar_completion(client: OpenAI, model: str): # Extra backend options def extra_backend_options_completion(client: OpenAI, model: str): - prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. Example result:" - "alan.turing@enigma.com\n") + prompt = ( + "Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n" + ) try: # The guided_decoding_disable_fallback option forces vLLM to use # xgrammar, so when it fails you get a 400 with the reason why completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={ "guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"], diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py index 08f939942508..c73208abe600 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py @@ -17,11 +17,10 @@ def main(): api_key=openai_api_key, ) - messages = [{ - "role": - "user", - "content": - """ + messages = [ + { + "role": "user", + "content": """ You have access to the following function to retrieve the weather in a city: { @@ -58,29 +57,28 @@ def main(): Given the previous instructions, what is the weather in New York City, Boston, and San Francisco? 
-""" - }] +""", + } + ] response = client.chat.completions.create( model=client.models.list().data[0].id, messages=messages, response_format={ - "type": - "structural_tag", - "structures": [{ - "begin": "<function=get_weather>", - "schema": { - "type": "object", - "properties": { - "city": { - "type": "string" - } - } - }, - "end": "</function>" - }], - "triggers": ["<function="] - }) + "type": "structural_tag", + "structures": [ + { + "begin": "<function=get_weather>", + "schema": { + "type": "object", + "properties": {"city": {"type": "string"}}, + }, + "end": "</function>", + } + ], + "triggers": ["<function="], + }, + ) print(response) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py index a04f0cdf12f7..1ca61a8d5895 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -27,21 +27,22 @@ def print_completion_details(completion): - print("reasoning_content: ", - completion.choices[0].message.reasoning_content) + print("reasoning_content: ", completion.choices[0].message.reasoning_content) print("content: ", completion.choices[0].message.content) # Guided decoding by Regex def guided_regex_completion(client: OpenAI, model: str): - prompt = ("What is the capital of France?") + prompt = "What is the capital of France?" completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={ "guided_regex": "(Paris|London)", }, @@ -57,13 +58,15 @@ class People(BaseModel): def guided_json_completion(client: OpenAI, model: str): json_schema = People.model_json_schema() - prompt = ("Generate a JSON with the name and age of one random person.") + prompt = "Generate a JSON with the name and age of one random person." completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_json": json_schema}, ) print_completion_details(completion) @@ -86,14 +89,18 @@ class CarDescription(BaseModel): def guided_car_json_completion(client: OpenAI, model: str): json_schema = CarDescription.model_json_schema() - prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") + prompt = ( + "Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's" + ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_json": json_schema}, ) print_completion_details(completion) @@ -116,14 +123,18 @@ def guided_grammar_completion(client: OpenAI, model: str): """ # This may be very slow https://github.com/vllm-project/vllm/issues/12122 - prompt = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") + prompt = ( + "Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table." 
+ ) completion = client.chat.completions.create( model=model, - messages=[{ - "role": "user", - "content": prompt, - }], + messages=[ + { + "role": "user", + "content": prompt, + } + ], extra_body={"guided_grammar": simplified_sql_grammar}, ) print_completion_details(completion) diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index 9417abd3989a..a5febad45863 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -20,9 +20,11 @@ # Now, simulate a tool call -def get_current_weather(city: str, state: str, unit: 'str'): - return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is " - "partly cloudly, with highs in the 90's.") +def get_current_weather(city: str, state: str, unit: "str"): + return ( + "The weather in Dallas, Texas is 85 degrees fahrenheit. It is " + "partly cloudly, with highs in the 90's." + ) available_tools = {"get_current_weather": get_current_weather} @@ -31,49 +33,47 @@ def get_current_weather(city: str, state: str, unit: 'str'): openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -tools = [{ - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "city": { - "type": - "string", - "description": - "The city to find the weather for, e.g. 'San Francisco'" - }, - "state": { - "type": - "string", - "description": - "the two-letter abbreviation for the state that the city is" - " in, e.g. 'CA' which would mean 'California'" - }, - "unit": { - "type": "string", - "description": "The unit to fetch the temperature in", - "enum": ["celsius", "fahrenheit"] - } +properties = { + "city": { + "type": "string", + "description": "The city to find the weather for, e.g. 'San Francisco'", + }, + "state": { + "type": "string", + "description": "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, +} + +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": properties, + "required": ["city", "state", "unit"], }, - "required": ["city", "state", "unit"] - } + }, } -}] -messages = [{ - "role": "user", - "content": "Hi! How are you doing today?" -}, { - "role": "assistant", - "content": "I'm doing well! How can I help you?" -}, { - "role": - "user", - "content": - "Can you tell me what the temperate will be in Dallas, in fahrenheit?" -}] +] +messages = [ + {"role": "user", "content": "Hi! How are you doing today?"}, + {"role": "assistant", "content": "I'm doing well! How can I help you?"}, + { + "role": "user", + "content": ( + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" 
+ ), + }, +] def extract_reasoning_and_calls(chunks: list): @@ -110,73 +110,55 @@ def main(): models = client.models.list() model = models.data[0].id + print("---------Full Generate With Automatic Function Calling-------------") + tool_calls = client.chat.completions.create( + messages=messages, model=model, tools=tools + ) + print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") + print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}") print( - "---------Full Generate With Automatic Function Calling-------------") - tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools) - print( - f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" + f"function arguments: " + f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}" ) - print(f"function name: " - f"{tool_calls.choices[0].message.tool_calls[0].function.name}") - print(f"function arguments: " - f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}") - print( - "----------Stream Generate With Automatic Function Calling-----------") - tool_calls_stream = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - stream=True) + print("----------Stream Generate With Automatic Function Calling-----------") + tool_calls_stream = client.chat.completions.create( + messages=messages, model=model, tools=tools, stream=True + ) chunks = list(tool_calls_stream) - reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) + reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks) print(f"reasoning_content: {reasoning_content}") print(f"function name: {function_names[0]}") print(f"function arguments: {arguments[0]}") - print( - "----------Full Generate With Named Function Calling-----------------") - tool_calls = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice={ - "type": "function", - "function": { - "name": - "get_current_weather" - } - }) + print("----------Full Generate With Named Function Calling-----------------") + tool_calls = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice={"type": "function", "function": {"name": "get_current_weather"}}, + ) tool_call = tool_calls.choices[0].message.tool_calls[0].function - print( - f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}" - ) + print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}") print(f"function name: {tool_call.name}") print(f"function arguments: {tool_call.arguments}") - print( - "----------Stream Generate With Named Function Calling--------------") + print("----------Stream Generate With Named Function Calling--------------") tool_calls_stream = client.chat.completions.create( messages=messages, model=model, tools=tools, - tool_choice={ - "type": "function", - "function": { - "name": "get_current_weather" - } - }, - stream=True) + tool_choice={"type": "function", "function": {"name": "get_current_weather"}}, + stream=True, + ) chunks = list(tool_calls_stream) - reasoning_content, arguments, function_names = extract_reasoning_and_calls( - chunks) + reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks) print(f"reasoning_content: {reasoning_content}") print(f"function name: {function_names[0]}") print(f"function arguments: {arguments[0]}") diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py 
b/examples/online_serving/openai_chat_completion_with_reasoning.py index 4bf7731cb41e..f6b8082115f1 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -45,12 +45,12 @@ def main(): # Round 2 messages.append({"role": "assistant", "content": content}) - messages.append({ - "role": - "user", - "content": - "How many Rs are there in the word 'strawberry'?", - }) + messages.append( + { + "role": "user", + "content": "How many Rs are there in the word 'strawberry'?", + } + ) response = client.chat.completions.create(model=model, messages=messages) reasoning_content = response.choices[0].message.reasoning_content diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index 9cc0a5f2476b..f984fbabf24f 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -43,9 +43,7 @@ def main(): # ruff: noqa: E501 # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}` - stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) + stream = client.chat.completions.create(model=model, messages=messages, stream=True) print("client: Start streaming chat completions...") printed_reasoning_content = False diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index c850b5aa2f80..ee519e555ff7 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -14,26 +14,17 @@ def vlm2vec(): response = requests.post( "http://localhost:8000/v1/embeddings", json={ - "model": - "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "Represent the given image." - }, - ], - }], - "encoding_format": - "float", + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + } + ], + "encoding_format": "float", }, ) response.raise_for_status() @@ -45,19 +36,20 @@ def vlm2vec(): def dse_qwen2_vl(inp: dict): # Embedding an Image if inp["type"] == "image": - messages = [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": inp["image_url"], - } - }, { - "type": "text", - "text": "What is shown in this image?" 
- }] - }] + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": inp["image_url"], + }, + }, + {"type": "text", "text": "What is shown in this image?"}, + ], + } + ] # Embedding a Text Query else: # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image @@ -66,23 +58,21 @@ def dse_qwen2_vl(inp: dict): image_placeholder = Image.new("RGB", (56, 56)) image_placeholder.save(buffer, "png") buffer.seek(0) - image_placeholder = base64.b64encode(buffer.read()).decode('utf-8') - messages = [{ - "role": - "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_placeholder}", - } - }, - { - "type": "text", - "text": f"Query: {inp['content']}" - }, - ] - }] + image_placeholder = base64.b64encode(buffer.read()).decode("utf-8") + messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_placeholder}", + }, + }, + {"type": "text", "text": f"Query: {inp['content']}"}, + ], + } + ] response = requests.post( "http://localhost:8000/v1/embeddings", @@ -101,12 +91,15 @@ def dse_qwen2_vl(inp: dict): def parse_args(): parser = argparse.ArgumentParser( "Script to call a specified VLM through the API. Make sure to serve " - "the model with --task embed before running this.") - parser.add_argument("--model", - type=str, - choices=["vlm2vec", "dse_qwen2_vl"], - required=True, - help="Which model to call.") + "the model with --task embed before running this." + ) + parser.add_argument( + "--model", + type=str, + choices=["vlm2vec", "dse_qwen2_vl"], + required=True, + help="Which model to call.", + ) return parser.parse_args() @@ -114,16 +107,20 @@ def main(args): if args.model == "vlm2vec": vlm2vec() elif args.model == "dse_qwen2_vl": - dse_qwen2_vl({ - "type": "image", - "image_url": image_url, - }) - dse_qwen2_vl({ - "type": "text", - "content": "What is the weather like today?", - }) + dse_qwen2_vl( + { + "type": "image", + "image_url": image_url, + } + ) + dse_qwen2_vl( + { + "type": "text", + "content": "What is the weather like today?", + } + ) -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() main(args) diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/openai_classification_client.py index 99241346373e..649cfa5d6686 100644 --- a/examples/online_serving/openai_classification_client.py +++ b/examples/online_serving/openai_classification_client.py @@ -16,9 +16,7 @@ def parse_args(): parse = argparse.ArgumentParser() parse.add_argument("--host", type=str, default="localhost") parse.add_argument("--port", type=int, default=8000) - parse.add_argument("--model", - type=str, - default="jason9693/Qwen2.5-1.5B-apeach") + parse.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach") return parse.parse_args() diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 77f721921da2..b1d21b5e4b9f 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -11,9 +11,9 @@ def parse_args(): parser = argparse.ArgumentParser(description="Client for vLLM API server") - parser.add_argument("--stream", - action="store_true", - help="Enable streaming response") + parser.add_argument( + "--stream", action="store_true", help="Enable streaming response" + ) return parser.parse_args() @@ -34,7 +34,8 @@ def main(args): echo=False, n=2, 
stream=args.stream, - logprobs=3) + logprobs=3, + ) print("-" * 50) print("Completion results:") diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 20a64ddb2141..7891e14cb71e 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -4,6 +4,7 @@ Run `vllm serve <model> --task score` to start up the server in vLLM. """ + import argparse import pprint @@ -38,9 +39,7 @@ def main(args): pprint.pprint(score_response.json()) text_1 = "What is the capital of France?" - text_2 = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." - ] + text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) print("\nPrompt when text_1 is string and text_2 is a list:") @@ -48,12 +47,8 @@ def main(args): print("\nScore Response:") pprint.pprint(score_response.json()) - text_1 = [ - "What is the capital of Brazil?", "What is the capital of France?" - ] - text_2 = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." - ] + text_1 = ["What is the capital of Brazil?", "What is the capital of France?"] + text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) print("\nPrompt when text_1 and text_2 are both lists:") diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index bc217f7ca7a0..a055654e9133 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -21,7 +21,7 @@ def main(): # ruff: noqa: E501 input=[ "Hello my name is", - "The best thing about vLLM is that it supports many different models" + "The best thing about vLLM is that it supports many different models", ], model=model, ) diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index abcfe27c2769..2620a1232024 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -5,6 +5,7 @@ Run `vllm serve <model> --task <embed|classify|reward|score>` to start up the server in vLLM. """ + import argparse import pprint @@ -21,9 +22,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--model", - type=str, - default="jason9693/Qwen2.5-1.5B-apeach") + parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach") return parser.parse_args() @@ -42,15 +41,13 @@ def main(args): # Input like Chat API prompt = { - "model": - model_name, - "messages": [{ - "role": "user", - "content": [{ - "type": "text", - "text": "vLLM is great!" 
- }], - }] + "model": model_name, + "messages": [ + { + "role": "user", + "content": [{"type": "text", "text": "vLLM is great!"}], + } + ], } pooling_response = post_http_request(prompt=prompt, api_url=api_url) print("Pooling Response:") diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index 66e622672ef2..eb501ae72aa9 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -7,8 +7,8 @@ from vllm.assets.audio import AudioAsset -mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path() -winning_call = AudioAsset('winning_call').get_local_path() +mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path() +winning_call = AudioAsset("winning_call").get_local_path() # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" @@ -31,7 +31,8 @@ def sync_openai(): extra_body=dict( seed=4419, repetition_penalty=1.3, - )) + ), + ) print("transcription result:", transcription.text) @@ -42,33 +43,30 @@ def sync_openai(): async def stream_openai_response(): data = { "language": "en", - 'stream': True, + "stream": True, "model": "openai/whisper-large-v3", } url = openai_api_base + "/audio/transcriptions" headers = {"Authorization": f"Bearer {openai_api_key}"} - print("transcription result:", end=' ') + print("transcription result:", end=" ") async with httpx.AsyncClient() as client: with open(str(winning_call), "rb") as f: - async with client.stream('POST', - url, - files={'file': f}, - data=data, - headers=headers) as response: + async with client.stream( + "POST", url, files={"file": f}, data=data, headers=headers + ) as response: async for line in response.aiter_lines(): # Each line is a JSON object prefixed with 'data: ' if line: - if line.startswith('data: '): - line = line[len('data: '):] + if line.startswith("data: "): + line = line[len("data: ") :] # Last chunk, stream ends - if line.strip() == '[DONE]': + if line.strip() == "[DONE]": break # Parse the JSON response chunk = json.loads(line) # Extract and print the content - content = chunk['choices'][0].get('delta', - {}).get('content') - print(content, end='') + content = chunk["choices"][0].get("delta", {}).get("content") + print(content, end="") # Run the asynchronous function diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py index a8b353090d79..33d365f0caa5 100644 --- a/examples/online_serving/opentelemetry/dummy_client.py +++ b/examples/online_serving/opentelemetry/dummy_client.py @@ -1,14 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 import requests -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( - OTLPSpanExporter) +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import (BatchSpanProcessor, - ConsoleSpanExporter) +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.trace import SpanKind, set_tracer_provider -from opentelemetry.trace.propagation.tracecontext import ( - TraceContextTextMapPropagator) +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator trace_provider = TracerProvider() set_tracer_provider(trace_provider) diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py 
b/examples/online_serving/prompt_embed_inference_with_openai_client.py index ea580f1b432b..85ea2340736e 100644 --- a/examples/online_serving/prompt_embed_inference_with_openai_client.py +++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py @@ -26,6 +26,7 @@ - torch - openai """ + import base64 import io @@ -44,17 +45,13 @@ def main(): # Transformers tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) - transformers_model = transformers.AutoModelForCausalLM.from_pretrained( - model_name) + transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name) # Refer to the HuggingFace repo for the correct format to use - chat = [{ - "role": "user", - "content": "Please tell me about the capital of France." - }] - token_ids = tokenizer.apply_chat_template(chat, - add_generation_prompt=True, - return_tensors='pt') + chat = [{"role": "user", "content": "Please tell me about the capital of France."}] + token_ids = tokenizer.apply_chat_template( + chat, add_generation_prompt=True, return_tensors="pt" + ) embedding_layer = transformers_model.get_input_embeddings() prompt_embeds = embedding_layer(token_ids).squeeze(0) @@ -64,7 +61,7 @@ def main(): torch.save(prompt_embeds, buffer) buffer.seek(0) binary_data = buffer.read() - encoded_embeds = base64.b64encode(binary_data).decode('utf-8') + encoded_embeds = base64.b64encode(binary_data).decode("utf-8") completion = client.completions.create( model=model_name, @@ -75,7 +72,8 @@ def main(): temperature=0.0, # NOTE: The OpenAI client allows passing in extra JSON body via the # `extra_body` argument. - extra_body={"prompt_embeds": encoded_embeds}) + extra_body={"prompt_embeds": encoded_embeds}, + ) print("-" * 30) print(completion.choices[0].text) diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py index e2dce107e78a..a76020130c3a 100644 --- a/examples/online_serving/ray_serve_deepseek.py +++ b/examples/online_serving/ray_serve_deepseek.py @@ -28,9 +28,7 @@ }, # Change to the accelerator type of the node accelerator_type="H100", - runtime_env={"env_vars": { - "VLLM_USE_V1": "1" - }}, + runtime_env={"env_vars": {"VLLM_USE_V1": "1"}}, # Customize engine arguments as needed (e.g. 
vLLM engine kwargs) engine_kwargs={ "tensor_parallel_size": 8, diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/online_serving/retrieval_augmented_generation_with_langchain.py index 73063065cb36..37af3b3887f5 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py +++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py @@ -55,7 +55,7 @@ def load_and_split_documents(config: dict[str, Any]): Load and split documents from web URL """ try: - loader = WebBaseLoader(web_paths=(config["url"], )) + loader = WebBaseLoader(web_paths=(config["url"],)) docs = loader.load() text_splitter = RecursiveCharacterTextSplitter( @@ -121,64 +121,71 @@ def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate): """ Set up question answering chain """ - return ({ - "context": retriever | format_docs, - "question": RunnablePassthrough(), - } - | prompt - | llm - | StrOutputParser()) + return ( + { + "context": retriever | format_docs, + "question": RunnablePassthrough(), + } + | prompt + | llm + | StrOutputParser() + ) def get_parser() -> argparse.ArgumentParser: """ Parse command line arguments """ - parser = argparse.ArgumentParser(description='RAG with vLLM and langchain') + parser = argparse.ArgumentParser(description="RAG with vLLM and langchain") # Add command line arguments - parser.add_argument('--vllm-api-key', - default="EMPTY", - help='API key for vLLM compatible services') - parser.add_argument('--vllm-embedding-endpoint', - default="http://localhost:8000/v1", - help='Base URL for embedding service') - parser.add_argument('--vllm-chat-endpoint', - default="http://localhost:8001/v1", - help='Base URL for chat service') - parser.add_argument('--uri', - default="./milvus.db", - help='URI for Milvus database') parser.add_argument( - '--url', - default=("https://docs.vllm.ai/en/latest/getting_started/" - "quickstart.html"), - help='URL of the document to process') - parser.add_argument('--embedding-model', - default="ssmits/Qwen2-7B-Instruct-embed-base", - help='Model name for embeddings') - parser.add_argument('--chat-model', - default="qwen/Qwen1.5-0.5B-Chat", - help='Model name for chat') - parser.add_argument('-i', - '--interactive', - action='store_true', - help='Enable interactive Q&A mode') - parser.add_argument('-k', - '--top-k', - type=int, - default=3, - help='Number of top results to retrieve') - parser.add_argument('-c', - '--chunk-size', - type=int, - default=1000, - help='Chunk size for document splitting') - parser.add_argument('-o', - '--chunk-overlap', - type=int, - default=200, - help='Chunk overlap for document splitting') + "--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services" + ) + parser.add_argument( + "--vllm-embedding-endpoint", + default="http://localhost:8000/v1", + help="Base URL for embedding service", + ) + parser.add_argument( + "--vllm-chat-endpoint", + default="http://localhost:8001/v1", + help="Base URL for chat service", + ) + parser.add_argument("--uri", default="./milvus.db", help="URI for Milvus database") + parser.add_argument( + "--url", + default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"), + help="URL of the document to process", + ) + parser.add_argument( + "--embedding-model", + default="ssmits/Qwen2-7B-Instruct-embed-base", + help="Model name for embeddings", + ) + parser.add_argument( + "--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat" + ) + parser.add_argument( + "-i", 
"--interactive", action="store_true", help="Enable interactive Q&A mode" + ) + parser.add_argument( + "-k", "--top-k", type=int, default=3, help="Number of top results to retrieve" + ) + parser.add_argument( + "-c", + "--chunk-size", + type=int, + default=1000, + help="Chunk size for document splitting", + ) + parser.add_argument( + "-o", + "--chunk-overlap", + type=int, + default=200, + help="Chunk overlap for document splitting", + ) return parser @@ -198,7 +205,7 @@ def init_config(args: Namespace): "url": args.url, "chunk_size": args.chunk_size, "chunk_overlap": args.chunk_overlap, - "top_k": args.top_k + "top_k": args.top_k, } @@ -230,7 +237,7 @@ def main(): while True: question = input("\nPlease enter your question: ") - if question.lower() in ['q', 'quit']: + if question.lower() in ["q", "quit"]: print("\nThank you for using! Goodbye!") break @@ -238,7 +245,7 @@ def main(): print(output) else: # Default single question mode - question = ("How to install vLLM?") + question = "How to install vLLM?" output = qa_chain.invoke(question) print("-" * 50) print(output) diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py index a8f76dfe4c69..08796b1b3a54 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py +++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py @@ -35,6 +35,7 @@ - Default ports: 8000 (embedding), 8001 (chat) - First run may take time to download models """ + import argparse from argparse import Namespace from typing import Any @@ -59,7 +60,7 @@ def init_config(args: Namespace): "db_path": args.db_path, "chunk_size": args.chunk_size, "chunk_overlap": args.chunk_overlap, - "top_k": args.top_k + "top_k": args.top_k, } @@ -117,52 +118,58 @@ def query_document(index: VectorStoreIndex, question: str, top_k: int): def get_parser() -> argparse.ArgumentParser: """Parse command line arguments""" - parser = argparse.ArgumentParser( - description='RAG with vLLM and LlamaIndex') + parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex") # Add command line arguments parser.add_argument( - '--url', - default=("https://docs.vllm.ai/en/latest/getting_started/" - "quickstart.html"), - help='URL of the document to process') - parser.add_argument('--embedding-model', - default="ssmits/Qwen2-7B-Instruct-embed-base", - help='Model name for embeddings') - parser.add_argument('--chat-model', - default="qwen/Qwen1.5-0.5B-Chat", - help='Model name for chat') - parser.add_argument('--vllm-api-key', - default="EMPTY", - help='API key for vLLM compatible services') - parser.add_argument('--embedding-endpoint', - default="http://localhost:8000/v1", - help='Base URL for embedding service') - parser.add_argument('--chat-endpoint', - default="http://localhost:8001/v1", - help='Base URL for chat service') - parser.add_argument('--db-path', - default="./milvus_demo.db", - help='Path to Milvus database') - parser.add_argument('-i', - '--interactive', - action='store_true', - help='Enable interactive Q&A mode') - parser.add_argument('-c', - '--chunk-size', - type=int, - default=1000, - help='Chunk size for document splitting') - parser.add_argument('-o', - '--chunk-overlap', - type=int, - default=200, - help='Chunk overlap for document splitting') - parser.add_argument('-k', - '--top-k', - type=int, - default=3, - help='Number of top results to retrieve') + "--url", + 
default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"), + help="URL of the document to process", + ) + parser.add_argument( + "--embedding-model", + default="ssmits/Qwen2-7B-Instruct-embed-base", + help="Model name for embeddings", + ) + parser.add_argument( + "--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat" + ) + parser.add_argument( + "--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services" + ) + parser.add_argument( + "--embedding-endpoint", + default="http://localhost:8000/v1", + help="Base URL for embedding service", + ) + parser.add_argument( + "--chat-endpoint", + default="http://localhost:8001/v1", + help="Base URL for chat service", + ) + parser.add_argument( + "--db-path", default="./milvus_demo.db", help="Path to Milvus database" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="Enable interactive Q&A mode" + ) + parser.add_argument( + "-c", + "--chunk-size", + type=int, + default=1000, + help="Chunk size for document splitting", + ) + parser.add_argument( + "-o", + "--chunk-overlap", + type=int, + default=200, + help="Chunk overlap for document splitting", + ) + parser.add_argument( + "-k", "--top-k", type=int, default=3, help="Number of top results to retrieve" + ) return parser @@ -193,7 +200,7 @@ def main(): question = input("\nEnter your question: ") # Check for exit command - if question.lower() in ['quit', 'exit', 'q']: + if question.lower() in ["quit", "exit", "q"]: print("Exiting interactive mode...") break diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py index d8a0f211d44d..0722aa671f66 100644 --- a/examples/online_serving/streamlit_openai_chatbot_webserver.py +++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py @@ -26,6 +26,7 @@ streamlit run streamlit_openai_chatbot_webserver.py \ --logger.level=debug """ + import os from datetime import datetime @@ -33,8 +34,8 @@ from openai import OpenAI # Get command line arguments from environment variables -openai_api_key = os.getenv('VLLM_API_KEY', "EMPTY") -openai_api_base = os.getenv('VLLM_API_BASE', "http://localhost:8000/v1") +openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY") +openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1") # Initialize session states for managing chat sessions if "sessions" not in st.session_state: @@ -81,9 +82,9 @@ def get_llm_response(messages, model): Streaming response object or error message string """ try: - response = client.chat.completions.create(model=model, - messages=messages, - stream=True) + response = client.chat.completions.create( + model=model, messages=messages, stream=True + ) return response except Exception as e: st.error(f"Error details: {str(e)}") @@ -92,8 +93,9 @@ def get_llm_response(messages, model): # Sidebar - API Settings first st.sidebar.title("API Settings") -new_api_base = st.sidebar.text_input("API Base URL:", - value=st.session_state.api_base_url) +new_api_base = st.sidebar.text_input( + "API Base URL:", value=st.session_state.api_base_url +) if new_api_base != st.session_state.api_base_url: st.session_state.api_base_url = new_api_base st.rerun() @@ -109,16 +111,20 @@ def get_llm_response(messages, model): for session_id in sorted(st.session_state.sessions.keys(), reverse=True): # Mark the active session with a pinned button if session_id == st.session_state.active_session: - st.sidebar.button(f"📍 {session_id}", - key=session_id, - 
type="primary", - on_click=switch_to_chat_session, - args=(session_id, )) + st.sidebar.button( + f"📍 {session_id}", + key=session_id, + type="primary", + on_click=switch_to_chat_session, + args=(session_id,), + ) else: - st.sidebar.button(f"Session {session_id}", - key=session_id, - on_click=switch_to_chat_session, - args=(session_id, )) + st.sidebar.button( + f"Session {session_id}", + key=session_id, + on_click=switch_to_chat_session, + args=(session_id,), + ) # Main interface st.title("vLLM Chat Assistant") @@ -145,18 +151,18 @@ def get_llm_response(messages, model): if prompt := st.chat_input("Type your message here..."): # Save user message to session st.session_state.messages.append({"role": "user", "content": prompt}) - st.session_state.sessions[ - st.session_state.current_session] = st.session_state.messages + st.session_state.sessions[st.session_state.current_session] = ( + st.session_state.messages + ) # Display user message with st.chat_message("user"): st.write(prompt) # Prepare messages for llm - messages_for_llm = [{ - "role": m["role"], - "content": m["content"] - } for m in st.session_state.messages] + messages_for_llm = [ + {"role": m["role"], "content": m["content"]} for m in st.session_state.messages + ] # Generate and display llm response with st.chat_message("assistant"): @@ -179,7 +185,4 @@ def get_llm_response(messages, model): message_placeholder.markdown(full_response) # Save llm response to session history - st.session_state.messages.append({ - "role": "assistant", - "content": full_response - }) + st.session_state.messages.append({"role": "assistant", "content": full_response}) diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py index 4826e8e20528..0781a27f19c5 100644 --- a/examples/online_serving/utils.py +++ b/examples/online_serving/utils.py @@ -16,10 +16,10 @@ def get_first_model(client: OpenAI) -> str: f"{client.base_url} with API key {client.api_key}. Check\n" "1. the server is running\n" "2. the server URL is correct\n" - "3. the API key is correct") from e + "3. 
the API key is correct" + ) from e if len(models.data) == 0: - raise RuntimeError( - f"No models found on the vLLM server at {client.base_url}") + raise RuntimeError(f"No models found on the vLLM server at {client.base_url}") return models.data[0].id diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py index eedb47dfc12e..98eafb31ed4f 100644 --- a/examples/others/lmcache/cpu_offload_lmcache.py +++ b/examples/others/lmcache/cpu_offload_lmcache.py @@ -20,6 +20,7 @@ Learn more about LMCache environment setup, please refer to: https://docs.lmcache.ai/getting_started/installation.html """ + import argparse import contextlib import os @@ -49,8 +50,7 @@ def setup_environment_variables(vllm_version: str): @contextlib.contextmanager -def build_llm_with_lmcache(lmcache_connector: str, model: str, - vllm_version: str): +def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str): ktc = KVTransferConfig( kv_connector=lmcache_connector, kv_role="kv_both", @@ -97,18 +97,19 @@ def print_output( for output in outputs: generated_text = output.outputs[0].text print(f"Generated text: {generated_text!r}") - print(f"Generation took {time.time() - start:.2f} seconds, " - f"{req_str} request done.") + print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.") print("-" * 50) def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("-v", - "--version", - choices=["v0", "v1"], - default="v1", - help="Specify vLLM version (default: v1)") + parser.add_argument( + "-v", + "--version", + choices=["v0", "v1"], + default="v1", + help="Specify vLLM version (default: v1)", + ) return parser.parse_args() @@ -125,7 +126,6 @@ def main(): setup_environment_variables(args.version) with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm: - # This example script runs two requests with a shared prefix. # Define the shared prompt and specific prompts shared_prompt = "Hello, how are you?" * 1000 @@ -136,9 +136,7 @@ def main(): shared_prompt + "Tell me a very long story", ] - sampling_params = SamplingParams(temperature=0, - top_p=0.95, - max_tokens=10) + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) # Print the first output print_output(llm, first_prompt, sampling_params, "first") diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py index 66cc94185230..b2b7b3b2c1f9 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v0.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v0.py @@ -4,12 +4,13 @@ with LMCache. We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), and launch an additional LMCache server. -KV cache is transferred in the following manner: +KV cache is transferred in the following manner: vLLM prefill node -> LMCache server -> vLLM decode node. Note that `pip install lmcache` is needed to run this example. Learn more about LMCache in https://github.com/LMCache/LMCache. """ + import os import subprocess import time @@ -49,19 +50,23 @@ def run_prefill(prefill_done, prompts): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - ktc = KVTransferConfig(kv_connector="LMCacheConnector", - kv_role="kv_producer", - kv_rank=0, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="LMCacheConnector", + kv_role="kv_producer", + kv_rank=0, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # memory. 
Reduce the value if your GPU has less memory. - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) - - #llm.generate(prompts, sampling_params) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) + + # llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params) for output in outputs: generated_text = output.outputs[0].text @@ -79,17 +84,21 @@ def run_decode(prefill_done, prompts, timeout=1): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - ktc = KVTransferConfig(kv_connector="LMCacheConnector", - kv_role="kv_consumer", - kv_rank=1, - kv_parallel_size=2) + ktc = KVTransferConfig( + kv_connector="LMCacheConnector", + kv_role="kv_consumer", + kv_rank=1, + kv_parallel_size=2, + ) # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # of memory. Reduce the value if your GPU has less memory. - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) print("Waiting for prefill node to finish...") prefill_done.wait() @@ -105,10 +114,9 @@ def run_decode(prefill_done, prompts, timeout=1): def run_lmcache_server(port): - server_proc = subprocess.Popen([ - "python", "-m", "lmcache.experimental.server", "localhost", - str(port) - ]) + server_proc = subprocess.Popen( + ["python", "-m", "lmcache.experimental.server", "localhost", str(port)] + ) return server_proc diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py index 32d36da9f2e8..20155c203658 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -17,13 +17,17 @@ async def lifespan(app: FastAPI): Lifespan context manager to handle startup and shutdown events. 
""" # Startup: Initialize clients - prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1' - decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1' - - app.state.prefill_client = httpx.AsyncClient(timeout=None, - base_url=prefiller_base_url) - app.state.decode_client = httpx.AsyncClient(timeout=None, - base_url=decoder_base_url) + prefiller_base_url = ( + f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1" + ) + decoder_base_url = ( + f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1" + ) + + app.state.prefill_client = httpx.AsyncClient( + timeout=None, base_url=prefiller_base_url + ) + app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url) yield @@ -37,7 +41,6 @@ async def lifespan(app: FastAPI): class StatsCalculator: - def __init__(self): self._stats = [] self._last_log_time = time.time() @@ -51,13 +54,18 @@ def add(self, value): def _log_stats(self): # Print average, median, and 99th percentile np_arr = np.array(self._stats) - output_str = f"\nNum requests: {len(self._stats)}" + \ - "\nPrefill node TTFT stats:" + \ - f"\n - Average (ms): {np.mean(np_arr)}" + \ - f"\n - Median (ms): {np.median(np_arr)}" + \ - f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n" - print("===============================", output_str, - "===============================") + output_str = ( + f"\nNum requests: {len(self._stats)}" + + "\nPrefill node TTFT stats:" + + f"\n - Average (ms): {np.mean(np_arr)}" + + f"\n - Median (ms): {np.median(np_arr)}" + + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n" + ) + print( + "===============================", + output_str, + "===============================", + ) stats_calculator = StatsCalculator() @@ -82,15 +90,16 @@ def parse_args(): app.state.decode_client = None -async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, - req_data: dict): +async def send_request_to_service( + client: httpx.AsyncClient, endpoint: str, req_data: dict +): """ Send a request to a service using a persistent client. """ req_data = req_data.copy() - req_data['max_tokens'] = 1 - if 'max_completion_tokens' in req_data: - req_data['max_completion_tokens'] = 1 + req_data["max_tokens"] = 1 + if "max_completion_tokens" in req_data: + req_data["max_completion_tokens"] = 1 headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} response = await client.post(endpoint, json=req_data, headers=headers) @@ -98,14 +107,16 @@ async def send_request_to_service(client: httpx.AsyncClient, endpoint: str, return response -async def stream_service_response(client: httpx.AsyncClient, endpoint: str, - req_data: dict): +async def stream_service_response( + client: httpx.AsyncClient, endpoint: str, req_data: dict +): """ Asynchronously stream the response from a service using a persistent client. 
""" headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - async with client.stream("POST", endpoint, json=req_data, - headers=headers) as response: + async with client.stream( + "POST", endpoint, json=req_data, headers=headers + ) as response: response.raise_for_status() async for chunk in response.aiter_bytes(): yield chunk @@ -121,28 +132,28 @@ async def handle_completions(request: Request): req_data = await request.json() # Send request to prefill service, ignore the response - await send_request_to_service(app.state.prefill_client, "/completions", - req_data) + await send_request_to_service( + app.state.prefill_client, "/completions", req_data + ) et = time.time() stats_calculator.add(et - st) # Stream response from decode service async def generate_stream(): - async for chunk in stream_service_response(app.state.decode_client, - "/completions", - req_data): + async for chunk in stream_service_response( + app.state.decode_client, "/completions", req_data + ): yield chunk - return StreamingResponse(generate_stream(), - media_type="text/event-stream") + return StreamingResponse(generate_stream(), media_type="text/event-stream") except Exception as e: import sys import traceback + exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server" - " - completions endpoint") + print("Error occurred in disagg prefill proxy server - completions endpoint") print(e) print("".join(traceback.format_exception(*exc_info))) raise @@ -158,36 +169,39 @@ async def handle_chat_completions(request: Request): req_data = await request.json() # Send request to prefill service, ignore the response - await send_request_to_service(app.state.prefill_client, - "/chat/completions", req_data) + await send_request_to_service( + app.state.prefill_client, "/chat/completions", req_data + ) et = time.time() stats_calculator.add(et - st) # Stream response from decode service async def generate_stream(): - async for chunk in stream_service_response(app.state.decode_client, - "/chat/completions", - req_data): + async for chunk in stream_service_response( + app.state.decode_client, "/chat/completions", req_data + ): yield chunk - return StreamingResponse(generate_stream(), - media_type="text/event-stream") + return StreamingResponse(generate_stream(), media_type="text/event-stream") except Exception as e: import sys import traceback + exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server " - " - chat completions endpoint") + print( + "Error occurred in disagg prefill proxy server - chat completions endpoint" + ) print(e) print("".join(traceback.format_exception(*exc_info))) raise -if __name__ == '__main__': +if __name__ == "__main__": global global_args global_args = parse_args() import uvicorn + uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py index 7748f8ca6133..89945d67a6f3 100644 --- a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py +++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py @@ -3,13 +3,14 @@ This file demonstrates the example usage of remote KV cache sharing with LMCache. We will launch 2 vllm instances, and launch an additional LMCache server. -KV cache is transferred in the following manner: +KV cache is transferred in the following manner: (1) vLLM instance 1 -> LMCache server (KV cache store). (2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve). 
Note that lmcache needs to be installed to run this example. Learn more about LMCache in https://github.com/LMCache/LMCache. """ + import os import subprocess import time @@ -49,15 +50,16 @@ def run_store(store_done, prompts): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", - kv_role="kv_both") + ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both") # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # memory. Reduce the value if your GPU has less memory. - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) outputs = llm.generate(prompts, sampling_params) for output in outputs: @@ -76,15 +78,16 @@ def run_retrieve(store_done, prompts, timeout=1): sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) - ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", - kv_role="kv_both") + ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both") # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB # of memory. Reduce the value if your GPU has less memory. - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", - kv_transfer_config=ktc, - max_model_len=8000, - gpu_memory_utilization=0.8, - enforce_eager=True) + llm = LLM( + model="mistralai/Mistral-7B-Instruct-v0.2", + kv_transfer_config=ktc, + max_model_len=8000, + gpu_memory_utilization=0.8, + enforce_eager=True, + ) print("Waiting for KV cache store to finish...") store_done.wait() @@ -100,10 +103,9 @@ def run_retrieve(store_done, prompts, timeout=1): def run_lmcache_server(port): - server_proc = subprocess.Popen([ - "python", "-m", "lmcache.experimental.server", "localhost", - str(port) - ]) + server_proc = subprocess.Popen( + ["python", "-m", "lmcache.experimental.server", "localhost", str(port)] + ) return server_proc diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 38193b1c1002..175777630833 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -10,8 +10,11 @@ from vllm.engine.arg_utils import EngineArgs from vllm.lora.request import LoRARequest from vllm.model_executor.model_loader.tensorizer import ( - TensorizerArgs, TensorizerConfig, tensorize_lora_adapter, - tensorize_vllm_model) + TensorizerArgs, + TensorizerConfig, + tensorize_lora_adapter, + tensorize_vllm_model, +) from vllm.utils import FlexibleArgumentParser # yapf conflicts with isort for this docstring diff --git a/examples/pyproject.toml b/examples/pyproject.toml new file mode 100644 index 000000000000..f825cb203269 --- /dev/null +++ b/examples/pyproject.toml @@ -0,0 +1,54 @@ +# This local pyproject file is part of the migration from yapf to ruff format. 
+# It uses the same core rules as the main pyproject.toml file, but with the +# following differences: +# - ruff line length is overridden to 88 +# - deprecated typing ignores (UP006, UP035) have been removed + +[tool.ruff] +line-length = 88 +exclude = [ + # External file, leaving license intact + "examples/other/fp8/quantizer/quantize.py", + "vllm/vllm_flash_attn/flash_attn_interface.pyi" +] + +[tool.ruff.lint.per-file-ignores] +"vllm/third_party/**" = ["ALL"] +"vllm/version.py" = ["F401"] +"vllm/_version.py" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-logging-format + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", +] + +[tool.ruff.lint.isort] +known-first-party = ["vllm"] + +[tool.ruff.format] +docstring-code-format = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c642aa048586..62a734d795d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ ignore_patterns = [ ".buildkite/**", "benchmarks/**", "build/**", + "examples/**", ] [tool.ruff] @@ -144,6 +145,7 @@ skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora skip_glob = [ ".buildkite/*", "benchmarks/*", + "examples/*", ] use_parentheses = true skip_gitignore = true From 0eebd74842d56a2c84374ee2559ef99f9c841774 Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Tue, 27 May 2025 04:13:37 +0100 Subject: [PATCH 172/192] [Model][Gemma3] Simplify image input validation (#18710) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/model_executor/models/gemma3_mm.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 743542ec8dfa..c4ae5b50c451 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -504,18 +504,12 @@ def dtype(self): return next(self.parameters()).dtype def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - h = w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - if d.shape != expected_dims: - raise ValueError( - "The expected shape of pixel values per image per batch " - f"is {expected_dims}. You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - + image_size = self.config.vision_config.image_size + expected_dims = (3, image_size, image_size) + if data.shape[1:] != expected_dims: + raise ValueError( + "The expected shape of pixel values per image per batch is " + f"{expected_dims}. 
You supplied {tuple(data.shape)}.") return data def _parse_and_validate_image_input( From 1f88dbd2bb7abb34e9018838bfe5856a66c0f63e Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 27 May 2025 12:35:16 +0800 Subject: [PATCH 173/192] [Misc] improve web section group title display (#18684) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- docs/mkdocs/stylesheets/extra.css | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css index dd7b3460a319..088143ed5956 100644 --- a/docs/mkdocs/stylesheets/extra.css +++ b/docs/mkdocs/stylesheets/extra.css @@ -22,3 +22,15 @@ a:not(:has(svg)):not(.md-icon):not(.autorefs-external) { display: inline-block; } } + +/* Light mode: darker section titles */ +body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis { + color: rgba(0, 0, 0, 0.7) !important; + font-weight: 700; +} + +/* Dark mode: lighter gray section titles */ +body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .md-ellipsis { + color: rgba(255, 255, 255, 0.75) !important; + font-weight: 700; +} From 1f1b1bc03bfa36dbd0d884cede6f4dcdf0937e25 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Tue, 27 May 2025 12:40:28 +0800 Subject: [PATCH 174/192] [V1][Quantization] Add CUDA graph compatible v1 GGUF support (#18646) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/kernels/quantization/test_gguf.py | 4 +- tests/models/quantization/test_gguf.py | 8 +- vllm/engine/arg_utils.py | 8 - vllm/model_executor/layers/linear.py | 4 - .../layers/quantization/gguf.py | 223 ++++++++++++++---- 5 files changed, 188 insertions(+), 59 deletions(-) diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index e520e99b071c..ad755fe7f7a0 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -8,7 +8,6 @@ from huggingface_hub import snapshot_download import vllm._custom_ops as ops -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf from vllm.platforms import current_platform @@ -176,12 +175,11 @@ def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype, w2_dequant = torch.tensor(dequantize(w2.data, quant_type), device="cuda").to(dtype) - act = SiluAndMul() output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"), torch.tensor(w2.data, device="cuda"), topk_weights, - topk_ids, quant_type, quant_type, act) + topk_ids, quant_type, quant_type, "silu") ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights, topk_ids).reshape(output.shape) diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 3ff36502df57..5f17d12284a0 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -78,8 +78,12 @@ def gguf_model(self): ) MODELS = [ - LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, - DOLPHIN_CONFIG + LLAMA_CONFIG, + QWEN2_CONFIG, + PHI3_CONFIG, + GPT2_CONFIG, + # STABLELM_CONFIG, # enable this when v1 support head_size=80 + DOLPHIN_CONFIG, # STARCODER_CONFIG, # broken ] diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 
3b90880167dc..442e4100fea1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1291,14 +1291,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - # Some quantization is not compatible with torch.compile. - V1_UNSUPPORTED_QUANT = ["gguf"] - if model_config.quantization in V1_UNSUPPORTED_QUANT: - _raise_or_fallback( - feature_name=f"--quantization {model_config.quantization}", - recommend_to_remove=False) - return False - # No Embedding Models so far. if model_config.task not in ["generate"]: _raise_or_fallback(feature_name=f"--task {model_config.task}", diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dd2e477f3954..269ac043d26c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -587,8 +587,6 @@ def weight_loader(self, param.shard_id.append(loaded_shard_id) param.shard_id_map[loaded_shard_id] = len(param.data_container) param.data_container.append(loaded_weight) - if len(param.data_container) == 2: - self.qweight = param.materialize_nested() return param_data = param.data @@ -982,8 +980,6 @@ def weight_loader(self, param.shard_id.append(loaded_shard_id) param.shard_id_map[loaded_shard_id] = len(param.data_container) param.data_container.append(loaded_weight) - if len(param.data_container) == 3: - self.qweight = param.materialize_nested() return param_data = param.data diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index d7d4a5d6acdb..1fcb6d7afc9b 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -9,7 +9,6 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEMethodBase) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase @@ -19,6 +18,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -96,8 +96,8 @@ def get_quant_method(self, layer: torch.nn.Module, MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES -def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, - qweight_type: int) -> torch.Tensor: +def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor, + qweight_type: int) -> torch.Tensor: # HACK: when doing chunked prefill we don't generate output tokens # so input to logits generator is empty which causes invalid parameter if x.shape[0] == 0: @@ -130,6 +130,30 @@ def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, return y +def _fused_mul_mat_gguf_fake( + x: torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, +) -> torch.Tensor: + return torch.empty(x.shape[0], + qweight.shape[0], + dtype=x.dtype, + device=x.device) + + +try: + direct_register_custom_op( + op_name="_fused_mul_mat_gguf", + op_func=_fused_mul_mat_gguf, + mutates_args=[], + fake_impl=_fused_mul_mat_gguf_fake, + ) + fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf + +except AttributeError as error: + raise error + + def _fused_moe_gguf( x: torch.Tensor, w1: torch.Tensor, @@ -138,8 +162,21 @@ def _fused_moe_gguf( topk_ids: torch.Tensor, qweight_type: int, qweight_type2: int, - act, + activation: str, ) -> torch.Tensor: + + def act(x: 
torch.Tensor): + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + if activation == "silu": + torch.ops._C.silu_and_mul(out, x) + elif activation == "gelu": + torch.ops._C.gelu_and_mul(out, x) + else: + raise ValueError(f"Unsupported activation: {activation}") + return out + # lazy import to avoid triggering triton import in CPU backend from vllm.model_executor.layers.fused_moe.fused_moe import ( moe_align_block_size) @@ -189,12 +226,12 @@ def _fused_moe_gguf( for ww, ii in zip(w, idx): expert_up = w1[ii] - out = _fuse_mul_mat(inp, expert_up, qweight_type) + out = fused_mul_mat_gguf(inp, expert_up, qweight_type) out = act(out) expert_down = w2[ii] - current_state = _fuse_mul_mat(out, expert_down, - qweight_type2).mul_(ww) + current_state = fused_mul_mat_gguf(out, expert_down, + qweight_type2).mul_(ww) if current_hidden_state is None: current_hidden_state = current_state else: @@ -203,6 +240,78 @@ def _fused_moe_gguf( return out_hidden_states +def _fused_moe_gguf_fake( + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + qweight_type: int, + qweight_type2: int, + activation: str, +) -> torch.Tensor: + return torch.empty_like(x) + + +try: + direct_register_custom_op( + op_name="_fused_moe_gguf", + op_func=_fused_moe_gguf, + mutates_args=[], + fake_impl=_fused_moe_gguf_fake, + ) + fused_moe_gguf = torch.ops.vllm._fused_moe_gguf + +except AttributeError as error: + raise error + + +def _apply_gguf_embedding( + x: torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, + hidden_size: int, + dtype: Optional[torch.dtype] = None, +) -> torch.Tensor: + if qweight_type in UNQUANTIZED_TYPES: + return torch.embedding(qweight, x) + elif qweight_type in DEQUANT_TYPES: + block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] + x_flat = x.flatten() + assert (hidden_size == qweight.shape[1] // type_size * block_size) + quant = torch.index_select(qweight, dim=0, index=x_flat) + dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size, + x_flat.shape[0], dtype) + return dequant.view(*x.shape, hidden_size) + else: + qweight_type = WeightType(qweight_type) + raise NotImplementedError( + f"Unsupported GGUF quantization type: {qweight_type}") + + +def _apply_gguf_embedding_fake( + x: torch.Tensor, + qweight: torch.Tensor, + qweight_type: int, + hidden_size: int, + dtype: Optional[torch.dtype] = None, +) -> torch.Tensor: + return torch.empty(x.shape[0], hidden_size, dtype=dtype, device=x.device) + + +try: + direct_register_custom_op( + op_name="_apply_gguf_embedding", + op_func=_apply_gguf_embedding, + mutates_args=[], + fake_impl=_apply_gguf_embedding_fake, + ) + apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding + +except AttributeError as error: + raise error + + class GGUFLinearMethod(LinearMethodBase): """Linear method for GGUF. 
@@ -249,26 +358,76 @@ def create_weights(self, layer: torch.nn.Module, set_weight_attrs(qweight_type, extra_weight_attrs) layer.register_parameter("qweight_type", qweight_type) + def process_weights_after_loading(self, layer: torch.nn.Module): + qweight_type = layer.qweight_type.weight_type + if not (qweight_type in UNQUANTIZED_TYPES + or qweight_type in DEQUANT_TYPES): + qweight_type = WeightType(qweight_type) + raise ValueError( + f"Unsupported GGUF quantization type {qweight_type} in " + f"layer {layer}.") + # For MergedColumnParallelLinear and QKVParallelLinear, we need to + # materialize the padded weight parameter for CUDA Graph compatibility. + self._create_padded_weight_param(layer) + + def _create_padded_weight_param(self, layer: torch.nn.Module): + """Create padded weight parameter for GGUF MergedLinear layer.""" + qweight = layer.qweight + shard_id_map = qweight.shard_id_map + shard_id = qweight.shard_id + if len(data_container := qweight.data_container) > 1: + dtype = {data.dtype for data in data_container} + assert len(dtype) == 1, ValueError( + f"Data container has mixed dtypes: {dtype}") + dtype = next(iter(dtype)) + # concat dim0 and pad dim1 + padded_side = max(x.size(1) for x in data_container) + concat_side = sum(x.size(0) for x in data_container) + # Pad the quantized weights to dense tensor, and create a map + # with the location of each shard in the padded tensor. + padded_data = torch.zeros((concat_side, padded_side), + dtype=dtype, + device=qweight.device) + # (dim0_start, dim0_end, dim1_size) + shard_offset_map = dict[str, tuple[int, int, int]]() + for idx in shard_id: + id_in_container = shard_id_map[idx] + start = sum( + x.size(0) for x in data_container[:id_in_container]) + end = start + data_container[id_in_container].size(0) + size = data_container[id_in_container].size(1) + padded_data[start:end, :size] = data_container[id_in_container] + shard_offset_map[idx] = (start, end, size) + qweight.data_container.clear() + padded_param = Parameter(padded_data, requires_grad=False) + set_weight_attrs(padded_param, vars(qweight)) + set_weight_attrs(padded_param, + {"shard_offset_map": shard_offset_map}) + layer.register_parameter("qweight", padded_param) + def apply(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - shard_id = getattr(layer.qweight, "shard_id", None) + shard_id = layer.qweight.shard_id if shard_id: # dequantize shard weights respectively shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id - qweight = layer.qweight.unbind(0) + qweight = layer.qweight result = [] for idx in shard_id: - q_idx = layer.qweight.shard_id_map[idx] + start, end, offset = layer.qweight.shard_offset_map[idx] qweight_type = layer.qweight_type.shard_weight_type[idx] - result.append(_fuse_mul_mat(x, qweight[q_idx], qweight_type)) + result.append( + fused_mul_mat_gguf( + x, qweight[start:end, :offset].contiguous(), + qweight_type)) out = torch.cat(result, axis=1) else: qweight = layer.qweight qweight_type = layer.qweight_type.weight_type - out = _fuse_mul_mat(x, qweight, qweight_type) + out = fused_mul_mat_gguf(x, qweight, qweight_type) if bias is not None: out.add_(bias) return out @@ -338,7 +497,6 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, set_weight_attrs(w2_qweight_type, extra_weight_attrs) layer.register_parameter("w2_qweight_type", w2_qweight_type) - self.act = SiluAndMul() def apply( self, @@ -375,10 +533,10 @@ def apply( custom_routing_function=custom_routing_function, 
scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias) - return _fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight, - topk_weights, topk_ids, - layer.w13_qweight_type.weight_type, - layer.w2_qweight_type.weight_type, self.act) + return fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight, + topk_weights, topk_ids, + layer.w13_qweight_type.weight_type, + layer.w2_qweight_type.weight_type, activation) class GGUFEmbeddingMethod(GGUFLinearMethod): @@ -392,34 +550,15 @@ def embedding(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor: qweight = layer.qweight qweight_type = layer.qweight_type.weight_type + hidden_size = qweight.tensor_shape[1] - block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] - hidden_size = qweight.shape[1] // type_size * block_size - if qweight_type < 2: - return torch.embedding(qweight, x) - x_flat = x.flatten() - quant = torch.index_select(qweight, dim=0, index=x_flat) - dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size, - x_flat.shape[0], self.params_dtype) - return dequant.view(*x.shape, hidden_size) + return apply_gguf_embedding(x, + qweight, + qweight_type, + hidden_size, + dtype=self.params_dtype) class GGUFUninitializedParameter(UninitializedParameter): cls_to_become = Parameter data_container: list[torch.Tensor] - - def materialize_nested(self) -> Parameter: - dtype = {data.dtype for data in self.data_container} - assert len(dtype) == 1, ValueError( - f"Data container has mixed dtypes: {dtype}") - dtype = next(iter(dtype)) - nested_data = torch.nested.nested_tensor(self.data_container, - device=self.device, - dtype=dtype) - self.data_container.clear() - param = torch.Tensor._make_subclass(self.cls_to_become, - nested_data, - require_grad=False) - for k, v in self.__dict__.items(): - setattr(param, k, v) - return param From b50602d5f04677e75158c0d2e0e8b51793a5d545 Mon Sep 17 00:00:00 2001 From: Lukas Geiger <lukas.geiger94@gmail.com> Date: Tue, 27 May 2025 06:42:54 +0100 Subject: [PATCH 175/192] [Model][Gemma3] Cast image pixel values already on CPU (#18732) Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com> --- vllm/model_executor/models/gemma3_mm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index c4ae5b50c451..00a972d33b04 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -263,6 +263,11 @@ def _call_hf_processor( mm_data, mm_kwargs, ) + if "pixel_values" in processed_outputs: + # Cast pixel values to model dtype already here, + # so we need to transfer less data to the GPU + processed_outputs["pixel_values"] = processed_outputs[ + "pixel_values"].to(self.info.ctx.model_config.dtype) # HF processor pops the `num_crops` kwarg, which is needed by vLLM if (images := mm_data.get("images")) is not None: @@ -543,9 +548,7 @@ def _image_pixels_to_features( vision_tower: SiglipVisionModel, pixel_values: torch.Tensor, ) -> torch.Tensor: - target_dtype = vision_tower.get_input_embeddings().weight.dtype - image_features = vision_tower(pixel_values.to(dtype=target_dtype)) - return image_features + return vision_tower(pixel_values) def _process_image_input( self, From d260f799a95d1c49f1da17309dc42df1381f809b Mon Sep 17 00:00:00 2001 From: vllmellm <vllm.ellm@embeddedllm.com> Date: Tue, 27 May 2025 14:14:07 +0800 Subject: [PATCH 176/192] [FEAT] [ROCm] Upgrade AITER Fused MoE kernels. 
(#18271) Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com> --- vllm/model_executor/layers/fused_moe/layer.py | 6 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 401 ++++++------------ .../compressed_tensors_moe.py | 5 +- .../model_executor/layers/quantization/fp8.py | 38 +- 4 files changed, 133 insertions(+), 317 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b101f5862fa7..29b41e720852 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -419,10 +419,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: shuffle_weights) if self.rocm_aiter_moe_enabled: - # use 2stage ck moe layout - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(32, 32)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight.data = shuffled_w13 layer.w2_weight.data = shuffled_w2 diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index a92081862bfa..10b61fcda176 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from enum import IntEnum from functools import cache from typing import Optional @@ -9,6 +10,28 @@ from vllm.utils import direct_register_custom_op +class QuantMethod(IntEnum): + # This allows interfacing with AITER QuantType Enum + # without importing the QuantType from AITER globally. + + # Note that these quantization methods are + # supported in AITER package. However, + # not all are used in this module. + + NO = 0 # a16w16 + PER_TENSOR = 1 # w8a8 (pre_Tensor) + PER_TOKEN = 2 # w8a8/w8a4 (per_Token) + BLOCK_1X128 = 3 # block quantized w8a8 (per_1x128) + BLOCK_128x128 = 4 # block quantized w8a8 (per_128x128) + + +class ActivationMethod(IntEnum): + # This allows interfacing with AITER ActivationType enum + # without importing the ActivationType enum from AITER globally. 
+ SILU = 0 + GELU = 1 + + @cache def is_rocm_aiter_moe_enabled() -> bool: return current_platform.is_rocm() \ @@ -29,13 +52,12 @@ def rocm_aiter_asm_moe_tkw1_impl( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: from aiter import ActivationType from aiter.fused_moe_bf16_asm import asm_moe_tkw1 - activation = \ - ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu + activation = ActivationType(activation_method) return asm_moe_tkw1(hidden_states, w1, @@ -65,163 +87,7 @@ def rocm_aiter_asm_moe_tkw1_fake( a16: bool = False, per_tensor_quant_scale: Optional[torch.Tensor] = None, expert_mask: Optional[torch.Tensor] = None, - activation_str: str = "silu") -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - from aiter import fmoe_fp8_blockscale_g1u1 - from aiter.fused_moe_bf16_asm import moe_sorting_ck - - topk = topk_ids.shape[1] - model_dim = w1.shape[-1] - local_E = E = w1.shape[0] - if expert_mask is not None: - E = expert_mask.numel() - - ( - sorted_token_ids, - sorted_weight_buf, - sorted_expert_ids, - num_valid_ids, - out_asm, - ) = moe_sorting_ck(topk_ids, - topk_weights, - E, - model_dim, - hidden_states_dtype, - expert_mask=expert_mask) - - fmoe_fp8_blockscale_g1u1(out_asm, a1, w1, w2, sorted_token_ids, - sorted_weight_buf, sorted_expert_ids, - num_valid_ids, topk, - a1_scale.t().contiguous(), - w1_scale.view(local_E, -1), - w2_scale.view(local_E, - -1), *block_shape, smooth_scale) - - return out_asm - - -def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake( - topk_ids: torch.Tensor, - topk_weights: torch.Tensor, - hidden_states_dtype: torch.dtype, - expert_mask: torch.Tensor, - a1: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - a1_scale: torch.Tensor, - block_shape: list[int], - smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: - - return torch.empty_like(a1, dtype=hidden_states_dtype) - - -def rocm_aiter_asm_moe_impl(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe - from aiter import ActivationType - - assert activation in ["silu", "gelu"], "The given activation:" \ - f" {activation}" \ - " is not supported in" \ - " AITER." 
- if activation == "silu": - aiter_activation = ActivationType.Silu - else: - aiter_activation = ActivationType.Gelu - - return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - fc1_smooth_scale=fc1_smooth_scale, - fc2_smooth_scale=fc2_smooth_scale, - a16=a16, - activation=aiter_activation) - - -def rocm_aiter_asm_moe_fake(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - fc1_smooth_scale: Optional[torch.Tensor] = None, - fc2_smooth_scale: Optional[torch.Tensor] = None, - a16: bool = False, - activation: str = "silu") -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def rocm_aiter_ck_moe_2stages_impl( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - from aiter.fused_moe_bf16_asm import ck_moe_2stages - return ck_moe_2stages(a1=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=fc1_scale, - fc2_scale=fc2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_size=block_size, - expert_mask=expert_mask) - - -def rocm_aiter_ck_moe_2stages_fake( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - fc1_scale: Optional[torch.Tensor] = None, - fc2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_size: Optional[list[int]] = None, - expert_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: + activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -274,6 +140,50 @@ def rocm_aiter_biased_grouped_topk_fake( pass +def rocm_aiter_fused_moe_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + from aiter import ActivationType, QuantType + from aiter.fused_moe import fused_moe + + activation = ActivationType(activation_method) + quant_type = QuantType(quant_method) + + return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask, + activation, quant_type, doweight_stage1, w1_scale, + w2_scale, a1_scale, a2_scale) + + +def rocm_aiter_fused_moe_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + expert_mask: Optional[torch.Tensor] = None, + activation_method: int = ActivationMethod.SILU.value, + quant_method: int = QuantMethod.NO.value, + doweight_stage1: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + 
a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + if current_platform.is_rocm(): direct_register_custom_op( @@ -285,26 +195,10 @@ def rocm_aiter_biased_grouped_topk_fake( ) direct_register_custom_op( - op_name="rocm_aiter_fmoe_fp8_blockscale_g1u1", - op_func=rocm_aiter_fmoe_fp8_blockscale_g1u1_impl, - mutates_args=[], - fake_impl=rocm_aiter_fmoe_fp8_blockscale_g1u1_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_asm_moe", - op_func=rocm_aiter_asm_moe_impl, - mutates_args=[], - fake_impl=rocm_aiter_asm_moe_fake, - dispatch_key=current_platform.dispatch_key, - ) - - direct_register_custom_op( - op_name="rocm_aiter_ck_moe_2stages", - op_func=rocm_aiter_ck_moe_2stages_impl, + op_name="rocm_aiter_fused_moe", + op_func=rocm_aiter_fused_moe_impl, mutates_args=[], - fake_impl=rocm_aiter_ck_moe_2stages_fake, + fake_impl=rocm_aiter_fused_moe_fake, dispatch_key=current_platform.dispatch_key, ) @@ -373,32 +267,14 @@ def rocm_aiter_fused_experts( a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None) -> torch.Tensor: - from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) - + activation_method = (ActivationMethod.SILU + if activation == "silu" else ActivationMethod.GELU) # All AITER Fused MoE kernels are expecting the following datatypes topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) - # w8a8 block-scaled - if block_shape is not None and use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for block scaled moe" - ) - assert w1_scale is not None - assert w2_scale is not None - - # The default block sizes are 128 in AITER. - block_shape = [128, 128] if block_shape is None else block_shape - - a1, a1_scale = per_token_group_quant_fp8(hidden_states, block_shape[1]) - - return torch.ops.vllm.rocm_aiter_fmoe_fp8_blockscale_g1u1( - topk_ids, topk_weights, hidden_states.dtype, None, a1, w1, w2, - w1_scale, w2_scale, a1_scale, block_shape, None) - # w8a8 per-channel quantization - elif per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: + if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input` # This applies topk_weights on the GEMM output of the first FC layer # rather than the second FC. 
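The `QuantMethod` and `ActivationMethod` enums introduced above let the wrapper layer describe AITER quantization and activation choices with plain integers, importing AITER's own enums only inside the op implementation. A small sketch of that bridging idea follows; the `dispatch` function and its return string are made up for illustration, and the real code converts the integer into AITER's `ActivationType`/`QuantType` rather than back into the local enum.

```python
from enum import IntEnum


class ActivationMethod(IntEnum):
    # Values deliberately line up with the third-party enum they mirror.
    SILU = 0
    GELU = 1


def dispatch(activation_method: int = ActivationMethod.SILU.value) -> str:
    # The real implementation would lazily do:
    #   from aiter import ActivationType
    #   activation = ActivationType(activation_method)
    # Here we just convert back to the local enum to show the round trip.
    activation = ActivationMethod(activation_method)
    return f"fused MoE with {activation.name.lower()} activation"


print(dispatch(ActivationMethod.GELU.value))  # fused MoE with gelu activation
```

Keeping the integer values aligned between the two enums is what makes this round trip safe.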
@@ -421,60 +297,44 @@ def rocm_aiter_fused_experts( a16=False, per_tensor_quant_scale=None, expert_mask=None, - activation_str=activation) - - # w8a8 per-tensor activation per-tensor weight - elif use_fp8_w8a8: - assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is not supported for fp8_w8a8") - - # - faster static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - if a1_scale is not None and a2_scale is not None: - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale) - - # - fallback static per-tensor-activation static per-tensor-weight - # fp8 quantization w8a8 - # - dynamic per-tensor activation static per-tensor-weight - # fp8 quantization w8a8 - return torch.ops.vllm.rocm_aiter_asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False, - activation=activation) - if apply_router_weight_on_input: - assert (topk_weights.dim() == 2 - ), "`topk_weights` should be in shape (num_tokens, topk)" - _, topk = topk_weights.shape - assert ( - topk == 1 - ), "Only support topk=1 when `apply_router_weight_on_input` is True" - - hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) - topk_ids = topk_ids.to(torch.int32) - topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) + activation_method=activation_method) - return torch.ops.vllm.rocm_aiter_ck_moe_2stages( - hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) + else: + quant_method = QuantMethod.NO.value + + # w8a8 block-scaled + if block_shape is not None and use_fp8_w8a8: + assert not apply_router_weight_on_input, ( + "apply_router_weight_on_input is\ + not supported for block scaled moe") + assert w1_scale is not None + assert w2_scale is not None + quant_method = QuantMethod.BLOCK_128x128.value + elif use_fp8_w8a8: + # Currently only per tensor quantization method is enabled. + quant_method = QuantMethod.PER_TENSOR.value + + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + + return torch.ops.vllm.rocm_aiter_fused_moe( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + quant_method=quant_method, + activation_method=activation_method, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + doweight_stage1=apply_router_weight_on_input) def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, @@ -488,14 +348,21 @@ def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, return topk_weights, topk_indices -def shuffle_weights(*tensors: torch.Tensor, - layout: tuple[int, int]) -> tuple[torch.Tensor, ...]: +def shuffle_weights( + *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16) +) -> tuple[torch.Tensor, ...]: """ Applies shuffle_weight function from AITER to each input tensor and returns them. + + Rearranges (shuffles) the input tensor/s + into a specified block layout for optimized computation. Args: - *tensors: Variable number of torch.Tensor objects. + *tensors: Variable number of torch.Tensor objects. 
+ layout: A pair of integers specifying the + block sizes used to divide the tensors during shuffling. + Default is (16, 16). Returns: A Tuple of shuffled tensors. @@ -503,25 +370,3 @@ def shuffle_weights(*tensors: torch.Tensor, from aiter.ops.shuffle import shuffle_weight return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors) - - -def expand_weights(*tensors: torch.Tensor, - expansion_dims: list[int]) -> tuple[torch.Tensor, ...]: - """ - Expands the dimensions of input tensors. - - Args: - *tensors: A variable number of torch.Tensor objects. - expansion_dims: A list of expansion dimensions - corresponding to each tensor. - - Returns: - A Tuple of tensors with expanded dimensions. - """ - - assert len(tensors) == len(expansion_dims), \ - "Number of tensors must match the number of expansion dimensions." - - return tuple( - tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1)) - for tensor, dim in zip(tensors, expansion_dims)) \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fa0067c44802..9241ceeb4db2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -286,9 +286,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: rocm_aiter_fused_experts, shuffle_weights) # reshaping weights is required for aiter moe kernel. - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index c2aca842c8b3..ac9b74945e0c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -595,7 +595,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, def process_weights_after_loading(self, layer: Module) -> None: # Lazy import to avoid importing triton too early. from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - expand_weights, is_rocm_aiter_moe_enabled, shuffle_weights) + is_rocm_aiter_moe_enabled, shuffle_weights) self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -627,9 +627,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. shuffled_w13, shuffled_w2 = shuffle_weights( - layer.w13_weight.data, - layer.w2_weight.data, - layout=(16, 16)) + layer.w13_weight.data, layer.w2_weight.data) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) @@ -675,20 +673,8 @@ def process_weights_after_loading(self, layer: Module) -> None: requires_grad=False) if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. 
- w13_scales, w2_scales = expand_weights( - layer.w13_weight_scale.data, - layer.w2_weight_scale.data, - expansion_dims=[ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ]) - layer.w13_weight_scale = torch.nn.Parameter( - w13_scales.contiguous(), requires_grad=False) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(16, 16)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) @@ -760,20 +746,8 @@ def process_weights_after_loading(self, layer: Module) -> None: start += shard_size if self.rocm_aiter_moe_enabled: - # reshaping weights is required for aiter moe kernel. - expansion_dims = [ - layer.w13_weight.shape[1], layer.w2_weight.shape[1] - ] - max_w13_scales, w2_scales = expand_weights( - max_w13_scales, - layer.w2_weight_scale.data, - expansion_dims=expansion_dims) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales.contiguous(), requires_grad=False) - - shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight, - layer.w2_weight, - layout=(32, 32)) + shuffled_w13, shuffled_w2 = shuffle_weights( + layer.w13_weight, layer.w2_weight) layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) From 25a817f202ae86ab884ab83a10ff64435e92a5dc Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 27 May 2025 14:30:31 +0800 Subject: [PATCH 177/192] [Doc] Update OOT model docs (#18742) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/contributing/model/registration.md | 31 ++++++++++++------------- docs/design/plugin_system.md | 6 +++-- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md index 2f829889277c..7a7bd7914058 100644 --- a/docs/contributing/model/registration.md +++ b/docs/contributing/model/registration.md @@ -23,33 +23,32 @@ Finally, update our [list of supported models][supported-models] to promote your ## Out-of-tree models -You can load an external model using a plugin without modifying the vLLM codebase. - -!!! info - [vLLM's Plugin System][plugin-system] +You can load an external model [using a plugin][plugin-system] without modifying the vLLM codebase. To register the model, use the following code: ```python -from vllm import ModelRegistry -from your_code import YourModelForCausalLM -ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +# The entrypoint of your plugin +def register(): + from vllm import ModelRegistry + from your_code import YourModelForCausalLM + + ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) ``` If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: ```python -from vllm import ModelRegistry - -ModelRegistry.register_model( - "YourModelForCausalLM", - "your_code:YourModelForCausalLM" -) +# The entrypoint of your plugin +def register(): + from vllm import ModelRegistry + + ModelRegistry.register_model( + "YourModelForCausalLM", + "your_code:YourModelForCausalLM" + ) ``` !!! warning If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface. Read more about that [here][supports-multimodal]. - -!!! 
note - Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 5027a35c23e8..0764dfb6501b 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -30,8 +30,10 @@ def register(): from vllm import ModelRegistry if "MyLlava" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model("MyLlava", - "vllm_add_dummy_model.my_llava:MyLlava") + ModelRegistry.register_model( + "MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava", + ) ``` For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). From 753944fa9bfaa6ce68ee2aa6cf1b49c41f6718f4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 27 May 2025 15:03:13 +0800 Subject: [PATCH 178/192] [Doc] Update reproducibility doc and example (#18741) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- docs/usage/reproducibility.md | 63 ++++++++++--------- examples/offline_inference/reproducibility.py | 27 ++++---- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md index 542b83a10352..a494dcf19191 100644 --- a/docs/usage/reproducibility.md +++ b/docs/usage/reproducibility.md @@ -1,51 +1,52 @@ # Reproducibility -## Overview +vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve +reproducible results: -The `seed` parameter in vLLM is used to control the random states for various random number generators. This parameter can affect the behavior of random operations in user code, especially when working with models in vLLM. +- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`. +- For V0: Set the global seed (see below). -## Default Behavior +Example: <gh-file:examples/offline_inference/reproducibility.py> -By default, the `seed` parameter is set to `None`. When the `seed` parameter is `None`, the global random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that the random operations will behave as expected, without any fixed random states. +!!! warning -## Specifying a Seed + Applying the above settings [changes the random state in user code](#locality-of-random-state). -If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly. This can be useful for reproducibility, as it ensures that the random operations produce the same results across multiple runs. +!!! note -## Example Usage + Even with the above settings, vLLM only provides reproducibility + when it runs on the same hardware and the same vLLM version. + Also, the online serving API (`vllm serve`) does not support reproducibility + because it is almost impossible to make the scheduling deterministic in the + online setting. -### Without Specifying a Seed +## Setting the global seed -```python -import random -from vllm import LLM +The `seed` parameter in vLLM is used to control the random states for various random number generators. 
-# Initialize a vLLM model without specifying a seed -model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") +If a specific seed value is provided, the random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly. -# Try generating random numbers -print(random.randint(0, 100)) # Outputs different numbers across runs -``` +However, in some cases, setting the seed will also [change the random state in user code](#locality-of-random-state). -### Specifying a Seed +### Default Behavior -```python -import random -from vllm import LLM +In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected. -# Initialize a vLLM model with a specific seed -model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", seed=42) +In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`. -# Try generating random numbers -print(random.randint(0, 100)) # Outputs the same number across runs -``` +!!! note -## Important Notes + It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs + for workflows such as speculative decoding. + + For more information, see: <gh-pr:17929> -- If the `seed` parameter is not specified, the behavior of global random states remains unaffected. -- If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set to that value. -- This behavior can be useful for reproducibility but may lead to non-intuitive behavior if the user is not explicitly aware of it. +### Locality of random state -## Conclusion +The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM under the following conditions: -Understanding the behavior of the `seed` parameter in vLLM is crucial for ensuring the expected behavior of random operations in your code. By default, the `seed` parameter is set to `None`, which means that the global random states are not affected. However, specifying a seed value can help achieve reproducibility in your experiments. +- For V0: The seed is specified. +- For V1: The workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`. + +By default, these conditions are not active so you can use vLLM without having to worry about +accidentally making deterministic subsequent operations that rely on random state. diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py index b2be117d1a0a..6d048986e710 100644 --- a/examples/offline_inference/reproducibility.py +++ b/examples/offline_inference/reproducibility.py @@ -1,24 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 +""" +Demonstrates how to achieve reproducibility in vLLM. + +Main article: https://docs.vllm.ai/en/latest/usage/reproducibility.html +""" + import os +import random from vllm import LLM, SamplingParams -# vLLM does not guarantee the reproducibility of the results by default, -# for the sake of performance. You need to do the following to achieve -# reproducible results: -# 1. Turn off multiprocessing to make the scheduling deterministic. -# NOTE(woosuk): This is not needed and will be ignored for V0. +# V1 only: Turn off multiprocessing to make the scheduling deterministic. 
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" -# 2. Fix the global seed for reproducibility. The default seed is None, which is + +# V0 only: Set the global seed. The default seed is None, which is # not reproducible. SEED = 42 -# NOTE(woosuk): Even with the above two settings, vLLM only provides -# reproducibility when it runs on the same hardware and the same vLLM version. -# Also, the online serving API (`vllm serve`) does not support reproducibility -# because it is almost impossible to make the scheduling deterministic in the -# online serving setting. - prompts = [ "Hello, my name is", "The president of the United States is", @@ -38,6 +36,11 @@ def main(): print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print("-" * 50) + # Try generating random numbers outside vLLM + # The same number is output across runs, meaning that the random state + # in the user code has been updated by vLLM + print(random.randint(0, 100)) + if __name__ == "__main__": main() From fc6d0c290f7d325b5d57d20fb92a213b306ac4a1 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 27 May 2025 15:07:01 +0800 Subject: [PATCH 179/192] [Misc] improve docs (#18734) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com> --- examples/offline_inference/neuron_eagle.py | 80 ++++++++++--------- .../offline_inference/qwen2_5_omni/README.md | 22 +++-- 2 files changed, 59 insertions(+), 43 deletions(-) diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py index a51caa2aec8b..5d7fb819d347 100644 --- a/examples/offline_inference/neuron_eagle.py +++ b/examples/offline_inference/neuron_eagle.py @@ -15,40 +15,46 @@ "What is annapurna labs?", ] -# Create a sampling params object. -sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True) - -# Create an LLM. -llm = LLM( - model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct", - speculative_config={ - "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft", - "num_speculative_tokens": 5, - "max_model_len": 2048, - }, - max_num_seqs=4, - # The max_model_len and block_size arguments are required to be same as - # max sequence length when targeting neuron device. - # Currently, this is a known limitation in continuous batching support - # in neuronx-distributed-inference. - max_model_len=2048, - block_size=2048, - # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, - # or explicitly assigned. - device="neuron", - tensor_parallel_size=32, - override_neuron_config={ - "enable_eagle_speculation": True, - "enable_fused_speculation": True, - }, -) - -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}") + +def main(): + # Create a sampling params object. + sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True) + + # Create an LLM. 
+ llm = LLM( + model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct", + speculative_config={ + "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft", + "num_speculative_tokens": 5, + "max_model_len": 2048, + }, + max_num_seqs=4, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in neuronx-distributed-inference. + max_model_len=2048, + block_size=2048, + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + tensor_parallel_size=32, + override_neuron_config={ + "enable_eagle_speculation": True, + "enable_fused_speculation": True, + }, + ) + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md index c30541a598ce..16d44cbadbc9 100644 --- a/examples/offline_inference/qwen2_5_omni/README.md +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -6,14 +6,19 @@ This folder provides several example scripts on how to inference Qwen2.5-Omni of ```bash # Audio + image + video -python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities +python examples/offline_inference/qwen2_5_omni/only_thinker.py \ + -q mixed_modalities # Read vision and audio inputs from a single video file # NOTE: V1 engine does not support interleaved modalities yet. -VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video +VLLM_USE_V1=0 \ +python examples/offline_inference/qwen2_5_omni/only_thinker.py \ + -q use_audio_in_video # Multiple audios -VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios +VLLM_USE_V1=0 \ +python examples/offline_inference/qwen2_5_omni/only_thinker.py \ + -q multi_audios ``` This script will run the thinker part of Qwen2.5-Omni, and generate text response. 
@@ -22,11 +27,16 @@ You can also test Qwen2.5-Omni on a single modality: ```bash # Process audio inputs -python examples/offline_inference/audio_language.py --model-type qwen2_5_omni +python examples/offline_inference/audio_language.py \ + --model-type qwen2_5_omni # Process image inputs -python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni +python examples/offline_inference/vision_language.py \ + --modality image \ + --model-type qwen2_5_omni # Process video inputs -python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni +python examples/offline_inference/vision_language.py \ + --modality video \ + --model-type qwen2_5_omni ``` From a547aeb8283b6ba757912fe43169ec38adb9ca45 Mon Sep 17 00:00:00 2001 From: almersawi <43927639+almersawi@users.noreply.github.com> Date: Tue, 27 May 2025 11:07:53 +0400 Subject: [PATCH 180/192] feat(rocm-support): support mamba2 on rocm (#18565) Signed-off-by: Islam Almersawi <islam.almersawi@openinnovation.ai> Co-authored-by: Islam Almersawi <islam.almersawi@openinnovation.ai> --- CMakeLists.txt | 4 +- csrc/mamba/causal_conv1d/causal_conv1d.cu | 10 ++- csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 2 +- csrc/torch_bindings.cpp | 70 +++++++++---------- .../layers/mamba/mamba2_metadata.py | 23 ++++-- 5 files changed, 60 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a1ed588749a..3c5856fc5909 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,6 +232,8 @@ endif() # set(VLLM_EXT_SRC + "csrc/mamba/mamba_ssm/selective_scan_fwd.cu" + "csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/cache_kernels.cu" "csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v2.cu" @@ -287,8 +289,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC - "csrc/mamba/mamba_ssm/selective_scan_fwd.cu" - "csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index 98daf1a1b8e6..f62d08c17c6d 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -13,6 +13,10 @@ #include <cub/block/block_load.cuh> #include <cub/block/block_store.cuh> +#ifdef USE_ROCM + namespace cub = hipcub; +#endif + #include "static_switch.h" @@ -501,15 +505,9 @@ void causal_conv1d_fwd_launch(ConvParamsBase ¶ms, cudaStream_t stream) { auto kernel = &causal_conv1d_fwd_kernel<Ktraits>; if (kSmemSize >= 48 * 1024) { - #ifndef USE_ROCM - C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); - #else - // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function. C10_CUDA_CHECK(cudaFuncSetAttribute( (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. 
\n" << std::endl; - #endif } kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params); diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index bd0a34119c82..0c9df925bdbf 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { auto kernel = &selective_scan_fwd_kernel<Ktraits>; if (kSmemSize >= 48 * 1024) { C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); } kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 4eda1aaccc6b..371894c56a79 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -482,41 +482,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor page_table, float scale) -> ()"); ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode); - // Mamba selective scan kernel - ops.def( - "selective_scan_fwd(Tensor! u, Tensor! delta," - "Tensor! A, Tensor! B, Tensor! C," - "Tensor? D_, Tensor!? z_, Tensor? delta_bias_," - "bool delta_softplus," - "Tensor? query_start_loc," - "Tensor? cache_indices," - "Tensor? has_initial_state," - "Tensor! ssm_states," - "int pad_slot_id) -> ()"); - ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); - - ops.def( - "causal_conv1d_update(Tensor! x," - "Tensor! conv_state," - "Tensor! weight," - "Tensor? bias_," - "bool silu_activation," - "Tensor? cache_seqlens_," - "Tensor? conv_state_indices," - "int pad_slot_id) -> ()"); - ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update); - - ops.def( - "causal_conv1d_fwd(Tensor! x, Tensor! weight," - "Tensor? bias_," - "Tensor!? conv_states," - "Tensor? query_start_loc," - "Tensor? cache_indices," - "Tensor? has_initial_state," - "bool silu_activation," - "int pad_slot_id) -> ()"); - ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd); - // Compute NVFP4 block quantized tensor. ops.def( "scaled_fp4_quant(Tensor! output, Tensor input," @@ -584,6 +549,41 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, &dynamic_scaled_int8_quant); + // Mamba selective scan kernel + ops.def( + "selective_scan_fwd(Tensor! u, Tensor! delta," + "Tensor! A, Tensor! B, Tensor! C," + "Tensor? D_, Tensor!? z_, Tensor? delta_bias_," + "bool delta_softplus," + "Tensor? query_start_loc," + "Tensor? cache_indices," + "Tensor? has_initial_state," + "Tensor! ssm_states," + "int pad_slot_id) -> ()"); + ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); + + ops.def( + "causal_conv1d_update(Tensor! x," + "Tensor! conv_state," + "Tensor! weight," + "Tensor? bias_," + "bool silu_activation," + "Tensor? cache_seqlens_," + "Tensor? conv_state_indices," + "int pad_slot_id) -> ()"); + ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update); + + ops.def( + "causal_conv1d_fwd(Tensor! x, Tensor! weight," + "Tensor? bias_," + "Tensor!? conv_states," + "Tensor? query_start_loc," + "Tensor? cache_indices," + "Tensor? 
has_initial_state," + "bool silu_activation," + "int pad_slot_id) -> ()"); + ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd); + #ifndef USE_ROCM // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel ops.def( diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index e5b88de2fcc8..019f634a9ef4 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -5,10 +5,9 @@ import torch from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.backends.placeholder_attn import ( PlaceholderAttentionMetadata) -from vllm.attention.backends.xformers import XFormersMetadata +from vllm.platforms import current_platform @dataclass @@ -23,6 +22,21 @@ class Mamba2Metadata: chunk_offsets: torch.Tensor +def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: + """Returns the appropriate metadata classes for the current platform.""" + if current_platform.is_rocm(): + from vllm.attention.backends.rocm_flash_attn import ( + ROCmFlashAttentionMetadata) + return (ROCmFlashAttentionMetadata, PlaceholderAttentionMetadata) + elif current_platform.is_cuda(): + from vllm.attention.backends.flash_attn import FlashAttentionMetadata + from vllm.attention.backends.xformers import XFormersMetadata + return (FlashAttentionMetadata, XFormersMetadata, + PlaceholderAttentionMetadata) + raise ValueError( + f"Unsupported platform for Mamba2: {current_platform.device_type}") + + def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor, chunk_size: int, total_seqlens: int): @@ -78,9 +92,8 @@ def prepare_mamba2_metadata( # Compute seq_idx, chunk_indices and chunk_offsets for prefill only if num_prefills > 0: - if (isinstance(attn_metadata, - (FlashAttentionMetadata, XFormersMetadata, - PlaceholderAttentionMetadata)) + attn_metadata_instances = get_platform_metadata_classes() + if (isinstance(attn_metadata, attn_metadata_instances) and attn_metadata.context_lens_tensor is not None): has_initial_states = \ attn_metadata.context_lens_tensor[:num_prefills] > 0 #[batch,] From bbd9a84dc55882a95974449008c275794309599f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Durejko?= <ldurejko@habana.ai> Date: Tue, 27 May 2025 09:10:26 +0200 Subject: [PATCH 181/192] [Hardware][Intel-Gaudi] [CI/Build] Fix multiple containers using the same name in run-hpu-test.sh (#18752) Signed-off-by: Lukasz Durejko <ldurejko@habana.ai> --- .buildkite/scripts/hardware_ci/run-hpu-test.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index c3b78d471297..5efac3ddf469 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -10,17 +10,17 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu . # Setup cleanup # certain versions of HPU software stack have a bug that can # override the exit code of the script, so we need to use -# separate remove_docker_container and remove_docker_container_and_exit +# separate remove_docker_containers and remove_docker_containers_and_exit # functions, while other platforms only need one remove_docker_container # function. 
EXITCODE=1 -remove_docker_container() { docker rm -f hpu-test || true; } -remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; } -trap remove_docker_container_and_exit EXIT -remove_docker_container +remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; } +remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; } +trap remove_docker_containers_and_exit EXIT +remove_docker_containers # Run the image and launch offline inference docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2 +docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2 EXITCODE=$? From 4693a3438cebd6c2f764ad67af5a85f85d03db13 Mon Sep 17 00:00:00 2001 From: Calvin Chen <45745657+calvin0327@users.noreply.github.com> Date: Tue, 27 May 2025 15:12:02 +0800 Subject: [PATCH 182/192] [Doc] cleanup deprecated flag for doc (#18715) Signed-off-by: calvin chen <120380290@qq.com> --- benchmarks/README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 4a8ab895e18e..ecab570bb31c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -146,10 +146,9 @@ python3 vllm/benchmarks/benchmark_serving.py \ ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --speculative-model "[ngram]" \ --ngram_prompt_lookup_min 2 \ --ngram-prompt-lookup-max 5 \ - --num_speculative_tokens 5 + --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5} ``` ``` bash @@ -274,10 +273,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --output-len=100 \ --num-prompts=2048 \ --async-engine \ - --speculative-model="[ngram]" \ --ngram_prompt_lookup_min=2 \ --ngram-prompt-lookup-max=5 \ - --num_speculative_tokens=5 + --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5} ``` ``` From c24b1572ac23bc5c495c3680a70d9f6c36ff7143 Mon Sep 17 00:00:00 2001 From: maobaolong <baoloongmao@tencent.com> Date: Tue, 27 May 2025 16:02:28 +0800 Subject: [PATCH 183/192] Minor fix about MooncakeStoreConnector (#18721) Signed-off-by: baoloongmao <baoloongmao@tencent.com> --- .../kv_transfer/kv_connector/mooncake_store_connector.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py index 56b55c2bb59d..58eabd0a37eb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py @@ -31,12 +31,12 @@ def __init__( local_rank: int, config: VllmConfig, ): - self.config = config.kv_transfer_config + self.kv_transfer_config = config.kv_transfer_config self.kv_helper = kv_helper(config) self.local_tp_rank = local_rank # Init kv_store - if self.config.kv_connector == "MooncakeStoreConnector": + if 
self.kv_transfer_config.kv_connector == "MooncakeStoreConnector": # Check if MOONCAKE_CONFIG_PATH is set import os use_mooncake_store = os.getenv('MOONCAKE_CONFIG_PATH') is not None @@ -50,10 +50,11 @@ def __init__( MooncakeStore) logger.info( "Initializing KVStoreConnector under kv_transfer_config %s", - self.config) + self.kv_transfer_config) self.kv_store = MooncakeStore(config) else: - logger.error("Can not find %s", self.config.kv_connector) + logger.error("Can not find %s", + self.kv_transfer_config.kv_connector) assert self.kv_store is not None From e0f0ff87b89978d1d0bea2aec2d85cec72b71238 Mon Sep 17 00:00:00 2001 From: Kebe <mail@kebe7jun.com> Date: Tue, 27 May 2025 16:03:56 +0800 Subject: [PATCH 184/192] [Build] fix cpu build missing libtbbmalloc.so (#18744) Signed-off-by: Kebe <mail@kebe7jun.com> --- requirements/cpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cpu.txt b/requirements/cpu.txt index c064ecbb9b1f..1213301584ce 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -22,5 +22,5 @@ datasets # for benchmark scripts triton==3.2.0; platform_machine == "x86_64" # Intel Extension for PyTorch, only for x86_64 CPUs -intel-openmp; platform_machine == "x86_64" +intel-openmp==2024.2.1; platform_machine == "x86_64" intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64" From 6881107948c00a8564bc2fa85308f6fc2f065d64 Mon Sep 17 00:00:00 2001 From: Shawn Huang <57223022+huangyuxiang03@users.noreply.github.com> Date: Tue, 27 May 2025 16:04:49 +0800 Subject: [PATCH 185/192] [BUG FIX] minicpm (#18739) Signed-off-by: huangyuxiang03 <huangyx0321@gmail.com> Co-authored-by: huangyuxiang03 <huangyx0321@gmail.com> --- vllm/model_executor/models/minicpm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index d99ae81468a9..0397b552ce9f 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -242,9 +242,6 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - # set rope as fp32 instead of bf16 - self.rotary_emb.cos_sin_cache = self.rotary_emb._compute_cos_sin_cache( - ) self.attn = Attention(self.num_heads, self.head_dim, self.scaling, From a68e293cb9dd40d415a9b25391b13ae370adc62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= <ohg3417@gmail.com> Date: Tue, 27 May 2025 17:44:20 +0900 Subject: [PATCH 186/192] [Doc] Convert Sphinx directives ( `{class}`, `{meth}`, `{attr}`, ...) 
to MkDocs format for better documentation linking (#18663) Signed-off-by: Zerohertz <ohg3417@gmail.com> --- vllm/compilation/compiler_interface.py | 3 +- vllm/config.py | 36 +++++----- vllm/connections.py | 5 +- vllm/engine/async_llm_engine.py | 50 +++++++------- vllm/engine/llm_engine.py | 8 +-- vllm/engine/multiprocessing/client.py | 10 +-- vllm/engine/multiprocessing/engine.py | 19 +++--- vllm/engine/output_processor/multi_step.py | 7 +- vllm/engine/output_processor/single_step.py | 21 ++++-- vllm/entrypoints/llm.py | 5 +- vllm/entrypoints/openai/serving_engine.py | 6 +- vllm/executor/executor_base.py | 2 +- vllm/inputs/__init__.py | 5 +- vllm/inputs/data.py | 68 +++++++++++-------- vllm/inputs/parse.py | 8 +-- vllm/inputs/preprocess.py | 56 +++++++++++----- vllm/inputs/registry.py | 4 +- vllm/logger.py | 10 +-- vllm/model_executor/layers/sampler.py | 20 +++--- vllm/model_executor/models/interfaces.py | 6 +- vllm/model_executor/models/molmo.py | 2 +- vllm/model_executor/models/pixtral.py | 4 +- vllm/model_executor/models/qwen_vl.py | 3 +- vllm/model_executor/models/registry.py | 2 +- vllm/model_executor/models/utils.py | 2 +- vllm/multimodal/__init__.py | 7 +- vllm/multimodal/inputs.py | 74 +++++++++++++-------- vllm/multimodal/parse.py | 13 ++-- vllm/multimodal/processing.py | 73 +++++++++++++------- vllm/multimodal/profiling.py | 2 +- vllm/multimodal/registry.py | 26 ++++---- vllm/multimodal/utils.py | 3 +- vllm/platforms/interface.py | 9 +-- vllm/sequence.py | 18 ++--- vllm/utils.py | 5 +- vllm/v1/worker/utils.py | 4 +- vllm/worker/multi_step_model_runner.py | 11 +-- 37 files changed, 360 insertions(+), 247 deletions(-) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 89a131e8ea24..21af5eb76ee8 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -39,7 +39,8 @@ def compute_hash(self, vllm_config: VllmConfig) -> str: Gather all the relevant information from the vLLM config, to compute a hash so that we can cache the compiled model. - See {meth}`VllmConfig.compute_hash` to check what information + See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash] + to check what information is already considered by default. This function should only consider the information that is specific to the compiler. """ diff --git a/vllm/config.py b/vllm/config.py index 4196684639ee..db35c848b33a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2986,7 +2986,7 @@ class PoolerConfig: pooling_type: Optional[str] = None """ The pooling method of the pooling model. This should be a key in - {class}`vllm.model_executor.layers.pooler.PoolingType`. + [`vllm.model_executor.layers.pooler.PoolingType`][]. """ normalize: Optional[bool] = None @@ -3697,23 +3697,27 @@ class CompilationConfig: """Configuration for compilation. 
It has three parts: - Top-level Compilation control: - - {attr}`level` - - {attr}`debug_dump_path` - - {attr}`cache_dir` - - {attr}`backend` - - {attr}`custom_ops` - - {attr}`splitting_ops` + - [`level`][vllm.config.CompilationConfig.level] + - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path] + - [`cache_dir`][vllm.config.CompilationConfig.cache_dir] + - [`backend`][vllm.config.CompilationConfig.backend] + - [`custom_ops`][vllm.config.CompilationConfig.custom_ops] + - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops] - CudaGraph capture: - - {attr}`use_cudagraph` - - {attr}`cudagraph_capture_sizes` - - {attr}`cudagraph_num_of_warmups` - - {attr}`cudagraph_copy_inputs` - - {attr}`full_cuda_graph` + - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph] + - [`cudagraph_capture_sizes`] + [vllm.config.CompilationConfig.cudagraph_capture_sizes] + - [`cudagraph_num_of_warmups`] + [vllm.config.CompilationConfig.cudagraph_num_of_warmups] + - [`cudagraph_copy_inputs`] + [vllm.config.CompilationConfig.cudagraph_copy_inputs] + - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph] - Inductor compilation: - - {attr}`use_inductor` - - {attr}`compile_sizes` - - {attr}`inductor_compile_config` - - {attr}`inductor_passes` + - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] + - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] + - [`inductor_compile_config`] + [vllm.config.CompilationConfig.inductor_compile_config] + - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes] - custom inductor passes Why we have different sizes for cudagraph and inductor: diff --git a/vllm/connections.py b/vllm/connections.py index 9abc66050e18..84e32a4d5ca9 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -167,4 +167,7 @@ async def async_download_file( global_http_connection = HTTPConnection() -"""The global {class}`HTTPConnection` instance used by vLLM.""" +""" +The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used +by vLLM. +""" diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 56b9e49d24d9..19b219b674f3 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -475,7 +475,8 @@ async def add_request_async( *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: - """Async version of {meth}`add_request`.""" + """Async version of + [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].""" if inputs is not None: prompt = inputs assert prompt is not None and params is not None @@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async( class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for {class}`LLMEngine`. + """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. - This class is used to wrap the {class}`LLMEngine` class to make it - asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. The {class}`LLMEngine` is kicked by the - generate method when there are requests in the waiting queue. The generate - method yields the outputs from the {class}`LLMEngine` to the caller. + This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to + make it asynchronous. It uses asyncio to create a background loop that keeps + processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked + by the generate method when there are requests in the waiting queue. 
The + generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine] + to the caller. Args: log_requests: Whether to log the requests. start_engine_loop: If True, the background task to run the engine will be automatically started in the generate call. - *args: Arguments for {class}`LLMEngine`. - **kwargs: Arguments for {class}`LLMEngine`. + *args: Arguments for [`LLMEngine`][vllm.LLMEngine]. + **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine]. """ _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine @@ -985,8 +987,9 @@ async def generate( from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. @@ -1003,7 +1006,7 @@ async def generate( Details: - If the engine is not running, start the background loop, which iteratively invokes - {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` + [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step] to process the waiting requests. - Add the request to the engine's `RequestTracker`. On the next background loop, this request will be sent to @@ -1075,8 +1078,9 @@ async def encode( from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. @@ -1089,15 +1093,15 @@ async def encode( for the request. Details: - - If the engine is not running, start the background loop, - which iteratively invokes - {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. + - If the engine is not running, start the background loop, + which iteratively invokes + [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][] + to process the waiting requests. + - Add the request to the engine's `RequestTracker`. + On the next background loop, this request will be sent to + the underlying engine. + Also, a corresponding `AsyncStream` will be created. + - Wait for the request outputs from `AsyncStream` and yield them. Example: ``` diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5ca3ebe91d12..ff33d566ab68 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -130,11 +130,11 @@ class LLMEngine: iteration-level scheduling and efficient memory management to maximize the serving throughput. - The [LLM][vllm.LLM] class wraps this class for offline batched inference - and the [AsyncLLMEngine][] class wraps this class for online serving. + The [`LLM`][vllm.LLM] class wraps this class for offline batched inference + and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine] + class wraps this class for online serving. 
- The config arguments are derived from [EngineArgs][vllm.EngineArgs]. (See - [engine-args][]) + The config arguments are derived from [`EngineArgs`][vllm.EngineArgs]. Args: vllm_config: The configuration for initializing and running vLLM. diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index eea89a9a055f..18b7c187bdff 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -492,8 +492,9 @@ def generate( from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. @@ -561,8 +562,9 @@ def encode( from the LLMEngine to the caller. Args: - prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType` - for more details about the format of each input. + prompt: The prompt to the LLM. See + [`PromptType`][vllm.inputs.PromptType] for more details about + the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index ac234d25373d..434cb4985562 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -42,19 +42,22 @@ class MQLLMEngine: - """A multiprocessing wrapper for {class}`LLMEngine`. + """A multiprocessing wrapper for + [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. - This class is used to wrap the {class}`LLMEngine` class to enable use + This class is used to wrap the + [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use in concurrnet manner. It runs a background loop and uses zeromq to receive new requests and stream outputs incrementally via ipc. - The {class}`LLMEngine` generate or encode process is kicked off when a new - RPCProcessRequest is received by the input_socket. + The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode + process is kicked off when a new RPCProcessRequest is received by the + input_socket. The self.engine_loop checks the input_socket for new requests, adds them to the LLMEngine if there are any, calls the internal - {class}`LLMEngine.step()`, and sends the RequestOutputs back over - the output_socket. + [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends + the RequestOutputs back over the output_socket. If use_async_sockets is set, the logic associated with reading new requests from the socket and sending data to the socket is passed @@ -65,8 +68,8 @@ class MQLLMEngine: ipc_path: Base path for zeromq interprocess messaging use_async_sockets: Whether to make send/recv async with GPU log_requests: Whether to log the requests. - *args: Arguments for {class}`LLMEngine`. - **kwargs: Arguments for {class}`LLMEngine`. + *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. + **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. 
""" def __init__(self, diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 323580fa7482..110f84a65efc 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -56,8 +56,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, scheduled computation. Args: - seq_group: the outputs are associated with this {class}`SequenceGroup` - outputs: the {class}`SequenceGroupOutput`s for all scheduler steps + seq_group: the outputs are associated with this + [`SequenceGroup`][vllm.sequence.SequenceGroup] + outputs: the + [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s + for all scheduler steps """ for output in outputs: # Concatenate single-step prompt logprob processing results. diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index ea4b71a5b9cd..e88f119c8742 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -19,17 +19,21 @@ def single_step_process_prompt_logprob( sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, output: CompletionSequenceGroupOutput) -> None: - """Process prompt logprobs associated with the {class}`SequenceGroupOutput` - for a given step. + """Process prompt logprobs associated with the + [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step. Do nothing if the output has no prompt logprobs. Account for the fact that transformers do not compute first-token logprobs. Args: - sg_output_proc: {class}`SequenceGroupOutputProcessor` instance - seq_group: the output is associated with this {class}`SequenceGroup` - output: the {class}`SequenceGroupOutput` for a single scheduler step + sg_output_proc: + [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor] + instance + seq_group: the output is associated with this + [`SequenceGroup`][vllm.sequence.SequenceGroup] + output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] + for a single scheduler step """ prompt_logprobs = output.prompt_logprobs @@ -103,8 +107,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, scheduled computation. Args: - seq_group: the output is associated with this {class}`SequenceGroup` - outputs: the {class}`SequenceGroupOutput` for a single scheduler step + seq_group: the output is associated with this + [`SequenceGroup`][vllm.sequence.SequenceGroup] + outputs: the + [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] + for a single scheduler step """ assert len(outputs) == 1, "Single step should only have 1 output." output = outputs[0] diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f818e1737975..1c7bd65053f9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -129,8 +129,7 @@ class LLM: compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. - **kwargs: Arguments for [EngineArgs][vllm.EngineArgs]. (See - [engine-args][]) + **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs]. Note: This class is intended to be used for offline inference. For online @@ -494,7 +493,7 @@ def collective_rpc(self, `self` argument, in addition to the arguments passed in `args` and `kwargs`. The `self` argument will be the worker object. timeout: Maximum time in seconds to wait for execution. 
Raises a - {exc}`TimeoutError` on timeout. `None` means wait indefinitely. + [`TimeoutError`][] on timeout. `None` means wait indefinitely. args: Positional arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method. diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 93de9f3a5c05..c73575b48d9c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -582,7 +582,8 @@ def _tokenize_prompt_input( add_special_tokens: bool = True, ) -> TextTokensPrompt: """ - A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` + A simpler implementation of + [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] that assumes single input. """ return next( @@ -603,7 +604,8 @@ def _tokenize_prompt_inputs( add_special_tokens: bool = True, ) -> Iterator[TextTokensPrompt]: """ - A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs` + A simpler implementation of + [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] that assumes multiple inputs. """ for text in prompt_inputs: diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 522bd940211f..40ca1d29939a 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -74,7 +74,7 @@ def collective_rpc(self, `self` argument, in addition to the arguments passed in `args` and `kwargs`. The `self` argument will be the worker object. timeout: Maximum time in seconds to wait for execution. Raises a - {exc}`TimeoutError` on timeout. `None` means wait indefinitely. + [`TimeoutError`][] on timeout. `None` means wait indefinitely. args: Positional arguments to pass to the worker method. kwargs: Keyword arguments to pass to the worker method. diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 0673aece9108..df4f844cd815 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -10,8 +10,9 @@ INPUT_REGISTRY = InputRegistry() """ -The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine` -to dispatch data processing according to the target model. +The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used +by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the +target model. """ __all__ = [ diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 3b58ec47d5bf..843c45bd6163 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -80,22 +80,24 @@ class EmbedsPrompt(TypedDict): """ Set of possible schemas for a single prompt: -- A text prompt ({class}`str` or {class}`TextPrompt`) -- A tokenized prompt ({class}`TokensPrompt`) -- An embeddings prompt ({class}`EmbedsPrompt`) +- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt]) +- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt]) +- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt]) Note that "singleton" is as opposed to a data structure which encapsulates multiple prompts, i.e. of the sort which may be utilized for encoder/decoder models when the user desires to express both the encoder & decoder -prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt` +prompts explicitly, i.e. 
+[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] -A prompt of type {class}`SingletonPrompt` may be employed -as (1) input to a decoder-only model, (2) input to +A prompt of type [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] may be +employed as (1) input to a decoder-only model, (2) input to the encoder of an encoder/decoder model, in the scenario where the decoder-prompt is not specified explicitly, or (3) as a member of a larger data structure encapsulating -more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt` +more than one prompt, i.e. +[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] """ @@ -126,18 +128,20 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): comprising an explicit encoder prompt and a decoder prompt. The encoder and decoder prompts, respectively, may be formatted - according to any of the {class}`SingletonPrompt` schemas, + according to any of the + [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] schemas, and are not required to have the same schema. Only the encoder prompt may have multi-modal data. mm_processor_kwargs should be at the top-level, and should not be set in the encoder/decoder prompts, since they are agnostic to the encoder/decoder. - Note that an {class}`ExplicitEncoderDecoderPrompt` may not - be used as an input to a decoder-only model, + Note that an + [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] + may not be used as an input to a decoder-only model, and that the `encoder_prompt` and `decoder_prompt` fields of this data structure themselves must be - {class}`SingletonPrompt` instances. + [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] instances. """ encoder_prompt: _T1_co @@ -152,11 +156,11 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): Set of possible schemas for an LLM input, including both decoder-only and encoder/decoder input types: -- A text prompt ({class}`str` or {class}`TextPrompt`) -- A tokenized prompt ({class}`TokensPrompt`) -- An embeddings prompt ({class}`EmbedsPrompt`) +- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt]) +- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt]) +- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt]) - A single data structure containing both an encoder and a decoder prompt - ({class}`ExplicitEncoderDecoderPrompt`) + ([`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]) """ @@ -189,7 +193,8 @@ def token_inputs( prompt: Optional[str] = None, cache_salt: Optional[str] = None, ) -> TokenInputs: - """Construct {class}`TokenInputs` from optional values.""" + """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional + values.""" inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) if prompt is not None: @@ -221,7 +226,8 @@ def embeds_inputs( prompt_embeds: torch.Tensor, cache_salt: Optional[str] = None, ) -> EmbedsInputs: - """Construct :class:`EmbedsInputs` from optional values.""" + """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional + values.""" inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds) if cache_salt is not None: @@ -232,7 +238,7 @@ def embeds_inputs( DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -The inputs in {class}`~vllm.LLMEngine` before they are +The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are passed to the 
model executor. This specifies the data required for decoder-only models. """ @@ -240,11 +246,12 @@ def embeds_inputs( class EncoderDecoderInputs(TypedDict): """ - The inputs in {class}`~vllm.LLMEngine` before they are - passed to the model executor. + The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they + are passed to the model executor. This specifies the required data for encoder-decoder models. """ + encoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the encoder portion.""" @@ -254,13 +261,13 @@ class EncoderDecoderInputs(TypedDict): SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ -A processed {class}`SingletonPrompt` which can be passed to -{class}`vllm.sequence.Sequence`. +A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be +passed to [`vllm.sequence.Sequence`][]. """ ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] """ -The inputs to {data}`vllm.inputs.InputProcessor`. +The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][]. """ _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) @@ -277,7 +284,8 @@ def build_explicit_enc_dec_prompt( return ExplicitEncoderDecoderPrompt( encoder_prompt=encoder_prompt, decoder_prompt=decoder_prompt, - mm_processor_kwargs=mm_processor_kwargs) + mm_processor_kwargs=mm_processor_kwargs, + ) def zip_enc_dec_prompts( @@ -288,7 +296,8 @@ def zip_enc_dec_prompts( ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - {class}`ExplicitEncoderDecoderPrompt` instances. + [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt] + instances. ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same dictionary will be used for every encoder/decoder prompt. If an iterable is @@ -299,10 +308,11 @@ def zip_enc_dec_prompts( if isinstance(mm_processor_kwargs, dict): return [ build_explicit_enc_dec_prompt( - encoder_prompt, decoder_prompt, - cast(dict[str, Any], mm_processor_kwargs)) - for (encoder_prompt, - decoder_prompt) in zip(enc_prompts, dec_prompts) + encoder_prompt, + decoder_prompt, + cast(dict[str, Any], mm_processor_kwargs), + ) for (encoder_prompt, + decoder_prompt) in zip(enc_prompts, dec_prompts) ] return [ build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt, diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index d17122b48344..4c64a41ace31 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -23,13 +23,13 @@ class ParsedTokens(TypedDict): @overload def parse_and_batch_prompt( - prompt: Union[str, list[str]]) -> Sequence[ParsedText]: + prompt: Union[str, list[str]], ) -> Sequence[ParsedText]: ... @overload def parse_and_batch_prompt( - prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]: + prompt: Union[list[int], list[list[int]]], ) -> Sequence[ParsedTokens]: ... 
@@ -86,7 +86,7 @@ class ParsedTokensPrompt(TypedDict): class ParsedEmbedsPrompt(TypedDict): - type: Literal['embeds'] + type: Literal["embeds"] content: EmbedsPrompt @@ -133,7 +133,7 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt: def is_explicit_encoder_decoder_prompt( - prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: + prompt: PromptType, ) -> TypeIs[ExplicitEncoderDecoderPrompt]: return isinstance(prompt, dict) and "encoder_prompt" in prompt diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 6e8effd60274..b9acabeabd8d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,11 +67,11 @@ def get_eos_token_id(self, return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id def get_decoder_start_token_id(self) -> Optional[int]: - ''' + """ Obtain the decoder start token id employed by an encoder/decoder model. Returns None for non-encoder/decoder models or if the model config is unavailable. - ''' + """ if not self.model_config.is_encoder_decoder: logger.warning_once( @@ -79,14 +79,14 @@ def get_decoder_start_token_id(self) -> Optional[int]: "this is not an encoder/decoder model.") return None - if (self.model_config is None or self.model_config.hf_config is None): + if self.model_config is None or self.model_config.hf_config is None: logger.warning_once( "Using None for decoder start token id because " "model config is not available.") return None dec_start_token_id = getattr(self.model_config.hf_config, - 'decoder_start_token_id', None) + "decoder_start_token_id", None) if dec_start_token_id is None: logger.warning_once( "Falling back on <BOS> for decoder start token " @@ -97,7 +97,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: return dec_start_token_id def _get_default_enc_dec_decoder_prompt(self) -> list[int]: - ''' + """ Specifically for encoder/decoder models: generate a default decoder prompt for when the user specifies only the encoder prompt. @@ -126,7 +126,7 @@ def _get_default_enc_dec_decoder_prompt(self) -> list[int]: Returns: * prompt_token_ids - ''' + """ bos_token_id = self.get_bos_token_id() assert bos_token_id is not None @@ -224,7 +224,10 @@ async def _tokenize_prompt_async( lora_request: Optional[LoRARequest], tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> list[int]: - """Async version of {meth}`_tokenize_prompt`.""" + """ + Async version of + [`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt]. + """ tokenizer = self.get_tokenizer_group() tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) @@ -287,7 +290,10 @@ async def _process_multimodal_async( lora_request: Optional[LoRARequest], return_mm_hashes: bool = False, ) -> MultiModalInputs: - """Async version of {meth}`_process_multimodal`.""" + """ + Async version of + [`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal]. 
+ """ tokenizer = await self._get_mm_tokenizer_async(lora_request) mm_processor = self.mm_registry.create_processor(self.model_config, @@ -472,7 +478,7 @@ def _prompt_to_llm_inputs( Returns: - * {class}`SingletonInputs` instance + * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance """ parsed = parse_singleton_prompt(prompt) @@ -508,7 +514,10 @@ async def _prompt_to_llm_inputs_async( lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: - """Async version of {meth}`_prompt_to_llm_inputs`.""" + """ + Async version of + [`_prompt_to_llm_inputs`][vllm.inputs.preprocess.InputPreprocessor._prompt_to_llm_inputs]. + """ parsed = parse_singleton_prompt(prompt) if parsed["type"] == "embeds": @@ -644,7 +653,9 @@ def _process_encoder_decoder_prompt( ) -> EncoderDecoderInputs: """ For encoder/decoder models only: - Process an input prompt into an {class}`EncoderDecoderInputs` instance. + Process an input prompt into an + [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] + instance. There are two types of input prompts: singleton prompts which carry only the @@ -670,7 +681,8 @@ def _process_encoder_decoder_prompt( Returns: - * {class}`EncoderDecoderInputs` instance + * [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] + instance """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -710,7 +722,10 @@ async def _process_encoder_decoder_prompt_async( prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, ) -> EncoderDecoderInputs: - """Async version of {meth}`_process_encoder_decoder_prompt`.""" + """ + Async version of + [`_process_encoder_decoder_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_encoder_decoder_prompt]. + """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -778,7 +793,8 @@ def _process_decoder_only_prompt( ) -> DecoderOnlyInputs: """ For decoder-only models: - Process an input prompt into an {class}`DecoderOnlyInputs` instance. + Process an input prompt into a + [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance. Arguments: @@ -789,7 +805,7 @@ def _process_decoder_only_prompt( Returns: - * {class}`DecoderOnlyInputs` instance + * [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance """ prompt_comps = self._prompt_to_llm_inputs( @@ -812,7 +828,10 @@ async def _process_decoder_only_prompt_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> DecoderOnlyInputs: - """Async version of {meth}`_process_decoder_only_prompt`.""" + """ + Async version of + [`_process_decoder_only_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_decoder_only_prompt]. + """ prompt_comps = await self._prompt_to_llm_inputs_async( prompt, tokenization_kwargs=tokenization_kwargs, @@ -863,7 +882,10 @@ async def preprocess_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> ProcessorInputs: - """Async version of {meth}`preprocess`.""" + """ + Async version of + [`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess]. 
+ """ if self.model_config.is_encoder_decoder: assert not return_mm_hashes, ( "Multimodal hashes for encoder-decoder models should not be ", diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 148b3558c15e..f424a8f613ab 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -38,7 +38,7 @@ def get_hf_config( ) -> _C: """ Get the HuggingFace configuration - ({class}`transformers.PretrainedConfig`) of the model, + (`transformers.PretrainedConfig`) of the model, additionally checking its type. Raises: @@ -79,7 +79,7 @@ def get_hf_processor( ) -> _P: """ Get the HuggingFace processor - ({class}`transformers.ProcessorMixin`) of the model, + (`transformers.ProcessorMixin`) of the model, additionally checking its type. Raises: diff --git a/vllm/logger.py b/vllm/logger.py index cf32041c5b70..fd16dd95bb1b 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -68,22 +68,22 @@ class _VllmLogger(Logger): """ Note: This class is just to provide type information. - We actually patch the methods directly on the {class}`logging.Logger` + We actually patch the methods directly on the [`logging.Logger`][] instance to avoid conflicting with other libraries such as `intel_extension_for_pytorch.utils._logger`. """ def info_once(self, msg: str, *args: Hashable) -> None: """ - As {meth}`info`, but subsequent calls with the same message - are silently dropped. + As [`info`][logging.Logger.info], but subsequent calls with + the same message are silently dropped. """ _print_info_once(self, msg, *args) def warning_once(self, msg: str, *args: Hashable) -> None: """ - As {meth}`warning`, but subsequent calls with the same message - are silently dropped. + As [`warning`][logging.Logger.warning], but subsequent calls with + the same message are silently dropped. """ _print_warning_once(self, msg, *args) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index d6b910e4b75a..32375db0c8f1 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -228,17 +228,19 @@ def forward( ) -> Optional[SamplerOutput]: """ Single-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Pythonize sampling result & logprobs tensor + * Perform GPU-side sampling computation & compute + GPU-side logprobs tensor + * Pythonize sampling result & logprobs tensor Multi-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Defer Pythonization of sampling result & logprobs - tensor - * Encapsulate arguments required for deferred Pythonization - in the {class}`SamplerOutput` structure + * Perform GPU-side sampling computation & compute + GPU-side logprobs tensor + * Defer Pythonization of sampling result & logprobs + tensor + * Encapsulate arguments required for deferred Pythonization + in the + [`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput] + structure Args: logits: (num_tokens, vocab_size). diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8f33a3e29c60..8be8841c1f6c 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -226,9 +226,11 @@ def forward( intermediate_tensors: Optional["IntermediateTensors"], ) -> Union[Tensor, "IntermediateTensors"]: """ - Accept {class}`IntermediateTensors` when PP rank > 0. + Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when + PP rank > 0. 
- Return {class}`IntermediateTensors` only for the last PP rank. + Return [`IntermediateTensors`][vllm.sequence.IntermediateTensors] only + for the last PP rank. """ ... diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index e215582a37ac..640a2049a629 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -965,7 +965,7 @@ def select_tiling( class MolmoProcessorWrapper: """ - Wraps {class}`MolmoProcessor` so that it can be called directly. + Wraps `MolmoProcessor` so that it can be called directly. The original definition can be found here: https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index bbaa85cf54df..9f28d4cef425 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -67,14 +67,14 @@ class PixtralImagePixelInputs(TypedDict): """ Shape: `(batch_size * num_images, num_channels, image_width, image_height)` - The result of stacking {attr}`ImageEncoding.tokens` from each prompt. + The result of stacking `ImageEncoding.tokens` from each prompt. """ class PixtralProcessorAdapter: """ Provide a HF-compatible interface for - {class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. + `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`. """ def __init__(self, tokenizer: MistralTokenizer) -> None: diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 57a66b793711..f5d242fdf1c2 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -382,7 +382,8 @@ def _get_tokenizer_without_image_pad( tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: """ The logic of adding image pad tokens should only be applied in - {class}`QwenVLProcessor`, so they are patched out here. + [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor], + so they are patched out here. The definition of the wrapped tokenizer can be found here: https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3d842848a419..97ea12de6537 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -383,7 +383,7 @@ def register_model( `model_cls` can be either: - - A {class}`torch.nn.Module` class directly referencing the model. + - A [`torch.nn.Module`][] class directly referencing the model. - A string in the format `<module>:<class>` which can be used to lazily import the model. This is useful to avoid initializing CUDA when importing the model and thus the related error diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 027cd748e9de..3d821d3dc6b5 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -66,7 +66,7 @@ def apply( class AutoWeightsLoader: """ - Helper class to load weights into a {class}`torch.nn.Module`. It is able + Helper class to load weights into a [`torch.nn.Module`][]. It is able to automatically detect child modules and parameters while iterating over the weights only once. 
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 22fee2f74712..815e34d5ac5d 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -8,11 +8,12 @@ MULTIMODAL_REGISTRY = MultiModalRegistry() """ -The global {class}`~MultiModalRegistry` is used by model runners to -dispatch data processing according to the target model. +The global [`MultiModalRegistry`][vllm.multimodal.registry.MultiModalRegistry] +is used by model runners to dispatch data processing according to the target +model. Info: - [mm-processing][] + [mm_processing](../../../design/mm_processing.html) """ __all__ = [ diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 71ef1a98e0d0..162dd52e3e73 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -29,14 +29,14 @@ HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"] """ -A {class}`transformers.image_utils.ImageInput` representing a single image +A `transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. """ HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]] """ -A {class}`transformers.image_utils.VideoInput` representing a single video +A `transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. """ @@ -48,7 +48,7 @@ ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"] """ -A {class}`transformers.image_utils.ImageInput` representing a single image +A `transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. Alternatively, a 3-D tensor or batch of 2-D tensors, @@ -58,7 +58,7 @@ VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"] """ -A {class}`transformers.image_utils.VideoInput` representing a single video +A `transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. Alternatively, a 3-D tensor or batch of 2-D tensors, @@ -108,7 +108,8 @@ class MultiModalDataBuiltins(TypedDict, total=False): """ A dictionary containing an entry for each modality type to input. -The built-in modalities are defined by {class}`MultiModalDataBuiltins`. +The built-in modalities are defined by +[`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins]. """ @@ -169,7 +170,8 @@ def __eq__(self, other: object) -> bool: def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: - """Equality check between {data}`NestedTensors` objects.""" + """Equality check between + [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.""" if isinstance(a, torch.Tensor): return isinstance(b, torch.Tensor) and torch.equal(a, b) elif isinstance(b, torch.Tensor): @@ -189,7 +191,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via -{meth}`MultiModalKwargs.batch`. +[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch]. """ @@ -197,7 +199,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: class MultiModalFieldElem: """ Represents a keyword argument corresponding to a multi-modal item - in {class}`MultiModalKwargs`. + in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs]. 
""" modality: str @@ -208,13 +210,15 @@ class MultiModalFieldElem: key: str """ - The key of this field in {class}`MultiModalKwargs`, + The key of this field in + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], i.e. the name of the keyword argument to be passed to the model. """ data: NestedTensors """ - The tensor data of this field in {class}`MultiModalKwargs`, + The tensor data of this field in + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs], i.e. the value of the keyword argument to be passed to the model. """ @@ -237,7 +241,8 @@ def __eq__(self, other: object) -> bool: class BaseMultiModalField(ABC): """ Defines how to interpret tensor data belonging to a keyword argument in - {class}`MultiModalKwargs` for multiple multi-modal items, and vice versa. + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] for multiple + multi-modal items, and vice versa. """ def _field_factory(self, *, modality: str, key: str): @@ -262,10 +267,12 @@ def build_elems( data: NestedTensors, ) -> Sequence[MultiModalFieldElem]: """ - Construct {class}`MultiModalFieldElem` instances to represent - the provided data. + Construct + [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem] + instances to represent the provided data. - This is the inverse of {meth}`reduce_data`. + This is the inverse of + [`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data]. """ raise NotImplementedError @@ -275,9 +282,11 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: """ - Merge the data from multiple instances of {class}`MultiModalFieldElem`. + Merge the data from multiple instances of + [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]. - This is the inverse of {meth}`build_elems`. + This is the inverse of + [`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems]. """ field_types = [type(item.field) for item in elems] if len(set(field_types)) > 1: @@ -290,7 +299,7 @@ def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: class MultiModalBatchedField(BaseMultiModalField): """ Info: - [MultiModalFieldConfig.batched][] + [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched] """ def build_elems( @@ -320,8 +329,8 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: class MultiModalFlatField(BaseMultiModalField): """ Info: - [MultiModalFieldConfig.flat][] - [MultiModalFieldConfig.flat_from_sizes][] + [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat] + [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes] """ slices: Union[Sequence[slice], Sequence[Sequence[slice]]] dim: int = 0 @@ -362,7 +371,7 @@ def _expect_same_shape(tensor: torch.Tensor): class MultiModalSharedField(BaseMultiModalField): """ Info: - [MultiModalFieldConfig.shared][] + [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared] """ batch_size: int @@ -508,7 +517,7 @@ def flat_from_sizes(modality: str, ``` Info: - [MultiModalFieldConfig.flat][] + [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat] """ if size_per_item.ndim != 1: @@ -572,8 +581,10 @@ def build_elems( class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): """ - A collection of {class}`MultiModalFieldElem` - corresponding to a data item in {class}`MultiModalDataItems`. 
+ A collection of + [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem] + corresponding to a data item in + [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. """ @staticmethod @@ -592,11 +603,13 @@ def modality(self) -> str: class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to - {meth}`~torch.nn.Module.forward`. + [`torch.nn.Module.forward`][]. The metadata `items` enables us to obtain the keyword arguments - corresponding to each data item in {class}`MultiModalDataItems`, via - {meth}`get_item` and {meth}`get_items`. + corresponding to each data item in + [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via + [`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and + [`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items]. """ @staticmethod @@ -635,7 +648,9 @@ def from_hf_inputs( @staticmethod def from_items(items: Sequence[MultiModalKwargsItem]): - """Construct a new {class}`MultiModalKwargs` from multiple items.""" + """Construct a new + [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] + from multiple items.""" elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) for item in items: for key, elem in item.items(): @@ -800,7 +815,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: class MultiModalInputs(TypedDict): """ Represents the outputs of - {class}`vllm.multimodal.processing.BaseMultiModalProcessor`, + [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor], ready to be passed to vLLM internals. """ @@ -836,7 +851,8 @@ class MultiModalInputs(TypedDict): class MultiModalEncDecInputs(MultiModalInputs): """ - Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor` + Represents the outputs of + [`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor] ready to be passed to vLLM internals. """ diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 6e9ec9555802..63af842747a5 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -28,7 +28,8 @@ class ModalityDataItems(ABC, Generic[_T, _I]): """ - Represents data items for a modality in {class}`MultiModalDataItems`. + Represents data items for a modality in + [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. """ def __init__(self, data: _T, modality: str) -> None: @@ -251,15 +252,15 @@ def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None: class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): """ - As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized - such that each entry corresponds to a list. + As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but + normalized such that each entry corresponds to a list. """ def get_count(self, modality: str, *, strict: bool = True) -> int: """ Get the number of data items belonging to a modality. - If `strict=False`, return `0` instead of raising {exc}`KeyError` + If `strict=False`, return `0` instead of raising [`KeyError`][] even if the modality is not found. """ if modality not in self: @@ -305,8 +306,8 @@ def get_items( class MultiModalDataParser: """ - Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into - {class}`MultiModalDataItems`. + Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict] + into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]. 
Args: target_sr (float, optional): Enables automatic resampling of audio diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index f7a3c327982d..aa7914e40cbf 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -114,13 +114,14 @@ class PromptUpdateDetails(Generic[_S]): is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None """ - Given {attr}`full`, return a boolean mask of shape `(len(full),)` - indicating which positions of `full` to assign embeddings to. + Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full], + return a boolean mask of shape `(len(full),)` indicating which positions + of `full` to assign embeddings to. `None` (default) means to assign embeddings to all positions of `full`. The embeddings are obtained by calling - {class}`SupportsMultiModal.get_multimodal_embeddings`. + [`SupportsMultiModal.get_multimodal_embeddings`][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings]. """ @staticmethod @@ -159,13 +160,15 @@ def select_token_id( The token sequence or text that are part of the update. If only part of the content corresponds to feature placeholders, you can -use {class}`PromptUpdateDetails` to specify which part. +use [`PromptUpdateDetails`][vllm.multimodal.processing.PromptUpdateDetails] to +specify which part. """ PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], PromptUpdateInfo] """ -Given the index of the processed item within {attr}`modality`, +Given the index of the processed item within +[`modality`][vllm.multimodal.processing.PromptUpdate.modality], output the corresponding token sequence (or text). For convenience, you can directly pass in the token sequence (or text) @@ -260,8 +263,10 @@ class PromptInsertion(PromptUpdate): insertion: PromptUpdateContent = field(repr=False) """ - Given the index of the processed item within {attr}`modality`, - output the token sequence (or text) to insert right after {attr}`target`. + Given the index of the processed item within + [`modality`][vllm.multimodal.processing.PromptUpdate.modality], + output the token sequence (or text) to insert right after + [`target`][vllm.multimodal.processing.PromptUpdate.target]. For convenience, you can directly pass in the token sequence (or text) instead of a function if it does not depend on the input. @@ -332,8 +337,10 @@ class PromptReplacement(PromptUpdate): replacement: PromptUpdateContent = field(repr=False) """ - Given the index of the processed item within {attr}`modality`, - output the token sequence (or text) to replace {attr}`target`. + Given the index of the processed item within + [`modality`][vllm.multimodal.processing.PromptUpdate.modality], + output the token sequence (or text) to replace + [`target`][vllm.multimodal.processing.PromptUpdate.target]. For convenience, you can directly pass in the token sequence (or text) instead of a function if it does not depend on the input. 
@@ -387,14 +394,16 @@ def modality(self) -> str: def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: - """Convenience function to apply [full_groupby][] based on modality.""" + """Convenience function to apply [`full_groupby`][vllm.utils.full_groupby] + based on modality.""" return full_groupby(values, key=lambda x: x.modality) @dataclass class _BoundPromptSequence: """ - A {data}`_PromptSeq` bound to a tokenizer to automatically + A [`_PromptSeq`][vllm.multimodal.processing.PromptSeq] bound + to a tokenizer to automatically convert between token sequence and text representations. """ tokenizer: AnyTokenizer = field(repr=False) @@ -446,9 +455,11 @@ class _BoundPromptContent: @dataclass class BoundPromptUpdate: """ - A {class}`PromptUpdate` bound to a tokenizer to automatically convert - {attr}`target` and the result of {meth}`get_content` between - token sequence and text representations. + A [`PromptUpdate`][vllm.multimodal.processing.PromptUpdate] bound + to a tokenizer to automatically convert + [`target`][vllm.multimodal.processing.PromptUpdate.target] and the result of + [`get_content`][vllm.multimodal.processing.BoundPromptUpdate.get_content] + between token sequence and text representations. """ _origin: PromptUpdate tokenizer: AnyTokenizer = field(repr=False) @@ -482,7 +493,8 @@ def mode(self) -> UpdateMode: def get_content(self, item_idx: int) -> _BoundPromptContent: """ - Given the index of the processed item within {attr}`modality`, + Given the index of the processed item within + [`modality`][vllm.multimodal.processing.PromptUpdate.modality], output the token sequence (or text) to update. """ content = self.content @@ -1019,7 +1031,8 @@ def put( ) -> None: """ Put a processed multi-modal item into the cache - according to its dependencies (see {meth}`get`). + according to its dependencies + (see [`get`][vllm.multimodal.processing.ProcessingCache.get]). """ cache_key = MultiModalHasher.hash_kwargs(model_id=model_id, **{modality: input_item}, @@ -1091,7 +1104,8 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]: MultiModalHashes = dict[str, list[str]] """ -A collection of hashes with a similar structure as {class}`MultiModalKwargs`. +A collection of hashes with a similar structure as +[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs]. """ @@ -1099,7 +1113,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): """ Abstract base class to process multi-modal inputs to be used in vLLM. - Not to be confused with {class}`transformers.ProcessorMixin`. + Not to be confused with `transformers.ProcessorMixin`. """ def __init__(self, @@ -1126,10 +1140,12 @@ def __call__( def _get_data_parser(self) -> MultiModalDataParser: """ Construct a parser to preprocess multi-modal data items - before passing them to {meth}`_get_hf_mm_data`. + before passing them to + [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. You can support additional modalities by creating a subclass - of {class}`MultiModalDataParser` that has additional subparsers. + of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser] + that has additional subparsers. """ return MultiModalDataParser() @@ -1138,8 +1154,11 @@ def _to_mm_items( mm_data: MultiModalDataDict, ) -> MultiModalDataItems: """ - Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems` - before passing them to {meth}`_get_hf_mm_data`. 
+ Normalize + [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict] + to [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems] + before passing them to + [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. """ mm_items = self.data_parser.parse_mm_data(mm_data) supported_mm_limits = self.info.get_supported_mm_limits() @@ -1191,7 +1210,8 @@ def _get_prompt_updates( inputs. Moreover, this information is critical to determine the token positions - in order to construct {class}`~vllm-multimodal.input.PlaceholderRange` + in order to construct + [`PlaceholderRange`][vllm.multimodal.inputs.PlaceholderRange] for each multi-modal item. """ raise NotImplementedError @@ -1315,7 +1335,9 @@ def _apply_hf_processor_tokens_only( Most HF processors accept prompt text but not prompt tokens. If the HF processor adds or removes tokens that are not related to multi-modal data, you should override this method so it is consistent - with the output of {meth}`_apply_hf_processor_text_only` on the + with the output of + [`_apply_hf_processor_text_only`][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_text_only] + on the corresponding text. """ return prompt_tokens @@ -1330,7 +1352,8 @@ def _apply_hf_processor_mm_only( Since HF processor requires that text and multi-modal items correspond to each other, we generate dummy text using - {class}`DummyInputsBuilder` to go along with the multi-modal data. + [`DummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder] + to go along with the multi-modal data. """ mm_counts = mm_items.get_all_counts() diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 59427f35293a..a85b13fb2387 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -25,7 +25,7 @@ class ProcessorInputs: """ Represents the keyword arguments to - {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. + [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][]. """ prompt: Union[str, list[int]] mm_data: MultiModalDataDict diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 0d0d4a4363f4..b9f5cee922a7 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -29,7 +29,11 @@ class ProcessingInfoFactory(Protocol[_I_co]): - """Constructs a {class}`MultiModalProcessor` instance from the context.""" + """ + Constructs a + [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor] + instance from the context. + """ def __call__( self, @@ -40,7 +44,9 @@ def __call__( class DummyInputsBuilderFactory(Protocol[_I]): """ - Constructs a {class}`BaseDummyInputsBuilder` instance from the context. + Constructs a + [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder] + instance from the context. """ def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: @@ -48,7 +54,11 @@ def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]: class MultiModalProcessorFactory(Protocol[_I]): - """Constructs a {class}`MultiModalProcessor` instance from the context.""" + """ + Constructs a + [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor] + instance from the context. + """ def __call__( self, @@ -155,8 +165,6 @@ def get_max_tokens_by_modality( """ Get the maximum number of tokens from each modality for profiling the memory usage of a model. - - See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details. 
""" mm_limits = self.get_mm_limits_per_prompt(model_config) @@ -170,8 +178,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. - - See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details. """ return sum(self.get_max_tokens_by_modality(model_config).values()) @@ -213,9 +219,6 @@ def register_processor( When the model receives multi-modal data, the provided function is invoked to transform the data into a dictionary of model inputs. - - Info: - [mm-processing][] """ def wrapper(model_cls: N) -> N: @@ -258,9 +261,6 @@ def create_processor( ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. - - Info: - [mm-processing][] """ if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index aef5f669ac68..9ddba67bff70 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -259,7 +259,8 @@ def fetch_image_embedding( global_media_connector = MediaConnector() -"""The global {class}`MediaConnector` instance used by vLLM.""" +"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector] +instance used by vLLM.""" fetch_audio = global_media_connector.fetch_audio fetch_image = global_media_connector.fetch_image diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 646faa944565..504c3b42a75d 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -84,7 +84,7 @@ def as_version_str(self) -> str: def to_int(self) -> int: """ - Express device capability as an integer ``<major><minor>``. + Express device capability as an integer `<major><minor>`. It is assumed that the minor version is always a single digit. """ @@ -206,10 +206,11 @@ def has_device_capability( """ Test whether this platform is compatible with a device capability. - The ``capability`` argument can either be: + The `capability` argument can either be: - - A tuple ``(major, minor)``. - - An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`) + - A tuple `(major, minor)`. + - An integer `<major><minor>`. (See + [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int]) """ current_capability = cls.get_device_capability(device_id=device_id) if current_capability is None: diff --git a/vllm/sequence.py b/vllm/sequence.py index a4b4bd66c843..d359f897da25 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -27,7 +27,7 @@ def array_full(token_id: int, count: int): - """{class}`array` equivalent of [numpy.full][].""" + """[`array`][] equivalent of [numpy.full][].""" return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count @@ -192,8 +192,8 @@ class SequenceData(msgspec.Struct, def from_prompt_token_counts( *token_counts: tuple[int, int]) -> "SequenceData": """ - Construct a {class}`SequenceData` instance by concatenating - prompt token sequences. + Construct a [`SequenceData`][vllm.sequence.SequenceData] instance + by concatenating prompt token sequences. Each tuple represents one token sequence, expressed in the form `(token_id, count)`. @@ -216,8 +216,8 @@ def from_seqs( prompt_embeds: Optional[torch.Tensor] = None, ) -> "SequenceData": """ - Construct a {class}`SequenceData` instance from prompt and output - token sequences. 
+ Construct a [`SequenceData`][vllm.sequence.SequenceData] instance + from prompt and output token sequences. """ prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids) @@ -452,9 +452,11 @@ def __repr__(self) -> str: class Sequence: """Stores the data, status, and block information of a sequence. - The sequence is constructed from the {data}`DecoderOnlyInputs` - (for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder) - instance passed in through the `inputs` constructor argument. + The sequence is constructed from the + [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only) + or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] + (for encoder-decoder) instance passed in through the `inputs` + constructor argument. Args: seq_id: The ID of the sequence. diff --git a/vllm/utils.py b/vllm/utils.py index 86873ff75817..7222a3c99102 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1005,7 +1005,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]: def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]): """ - Unlike {class}`itertools.groupby`, groups are not broken by + Unlike [`itertools.groupby`][], groups are not broken by non-contiguous data. """ groups = defaultdict[_K, list[_V]](list) @@ -1926,7 +1926,8 @@ class _PlaceholderBase: Disallows downstream usage of placeholder modules. We need to explicitly override each dunder method because - {meth}`__getattr__` is not called when they are accessed. + [`__getattr__`][vllm.utils._PlaceholderBase.__getattr__] + is not called when they are accessed. Info: [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 28503a0a926d..91548a52cfc7 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs( ) -> None: """ Perform sanity checks for the result of - {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`. + [`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][]. """ assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), ( "Expected multimodal embeddings to be a list/tuple of 2D tensors, " @@ -39,7 +39,7 @@ def scatter_mm_placeholders( Scatter the multimodal embeddings into a contiguous tensor that represents the placeholder tokens. - {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`. + [`vllm.multimodal.processing.PromptUpdateDetails.is_embed`][]. Args: embeds: The multimodal embeddings. diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index d9cf2055ed56..f8d5acf586c5 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -733,12 +733,13 @@ def _pythonize_sampler_output( logprobs_tensor: Optional[torch.Tensor], cache: Optional[PythonizationCache], ) -> None: - """ This function is only called when the output tensors are ready. - See {class}`ModelOutput`. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, + """ This function is only called when the output tensors are ready. + See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput]. + + Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, adding a Pythonized output data structure - ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`. 
+ ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput]) + for each [`SequenceGroup`][vllm.sequence.SequenceGroup]. Args: model_input From 4318c0559d9edab7c644d6a1968fdc008bc75104 Mon Sep 17 00:00:00 2001 From: Cyrus Leung <tlleungac@connect.ust.hk> Date: Tue, 27 May 2025 17:19:18 +0800 Subject: [PATCH 187/192] [CI/Build] Remove imports of built-in `re` (#18750) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> --- .pre-commit-config.yaml | 1 + docs/mkdocs/hooks/generate_examples.py | 4 ++-- docs/mkdocs/hooks/url_schemes.py | 3 +-- requirements/docs.txt | 1 + tools/check_triton_import.py | 3 ++- .../openai/tool_parsers/llama4_pythonic_tool_parser.py | 3 +-- vllm/model_executor/guided_decoding/guidance_decoding.py | 2 +- vllm/model_executor/guided_decoding/outlines_decoding.py | 2 +- 8 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d0fa4e8f64cc..b45619a3234c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -142,6 +142,7 @@ repos: language: python types: [python] pass_filenames: false + additional_dependencies: [regex] # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index c2f1f2d96f00..6f290efe45c2 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 - import itertools -import re from dataclasses import dataclass, field from pathlib import Path from typing import Literal +import regex as re + ROOT_DIR = Path(__file__).parent.parent.parent.parent ROOT_DIR_RELATIVE = '../../../../..' EXAMPLE_DIR = ROOT_DIR / "examples" diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py index 03e7ffbb2733..c738828085ba 100644 --- a/docs/mkdocs/hooks/url_schemes.py +++ b/docs/mkdocs/hooks/url_schemes.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import re - +import regex as re from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.files import Files from mkdocs.structure.pages import Page diff --git a/requirements/docs.txt b/requirements/docs.txt index a1f51334ed81..64c70cb65c55 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -5,4 +5,5 @@ mkdocstrings-python mkdocs-gen-files mkdocs-awesome-nav python-markdown-math +regex ruff diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py index d938ff1df594..18c9726a11ac 100644 --- a/tools/check_triton_import.py +++ b/tools/check_triton_import.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -import re import subprocess import sys +import regex as re + FORBIDDEN_IMPORT_RE = re.compile(r"^(from|import)\s+triton(\s|\.|$)") # the way allowed to import triton diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py index f483ac4eeee6..858c8db99fd2 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -1,11 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 - import ast import json -import re from collections.abc import Sequence from typing import Any, Union +import regex as re from transformers import PreTrainedTokenizerBase from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py 
b/vllm/model_executor/guided_decoding/guidance_decoding.py index 0b1f4762bc73..58adcc3caff9 100644 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ b/vllm/model_executor/guided_decoding/guidance_decoding.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from re import escape as regex_escape import llguidance +from regex import escape as regex_escape from transformers import PreTrainedTokenizerBase from vllm.model_executor.guided_decoding.guidance_logits_processors import ( diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index bcd7494e6cec..e41af4b360e4 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -5,9 +5,9 @@ import os from enum import Enum from json import dumps as json_dumps -from re import escape as regex_escape from typing import Optional, Union +from regex import escape as regex_escape from transformers import PreTrainedTokenizerBase from vllm.model_executor.guided_decoding.outlines_logits_processors import ( From 06a0338015a3ac959edb4d152a69793359983a4d Mon Sep 17 00:00:00 2001 From: Mark McLoughlin <markmc@redhat.com> Date: Tue, 27 May 2025 10:37:06 +0100 Subject: [PATCH 188/192] [V1][Metrics] Add API for accessing in-memory Prometheus metrics (#17010) Signed-off-by: Mark McLoughlin <markmc@redhat.com> --- .buildkite/test-pipeline.yaml | 1 + examples/offline_inference/eagle.py | 36 ++-- examples/offline_inference/metrics.py | 49 ++++++ tests/v1/engine/test_llm_engine.py | 65 +++++++ tests/v1/test_metrics_reader.py | 112 ++++++++++++ vllm/entrypoints/llm.py | 20 ++- vllm/v1/engine/llm_engine.py | 29 ++- vllm/v1/metrics/loggers.py | 8 +- vllm/v1/metrics/reader.py | 245 ++++++++++++++++++++++++++ vllm/v1/spec_decode/metrics.py | 6 +- 10 files changed, 543 insertions(+), 28 deletions(-) create mode 100644 examples/offline_inference/metrics.py create mode 100644 tests/v1/test_metrics_reader.py create mode 100644 vllm/v1/metrics/reader.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 66e2e3312337..80a5a610c8ac 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -222,6 +222,7 @@ steps: - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_metrics_reader.py # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. 
- pytest -v -s v1/e2e diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index 3dd9e5464641..606ce7799a88 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -6,6 +6,7 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams +from vllm.v1.metrics.reader import Counter, Vector def load_prompts(dataset_path, num_prompts): @@ -105,30 +106,33 @@ def main(): print(f"generated text: {output.outputs[0].text}") print("-" * 50) - if not hasattr(outputs, "metrics") or outputs.metrics is None: + try: + metrics = llm.get_metrics() + except AssertionError: + print("Metrics are not supported in the V0 engine.") return - # calculate the average number of accepted tokens per forward pass, +1 is - # to account for the token from the target model that's always going to be - # accepted - acceptance_counts = [0] * (args.num_spec_tokens + 1) - for output in outputs: - for step, count in enumerate(output.metrics.spec_token_acceptance_counts): - acceptance_counts[step] += count + num_drafts = num_accepted = 0 + acceptance_counts = [0] * args.num_spec_tokens + for metric in metrics: + if metric.name == "vllm:spec_decode_num_drafts": + assert isinstance(metric, Counter) + num_drafts += metric.value + elif metric.name == "vllm:spec_decode_num_accepted_tokens": + assert isinstance(metric, Counter) + num_accepted += metric.value + elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos": + assert isinstance(metric, Vector) + for pos in range(len(metric.values)): + acceptance_counts[pos] += metric.values[pos] print("-" * 50) - print( - f"mean acceptance length (including bonus tokens): \ - {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}" - ) + print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}") print("-" * 50) # print acceptance at each token position for i in range(len(acceptance_counts)): - print( - f"acceptance at token {i}:" - f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}" - ) + print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}") if __name__ == "__main__": diff --git a/examples/offline_inference/metrics.py b/examples/offline_inference/metrics.py new file mode 100644 index 000000000000..7927f758cb57 --- /dev/null +++ b/examples/offline_inference/metrics.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams +from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + +def main(): + # Create an LLM. + llm = LLM(model="facebook/opt-125m", disable_log_stats=False) + + # Generate texts from the prompts. + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. 
+ print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + # Dump all metrics + for metric in llm.get_metrics(): + if isinstance(metric, Gauge): + print(f"{metric.name} (gauge) = {metric.value}") + elif isinstance(metric, Counter): + print(f"{metric.name} (counter) = {metric.value}") + elif isinstance(metric, Vector): + print(f"{metric.name} (vector) = {metric.values}") + elif isinstance(metric, Histogram): + print(f"{metric.name} (histogram)") + print(f" sum = {metric.sum}") + print(f" count = {metric.count}") + for bucket_le, value in metric.buckets.items(): + print(f" {bucket_le} = {value}") + + +if __name__ == "__main__": + main() diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index cefb89eb652b..e77916f95823 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -6,6 +6,7 @@ import pytest from vllm import LLM, SamplingParams +from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector MODEL = "facebook/opt-125m" DTYPE = "half" @@ -97,3 +98,67 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: raise AssertionError( f"{len(completion_counts)} unique completions; expected" f" {n}. Repeats: {repeats}") + + +def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): + max_tokens = 100 + # Use spec decoding to test num_accepted_tokens_per_pos + speculative_config = { + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": 5, + } + monkeypatch.setenv("VLLM_USE_V1", "1") + with vllm_runner( + MODEL, + speculative_config=speculative_config, + disable_log_stats=False, + ) as vllm_model: + model: LLM = vllm_model.model + sampling_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens) + outputs = model.generate(example_prompts, sampling_params) + + n_prompts = len(example_prompts) + assert len(outputs) == n_prompts + + total_tokens = 0 + for out in outputs: + assert len(out.outputs) == 1 + total_tokens += len(out.outputs[0].token_ids) + assert total_tokens == max_tokens * n_prompts + + metrics = model.get_metrics() + + def find_metric(name) -> list[Metric]: + found = [] + for metric in metrics: + if metric.name == name: + found.append(metric) + return found + + num_requests_running = find_metric("vllm:num_requests_running") + assert len(num_requests_running) == 1 + assert isinstance(num_requests_running[0], Gauge) + assert num_requests_running[0].value == .0 + + generation_tokens = find_metric("vllm:generation_tokens") + assert len(generation_tokens) == 1 + assert isinstance(generation_tokens[0], Counter) + assert generation_tokens[0].value == total_tokens + + request_generation_tokens = find_metric( + "vllm:request_generation_tokens") + assert len(request_generation_tokens) == 1 + assert isinstance(request_generation_tokens[0], Histogram) + assert "+Inf" in request_generation_tokens[0].buckets + assert request_generation_tokens[0].buckets["+Inf"] == n_prompts + assert request_generation_tokens[0].count == n_prompts + assert request_generation_tokens[0].sum == total_tokens + + num_accepted_tokens_per_pos = find_metric( + "vllm:spec_decode_num_accepted_tokens_per_pos") + assert len(num_accepted_tokens_per_pos) == 1 + assert isinstance(num_accepted_tokens_per_pos[0], Vector) + assert len(num_accepted_tokens_per_pos[0].values) == 5 diff --git a/tests/v1/test_metrics_reader.py 
b/tests/v1/test_metrics_reader.py new file mode 100644 index 000000000000..68539c80b59c --- /dev/null +++ b/tests/v1/test_metrics_reader.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 + +import prometheus_client +import pytest + +from vllm.v1.metrics.reader import (Counter, Gauge, Histogram, Vector, + get_metrics_snapshot) + + +@pytest.fixture(autouse=True) +def test_registry(monkeypatch): + # Use a custom registry for tests + test_registry = prometheus_client.CollectorRegistry(auto_describe=True) + monkeypatch.setattr("vllm.v1.metrics.reader.REGISTRY", test_registry) + return test_registry + + +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_gauge_metric(test_registry, num_engines): + g = prometheus_client.Gauge("vllm:test_gauge", + "Test gauge metric", + labelnames=["model", "engine_index"], + registry=test_registry) + for i in range(num_engines): + g.labels(model="foo", engine_index=str(i)).set(98.5) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Gauge) + assert m.name == "vllm:test_gauge" + assert m.value == 98.5 + assert m.labels["model"] == "foo" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) + + +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_counter_metric(test_registry, num_engines): + c = prometheus_client.Counter("vllm:test_counter", + "Test counter metric", + labelnames=["model", "engine_index"], + registry=test_registry) + for i in range(num_engines): + c.labels(model="bar", engine_index=str(i)).inc(19) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Counter) + assert m.name == "vllm:test_counter" + assert m.value == 19 + assert m.labels["model"] == "bar" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) + + +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_histogram_metric(test_registry, num_engines): + h = prometheus_client.Histogram("vllm:test_histogram", + "Test histogram metric", + labelnames=["model", "engine_index"], + buckets=[10, 20, 30, 40, 50], + registry=test_registry) + for i in range(num_engines): + hist = h.labels(model="blaa", engine_index=str(i)) + hist.observe(42) + hist.observe(21) + hist.observe(7) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Histogram) + assert m.name == "vllm:test_histogram" + assert m.count == 3 + assert m.sum == 70 + assert m.buckets["10.0"] == 1 + assert m.buckets["20.0"] == 1 + assert m.buckets["30.0"] == 2 + assert m.buckets["40.0"] == 2 + assert m.buckets["50.0"] == 3 + assert m.labels["model"] == "blaa" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) + + +@pytest.mark.parametrize("num_engines", [1, 4]) +def test_vector_metric(test_registry, num_engines): + c = prometheus_client.Counter( + "vllm:spec_decode_num_accepted_tokens_per_pos", + "Vector-like counter metric", + labelnames=["position", "model", "engine_index"], + registry=test_registry) + for i in range(num_engines): + c.labels(position="0", model="llama", engine_index=str(i)).inc(10) + c.labels(position="1", model="llama", engine_index=str(i)).inc(5) + c.labels(position="2", model="llama", 
engine_index=str(i)).inc(1) + + metrics = get_metrics_snapshot() + assert len(metrics) == num_engines + engine_labels = [str(i) for i in range(num_engines)] + for m in metrics: + assert isinstance(m, Vector) + assert m.name == "vllm:spec_decode_num_accepted_tokens_per_pos" + assert m.values == [10, 5, 1] + assert m.labels["model"] == "llama" + assert m.labels["engine_index"] in engine_labels + engine_labels.remove(m.labels["engine_index"]) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1c7bd65053f9..59cc44eb0e18 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -4,7 +4,8 @@ import warnings from collections.abc import Sequence from contextlib import contextmanager -from typing import Any, Callable, ClassVar, Optional, Union, cast, overload +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union, + cast, overload) import cloudpickle import torch.nn as nn @@ -47,6 +48,9 @@ from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs, is_list_of) +if TYPE_CHECKING: + from vllm.v1.metrics.reader import Metric + logger = init_logger(__name__) _R = TypeVar("_R", default=Any) @@ -1294,6 +1298,20 @@ def wake_up(self, tags: Optional[list[str]] = None): """ self.llm_engine.wake_up(tags) + def get_metrics(self) -> list["Metric"]: + """Return a snapshot of aggregated metrics from Prometheus. + + Returns: + A ``MetricSnapshot`` instance capturing the current state + of all aggregated metrics from Prometheus. + + Note: + This method is only available with the V1 LLM engine. + """ + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + assert isinstance(self.llm_engine, V1LLMEngine) + return self.llm_engine.get_metrics() + # LEGACY def _convert_v1_inputs( self, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 112896d6c767..c856e2645a2c 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -27,7 +27,10 @@ from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import StatLoggerFactory +from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, + StatLoggerFactory) +from vllm.v1.metrics.reader import Metric, get_metrics_snapshot +from vllm.v1.metrics.stats import IterationStats logger = init_logger(__name__) @@ -64,6 +67,11 @@ def __init__( self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config + self.log_stats = log_stats + self.stat_logger: Optional[StatLoggerBase] = None + if self.log_stats: + self.stat_logger = PrometheusStatLogger(vllm_config) + # important: init dp group before init the engine_core # In the decoupled engine case this is handled in EngineCoreProc. parallel_config = vllm_config.parallel_config @@ -86,7 +94,7 @@ def __init__( # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). self.output_processor = OutputProcessor(self.tokenizer, - log_stats=False) + log_stats=self.log_stats) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( @@ -94,7 +102,7 @@ def __init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=False, # FIXME: implement + log_stats=self.log_stats, ) if not multiprocess_mode: @@ -223,12 +231,21 @@ def step(self) -> list[RequestOutput]: outputs = self.engine_core.get_output() # 2) Process EngineCoreOutputs. 
+ iteration_stats = IterationStats() if self.log_stats else None processed_outputs = self.output_processor.process_outputs( - outputs.outputs) + outputs.outputs, + engine_core_timestamp=outputs.timestamp, + iteration_stats=iteration_stats) # 3) Abort any reqs that finished due to stop strings. self.engine_core.abort_requests(processed_outputs.reqs_to_abort) + # 4) Record stats + if self.stat_logger is not None: + assert outputs.scheduler_stats is not None + self.stat_logger.record(scheduler_stats=outputs.scheduler_stats, + iteration_stats=iteration_stats) + return processed_outputs.request_outputs def get_vllm_config(self): @@ -260,6 +277,10 @@ def wake_up(self, tags: Optional[list[str]] = None): def is_sleeping(self) -> bool: return self.engine_core.is_sleeping() + def get_metrics(self) -> list[Metric]: + assert self.log_stats, "Stat logging disabled" + return get_metrics_snapshot() + def get_tokenizer_group(self) -> TokenizerGroup: if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 2b75a3a2ecbd..3dc2f77444f6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -200,24 +200,24 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): # Counters # self.counter_num_preempted_reqs = self._counter_cls( - name="vllm:num_preemptions_total", + name="vllm:num_preemptions", documentation="Cumulative number of preemption from the engine.", labelnames=labelnames).labels(*labelvalues) self.counter_prompt_tokens = self._counter_cls( - name="vllm:prompt_tokens_total", + name="vllm:prompt_tokens", documentation="Number of prefill tokens processed.", labelnames=labelnames).labels(*labelvalues) self.counter_generation_tokens = self._counter_cls( - name="vllm:generation_tokens_total", + name="vllm:generation_tokens", documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) self.counter_request_success: dict[FinishReason, prometheus_client.Counter] = {} counter_request_success_base = self._counter_cls( - name="vllm:request_success_total", + name="vllm:request_success", documentation="Count of successfully processed requests.", labelnames=labelnames + ["finished_reason"]) for reason in FinishReason: diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py new file mode 100644 index 000000000000..5ab78129a009 --- /dev/null +++ b/vllm/v1/metrics/reader.py @@ -0,0 +1,245 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from typing import Optional + +from prometheus_client import REGISTRY +from prometheus_client import Metric as PromMetric +from prometheus_client.samples import Sample + + +@dataclass +class Metric: + """A base class for prometheus metrics. + + Each metric may be associated with key=value labels, and + in some cases a single vLLM instance may have multiple + metrics with the same name but different sets of labels. + """ + name: str + labels: dict[str, str] + + +@dataclass +class Counter(Metric): + """A monotonically increasing integer counter.""" + value: int + + +@dataclass +class Vector(Metric): + """An ordered array of integer counters. + + This type - which doesn't exist in Prometheus - models one very + specific metric, vllm:spec_decode_num_accepted_tokens_per_pos. 
+    """
+    values: list[int]
+
+
+@dataclass
+class Gauge(Metric):
+    """A numerical value that can go up or down."""
+    value: float
+
+
+@dataclass
+class Histogram(Metric):
+    """Observations recorded in configurable buckets.
+
+    Buckets are represented by a dictionary. The key is
+    the upper limit of the bucket, and the value is the
+    observed count in that bucket. A '+Inf' key always
+    exists.
+
+    The count property is the total count across all
+    buckets, identical to the count of the '+Inf' bucket.
+
+    The sum property is the total sum of all observed
+    values.
+    """
+    count: int
+    sum: float
+    buckets: dict[str, int]
+
+
+def get_metrics_snapshot() -> list[Metric]:
+    """An API for accessing in-memory Prometheus metrics.
+
+    Example:
+        >>> for metric in llm.get_metrics():
+        ...     if isinstance(metric, Counter):
+        ...         print(f"{metric} = {metric.value}")
+        ...     elif isinstance(metric, Gauge):
+        ...         print(f"{metric} = {metric.value}")
+        ...     elif isinstance(metric, Histogram):
+        ...         print(f"{metric}")
+        ...         print(f"    sum = {metric.sum}")
+        ...         print(f"    count = {metric.count}")
+        ...         for bucket_le, value in metric.buckets.items():
+        ...             print(f"    {bucket_le} = {value}")
+    """
+    collected: list[Metric] = []
+    for metric in REGISTRY.collect():
+        if not metric.name.startswith("vllm:"):
+            continue
+        if metric.type == "gauge":
+            samples = _get_samples(metric)
+            for s in samples:
+                collected.append(
+                    Gauge(name=metric.name, labels=s.labels, value=s.value))
+        elif metric.type == "counter":
+            samples = _get_samples(metric, "_total")
+            if metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+                #
+                # Ugly vllm:num_accepted_tokens_per_pos special case.
+                #
+                # This metric is a vector of counters - for each spec
+                # decoding token position, we observe the number of
+                # accepted tokens using a Counter labeled with 'position'.
+                # We convert these into a vector of integer values.
+                #
+                for labels, values in _digest_num_accepted_by_pos_samples(
+                        samples):
+                    collected.append(
+                        Vector(name=metric.name, labels=labels, values=values))
+            else:
+                for s in samples:
+                    collected.append(
+                        Counter(name=metric.name,
+                                labels=s.labels,
+                                value=int(s.value)))
+
+        elif metric.type == "histogram":
+            #
+            # A histogram has a number of '_bucket' samples where
+            # the 'le' label represents the upper limit of the bucket.
+            # We convert these bucketized values into a dict of values
+            # indexed by the value of the 'le' label. The 'le=+Inf'
+            # label is a special case, catching all values observed.
+ # + bucket_samples = _get_samples(metric, "_bucket") + count_samples = _get_samples(metric, "_count") + sum_samples = _get_samples(metric, "_sum") + for labels, buckets, count_value, sum_value in _digest_histogram( + bucket_samples, count_samples, sum_samples): + collected.append( + Histogram(name=metric.name, + labels=labels, + buckets=buckets, + count=count_value, + sum=sum_value)) + else: + raise AssertionError(f"Unknown metric type {metric.type}") + + return collected + + +def _get_samples(metric: PromMetric, + suffix: Optional[str] = None) -> list[Sample]: + name = (metric.name + suffix) if suffix is not None else metric.name + return [s for s in metric.samples if s.name == name] + + +def _strip_label(labels: dict[str, str], key_to_remove: str) -> dict[str, str]: + labels_copy = labels.copy() + labels_copy.pop(key_to_remove) + return labels_copy + + +def _digest_histogram( + bucket_samples: list[Sample], count_samples: list[Sample], + sum_samples: list[Sample] +) -> list[tuple[dict[str, str], dict[str, int], int, float]]: + # + # In the case of DP, we have an indigestable + # per-bucket-per-engine count as a list of labelled + # samples, along with total and sum samples + # + # bucket_samples (in): + # labels = {bucket: 100, idx: 0}, value = 2 + # labels = {bucket: 200, idx: 0}, value = 4 + # labels = {bucket: Inf, idx: 0}, value = 10 + # labels = {bucket: 100, idx: 1}, value = 1 + # labels = {bucket: 200, idx: 2}, value = 5 + # labels = {bucket: Inf, idx: 3}, value = 7 + # count_samples (in): + # labels = {idx: 0}, value = 10 + # labels = {idx: 1}, value = 7 + # sum_samples (in): + # labels = {idx: 0}, value = 2000 + # labels = {idx: 1}, value = 1200 + # + # output: [ + # {idx: 0}, {"100": 2, "200": 4, "Inf": 10}, 10, 2000 + # {idx: 1}, {"100": 1, "200": 5, "Inf": 7}, 7, 1200 + # ] + buckets_by_labels: dict[frozenset[tuple[str, str]], dict[str, int]] = {} + for s in bucket_samples: + bucket = s.labels["le"] + labels_key = frozenset(_strip_label(s.labels, "le").items()) + if labels_key not in buckets_by_labels: + buckets_by_labels[labels_key] = {} + buckets_by_labels[labels_key][bucket] = int(s.value) + + counts_by_labels: dict[frozenset[tuple[str, str]], int] = {} + for s in count_samples: + labels_key = frozenset(s.labels.items()) + counts_by_labels[labels_key] = int(s.value) + + sums_by_labels: dict[frozenset[tuple[str, str]], float] = {} + for s in sum_samples: + labels_key = frozenset(s.labels.items()) + sums_by_labels[labels_key] = s.value + + assert set(buckets_by_labels.keys()) == set( + counts_by_labels.keys()) == set(sums_by_labels.keys()) + + output = [] + label_keys = list(buckets_by_labels.keys()) + for k in label_keys: + labels = dict(k) + output.append((labels, buckets_by_labels[k], counts_by_labels[k], + sums_by_labels[k])) + return output + + +def _digest_num_accepted_by_pos_samples( + samples: list[Sample]) -> list[tuple[dict[str, str], list[int]]]: + # + # In the case of DP, we have an indigestable + # per-position-per-engine count as a list of + # labelled samples + # + # samples (in): + # labels = {pos: 0, idx: 0}, value = 10 + # labels = {pos: 1, idx: 0}, value = 7 + # labels = {pos: 2, idx: 0}, value = 2 + # labels = {pos: 0, idx: 1}, value = 5 + # labels = {pos: 1, idx: 1}, value = 3 + # labels = {pos: 2, idx: 1}, value = 1 + # + # output: [ + # {idx: 0}, [10, 7, 2] + # {idx: 1}, [5, 3, 1] + # ] + # + max_pos = 0 + values_by_labels: dict[frozenset[tuple[str, str]], dict[int, int]] = {} + + for s in samples: + position = int(s.labels["position"]) + max_pos = 
max(max_pos, position) + + labels_key = frozenset(_strip_label(s.labels, "position").items()) + if labels_key not in values_by_labels: + values_by_labels[labels_key] = {} + values_by_labels[labels_key][position] = int(s.value) + + output = [] + for labels_key, values_by_position in values_by_labels.items(): + labels = dict(labels_key) + values = [0] * (max_pos + 1) + for pos, val in values_by_position.items(): + values[pos] = val + output.append((labels, values)) + return output diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index 899aa9200e85..36091bef2895 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -134,17 +134,17 @@ def __init__( self.counter_spec_decode_num_drafts = \ self._counter_cls( - name="vllm:spec_decode_num_drafts_total", + name="vllm:spec_decode_num_drafts", documentation="Number of spec decoding drafts.", labelnames=labelnames).labels(*labelvalues) self.counter_spec_decode_num_draft_tokens = \ self._counter_cls( - name="vllm:spec_decode_num_draft_tokens_total", + name="vllm:spec_decode_num_draft_tokens", documentation="Number of draft tokens.", labelnames=labelnames,).labels(*labelvalues) self.counter_spec_decode_num_accepted_tokens = \ self._counter_cls( - name="vllm:spec_decode_num_accepted_tokens_total", + name="vllm:spec_decode_num_accepted_tokens", documentation="Number of accepted tokens.", labelnames=labelnames).labels(*labelvalues) From aaa4ac1c95aaf70afab51582c56d80554a21bbd0 Mon Sep 17 00:00:00 2001 From: cascade <cascade812@outlook.com> Date: Tue, 27 May 2025 05:06:34 -0700 Subject: [PATCH 189/192] Disable prefix cache by default for benchmark (#18639) Signed-off-by: cascade812 <cascade812@outlook.com> --- benchmarks/benchmark_latency.py | 3 +++ vllm/benchmarks/latency.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d5aaceeb8c9c..84759c5c354d 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -189,5 +189,8 @@ def run_to_completion(profile_dir: Optional[str] = None): ) parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. + parser.set_defaults(enable_prefix_caching=False) args = parser.parse_args() main(args) diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index 06f6848f50cb..2c992727b139 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -80,6 +80,9 @@ def add_cli_args(parser: argparse.ArgumentParser): ) parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. 
+    parser.set_defaults(enable_prefix_caching=False)
 
 
 def main(args: argparse.Namespace):

From 6b6d4961147220fb80f9cc7dcb74db478f9c9a23 Mon Sep 17 00:00:00 2001
From: chunxiaozheng <55471457+chunxiaozheng@users.noreply.github.com>
Date: Tue, 27 May 2025 21:08:44 +0800
Subject: [PATCH 190/192] optimize get_kv_cache_torch_dtype (#18531)

Signed-off-by: idellzheng <idellzheng@tencent.com>
---
 vllm/utils.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index 7222a3c99102..846df7743736 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -759,16 +759,15 @@ def get_kv_cache_torch_dtype(
         model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype:
     if isinstance(cache_dtype, str):
         if cache_dtype == "auto":
-            if isinstance(model_dtype, str):
+            if isinstance(model_dtype,
+                          str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE:
                 torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
             elif isinstance(model_dtype, torch.dtype):
                 torch_dtype = model_dtype
             else:
                 raise ValueError(f"Invalid model dtype: {model_dtype}")
-        elif cache_dtype in ["half", "bfloat16", "float"]:
+        elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE:
             torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
-        elif cache_dtype == "fp8":
-            torch_dtype = torch.uint8
         else:
             raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
     elif isinstance(cache_dtype, torch.dtype):

From 696259ca0180c4357cf437a334aaf0966be5cb4b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 27 May 2025 23:45:48 +0800
Subject: [PATCH 191/192] [Core] Automatically cast multi-modal input dtype (#18756)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/deepseek_vl2.py | 4 +---
 vllm/model_executor/models/gemma3_mm.py | 5 -----
 vllm/multimodal/inputs.py | 8 +++++++-
 vllm/spec_decode/draft_model_runner.py | 7 +++++--
 vllm/v1/worker/gpu_model_runner.py | 12 +++++++++---
 vllm/v1/worker/tpu_model_runner.py | 14 ++++++++++----
 vllm/worker/cpu_enc_dec_model_runner.py | 7 +++++--
 vllm/worker/cpu_model_runner.py | 5 ++++-
 vllm/worker/cpu_pooling_model_runner.py | 7 +++++--
 vllm/worker/enc_dec_model_runner.py | 10 +++++++---
 vllm/worker/model_runner.py | 7 +++++--
 vllm/worker/multi_step_neuron_model_runner.py | 7 +++++--
 ...ulti_step_neuronx_distributed_model_runner.py | 7 +++++--
 vllm/worker/neuron_model_runner.py | 16 ++++++++++------
 vllm/worker/pooling_model_runner.py | 10 +++++++---
 vllm/worker/xpu_model_runner.py | 9 ++++++---
 16 files changed, 91 insertions(+), 44 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 164fa40ffebe..5c8793f59ffb 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -210,9 +210,7 @@ def _call_hf_processor(
                 dict(prompt=prompt, **mm_data),
                 mm_kwargs,
             )
-            target_dtype = self.info.ctx.model_config.dtype
-            pixel_values = processed_outputs.pop("pixel_values").to(
-                target_dtype)
+            pixel_values = processed_outputs["pixel_values"]
             # split pixel values into patches corresponding to each image
             images_spatial_crop = processed_outputs["images_spatial_crop"]
             patches_per_image = [
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 00a972d33b04..182cc86d3ca8 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -263,11 +263,6 @@ def _call_hf_processor(
             mm_data,
             mm_kwargs,
         )
-        if "pixel_values" in processed_outputs:
-            # Cast pixel values 
to model dtype already here, - # so we need to transfer less data to the GPU - processed_outputs["pixel_values"] = processed_outputs[ - "pixel_values"].to(self.info.ctx.model_config.dtype) # HF processor pops the `num_crops` kwarg, which is needed by vLLM if (images := mm_data.get("images")) is not None: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 162dd52e3e73..600a34d39ef6 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -746,11 +746,17 @@ def as_kwargs( batched_inputs: BatchedTensorInputs, *, device: torch.types.Device, + dtype: Optional[torch.dtype] = None, ) -> BatchedTensorInputs: json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) + def maybe_cast_dtype(x: torch.Tensor): + # This mimics the behavior of transformers.BatchFeature + return x.to(dtype=dtype) if x.is_floating_point() else x + json_mapped = json_map_leaves( - lambda x: x.to(device, non_blocking=True), + # NOTE: Cast the dtype before sending it to device + lambda x: maybe_cast_dtype(x).to(device=device, non_blocking=True), json_inputs, ) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index a6276c563394..991d2040a878 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -294,8 +294,11 @@ def execute_model( inputs_embeds=None, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + dtype=self.model_runner.model_config.dtype, + device=self.device, + ), **model_execute_kwargs, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index aa47ac253bb9..910c0e80bb31 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -929,8 +929,11 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): encoder_outputs = [] for grouped_mm_inputs in grouped_mm_inputs_list: batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) - batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, - device=self.device) + batched_mm_inputs = MultiModalKwargs.as_kwargs( + batched_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) # Run the encoder. # `curr_group_outputs` is either of the following: @@ -1874,7 +1877,10 @@ def profile_run(self) -> None: batched_dummy_mm_inputs = MultiModalKwargs.batch( [dummy_mm_kwargs] * max_num_mm_items) batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( - batched_dummy_mm_inputs, device=self.device) + batched_dummy_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) # Run multimodal encoder. dummy_encoder_outputs = self.model.get_multimodal_embeddings( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index b13ff9f97e6f..46bcf64ed0c3 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -652,8 +652,11 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): encoder_outputs = [] for grouped_mm_inputs in grouped_mm_inputs_list: batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) - batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, - device=self.device) + batched_mm_inputs = MultiModalKwargs.as_kwargs( + batched_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) # Run the encoder. 
# `curr_group_outputs` is either of the following: @@ -1435,8 +1438,11 @@ def _get_mm_dummy_batch(self, modality: str, batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] * batch_size) - return MultiModalKwargs.as_kwargs(batched_dummy_mm_inputs, - device=self.device) + return MultiModalKwargs.as_kwargs( + batched_dummy_mm_inputs, + dtype=self.model_config.dtype, + device=self.device, + ) def _get_req_paddings(min_req_size: int, max_req_size: int) -> list[int]: diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index c2120c035175..82eeeb570d22 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -297,8 +297,11 @@ def execute_model( model_input.encoder_input_tokens, "encoder_positions": model_input.encoder_input_positions, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), "intermediate_tensors": intermediate_tensors, } diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 710ca1a13b0c..fb436a079f87 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -628,7 +628,10 @@ def execute_model( multimodal_kwargs = {} if model_input.multi_modal_kwargs is not None: multimodal_kwargs = MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs, device=self.device) + model_input.multi_modal_kwargs, + dtype=self.model_config.dtype, + device=self.device, + ) execute_model_kwargs = {} if previous_hidden_states is not None: execute_model_kwargs.update( diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 1ceb2557c6b3..2a60e51261ad 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -50,8 +50,11 @@ def execute_model( model_input.input_tokens, "positions": model_input.input_positions, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), **cross_enc_kwargs, "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 4864163b0de2..3957e5608524 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -202,9 +202,13 @@ def execute_model( encoder_input_ids=model_input.encoder_input_tokens, encoder_positions=model_input.encoder_input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + dtype=self.model_config.dtype, + device=self.device, + ), + **seqlen_agnostic_kwargs, + ) logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 53e79adf9aae..8c968faa7810 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1845,8 +1845,11 @@ def execute_model( inputs_embeds=model_input.inputs_embeds, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + 
dtype=self.model_config.dtype, + device=self.device, + ), **seqlen_agnostic_kwargs, **model_kwargs, ) diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py index 9618a4b49ff8..aafb7ab7cfb8 100644 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ b/vllm/worker/multi_step_neuron_model_runner.py @@ -70,8 +70,11 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) output = self.model.sample( diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py index b6a3492a493b..3a9c0993e004 100644 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py @@ -49,8 +49,11 @@ def execute_model( positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) output = self.model.sample( diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index e97adf757cc1..968596471a26 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -378,9 +378,11 @@ def execute_model( positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs - or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) elif current_platform.use_transformers_neuronx(): # [TODO] validate on-device sampling @@ -389,9 +391,11 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs - or {}, - device=self.device), + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), ) # Compute the logits only if the on-device sampling is turned off as diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index fdb7353f2f9c..912e04c435f5 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -119,10 +119,14 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), + **MultiModalKwargs.as_kwargs( + multi_modal_kwargs, + dtype=self.model_config.dtype, + device=self.device, + ), **cross_enc_kwargs, - **seqlen_agnostic_kwargs) + **seqlen_agnostic_kwargs, + ) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 7042b575aa78..79fa7d2c73e8 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ 
-562,9 +562,12 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs - or {}, - device=self.device)) + **MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs or {}, + dtype=self.model_config.dtype, + device=self.device, + ), + ) # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: return hidden_or_intermediate_states From 58738772410c5e0d60b61db39538a9b313d2d7ad Mon Sep 17 00:00:00 2001 From: Michael Goin <mgoin64@gmail.com> Date: Tue, 27 May 2025 12:05:37 -0400 Subject: [PATCH 192/192] [Bugfix] Mistral tool calling when content is list (#18729) Signed-off-by: mgoin <mgoin64@gmail.com> --- tests/tokenization/test_mistral_tokenizer.py | 116 +++++++++++++++++- vllm/transformers_utils/tokenizers/mistral.py | 6 +- 2 files changed, 115 insertions(+), 7 deletions(-) diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index f1c880286951..b16d9af35be9 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -1,15 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from mistral_common.protocol.instruct.messages import UserMessage +from mistral_common.protocol.instruct.messages import (AssistantMessage, + ToolMessage, + UserMessage) from mistral_common.protocol.instruct.request import ChatCompletionRequest -from mistral_common.protocol.instruct.tool_calls import Function, Tool +from mistral_common.protocol.instruct.tool_calls import (Function, + FunctionCall, Tool, + ToolCall) from vllm.transformers_utils.tokenizers.mistral import ( make_mistral_chat_completion_request) -# yapf: enable @pytest.mark.parametrize( "openai_request,expected_mistral_request", [( @@ -78,6 +81,107 @@ ) def test_make_mistral_chat_completion_request(openai_request, expected_mistral_request): - assert (make_mistral_chat_completion_request( - openai_request["messages"], - openai_request["tools"]) == expected_mistral_request) + actual_request = make_mistral_chat_completion_request( + openai_request["messages"], openai_request["tools"]) + assert actual_request == expected_mistral_request + + +# Tool use with list content and reasoning_content +@pytest.mark.parametrize("openai_request,expected_mistral_request", [( + { + "messages": [ + { + "role": "user", + "content": "What's the weather in Paris?", + }, + { + "role": + "assistant", + "reasoning_content": + None, + "content": + None, + "tool_calls": [{ + "id": "call123", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"city": "Paris"}', + }, + }], + }, + { + "role": "tool", + "content": [{ + "type": "text", + "text": "Rainy" + }], + "name": "get_weather", + "tool_call_id": "call123", + }, + ], + "tools": [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Gets the current weather in a city.", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + } + }, + "required": ["city"], + }, + }, + }], + }, + ChatCompletionRequest( + messages=[ + UserMessage(content="What's the weather in Paris?"), + AssistantMessage( + content=None, + tool_calls=[ + ToolCall( + id="call123", + function=FunctionCall( + name="get_weather", + arguments='{"city": "Paris"}', + ), + ) + ], + ), + ToolMessage( + content="Rainy", + tool_call_id="call123", + name="get_weather", + ), + 
], + tools=[ + Tool( + type="function", + function=Function( + name="get_weather", + description="Gets the current weather in a city.", + parameters={ + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city name" + } + }, + "required": ["city"], + }, + ), + ) + ], + ), +)]) +def test_make_mistral_chat_completion_request_list_content( + openai_request, expected_mistral_request): + actual_request = make_mistral_chat_completion_request( + openai_request["messages"], openai_request["tools"]) + assert actual_request == expected_mistral_request diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 05de6a603655..23b6f67f09df 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -156,7 +156,11 @@ def make_mistral_chat_completion_request( # # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80 for message in messages: - if message.get("role") == "assistant": + # Remove reasoning_content as unsupported by Mistral + _ = message.pop("reasoning_content", None) # type: ignore + + # Convert list text content to string + if message.get("role") in ("assistant", "tool"): content = message.get("content") if isinstance(content, list): content = "\n".join(chunk.get("text") for chunk in content)