diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 1a794af572fef..9a793fffade82 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -25,10 +25,10 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+        pip install ruff==0.5.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
     - name: Analysing the code with ruff
       run: |
-        ruff .
+        ruff check .
     - name: Spelling check with codespell
       run: |
         codespell --toml pyproject.toml
diff --git a/format.sh b/format.sh
index a8fd95a1ea445..07ea45c22efb0 100755
--- a/format.sh
+++ b/format.sh
@@ -161,7 +161,7 @@ echo 'vLLM codespell: Done'
 
 # Lint specified files
 lint() {
-    ruff "$@"
+    ruff check "$@"
 }
 
 # Lint files that differ from main branch. Ignores dirs that are not slated
@@ -177,7 +177,7 @@ lint_changed() {
 
     if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
         git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-            ruff
+            ruff check
     fi
 
 }
diff --git a/requirements-lint.txt b/requirements-lint.txt
index bd34227d3e824..e16767403299c 100644
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@@ -2,7 +2,7 @@
 yapf==0.32.0
 toml==0.10.2
 tomli==2.0.1
-ruff==0.1.5
+ruff==0.5.5
 codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5
diff --git a/tests/conftest.py b/tests/conftest.py
index c7a349f1e9e2a..3212d4307d50e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -98,11 +98,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
     This can provide a ~10x speedup for non-GPU unit tests since they don't need
     to initialize torch.
     """
-
-    if request.node.get_closest_marker("skip_global_cleanup"):
-        return False
-
-    return True
+    return request.node.get_closest_marker("skip_global_cleanup") is None
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 0bcae5b0c96dc..1c03c1515b091 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -64,11 +64,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
     This can provide a ~10x speedup for non-GPU unit tests since they don't need
     to initialize torch.
     """
-
-    if request.node.get_closest_marker("skip_global_cleanup"):
-        return False
-
-    return True
+    return request.node.get_closest_marker("skip_global_cleanup") is None
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index f9f246436c0f7..b46df7108f9cb 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -165,7 +165,7 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
     test_name = request.node.name
 
     model = kwargs["model"]
-    draft_model = kwargs.get("speculative_model", None)
+    draft_model = kwargs.get("speculative_model")
     same_draft_target_model = (draft_model is not None
                                and draft_model == model)
 
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index fe413d1228021..3576a4834ebc3 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -66,8 +66,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
 
             hashes.append([])
             prompts = [prefix + prompt for prompt in sample_prompts]
-            seq_id = 0
-            for prompt in prompts:
+            for seq_id, prompt in enumerate(prompts):
                 hashes[-1].append([])
                 prompt_token_ids = tokenizer.encode(prompt)
                 seq = Sequence(seq_id,
@@ -83,8 +82,6 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
                 for idx in range(num_blocks):
                     hashes[-1][-1].append(seq.hash_of_block(idx))
 
-                seq_id += 1
-
     # Check that hashes made with two prefixes with different first blocks are
     # different everywhere.
     for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])):
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 52aa73761fd68..7004c64fd5fac 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -110,7 +110,7 @@ def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist():
     configuration occurs."""
     with pytest.raises(RuntimeError) as ex_info:
         _configure_vllm_root_logger()
-    assert ex_info.type == RuntimeError
+    assert ex_info.type is RuntimeError
     assert "File does not exist" in str(ex_info)
 
 
@@ -151,7 +151,7 @@ def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json(
                 logging_config_file.name):
         with pytest.raises(ValueError) as ex_info:
             _configure_vllm_root_logger()
-        assert ex_info.type == ValueError
+        assert ex_info.type is ValueError
        assert "Invalid logging config. Expected Dict, got" in str(ex_info)
 
 
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index 4a0e2b4184936..3c23ce261461c 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -235,11 +235,6 @@ def test_prepare_decode_cuda_graph(batch_size):
     torch.allclose(input_tokens, input_positions)
 
     # Verify Sampling
-    expected_selected_token_indices = []
-    selected_token_start_idx = 0
-    for _ in context_lens:
-        expected_selected_token_indices.append(selected_token_start_idx)
-        selected_token_start_idx += 1
     sampling_metadata = SamplingMetadata.prepare(
         seq_group_metadata_list,
         seq_lens,
@@ -248,7 +243,7 @@ def test_prepare_decode_cuda_graph(batch_size):
         device=model_runner.device,
         pin_memory=model_runner.pin_memory)
     actual = sampling_metadata.selected_token_indices
-    expected = torch.tensor(expected_selected_token_indices,
+    expected = torch.arange(len(context_lens),
                             device=actual.device,
                             dtype=actual.dtype)
     torch.testing.assert_close(actual, expected)
diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py
index 6c5411f7d3d5c..1e9adca50093b 100644
--- a/vllm/adapter_commons/utils.py
+++ b/vllm/adapter_commons/utils.py
@@ -42,7 +42,7 @@ def list_adapters(registered_adapters: Dict[int, Any]) -> Dict[int, Any]:
 
 def get_adapter(adapter_id: int,
                 registered_adapters: Dict[int, Any]) -> Optional[Any]:
-    return registered_adapters.get(adapter_id, None)
+    return registered_adapters.get(adapter_id)
 
 
 ## worker functions
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index f7cb2ee996501..0b2c065a84e32 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -23,10 +23,9 @@ def is_block_tables_empty(block_tables: Union[None, Dict]):
     """
     if block_tables is None:
         return True
-    if isinstance(block_tables, dict) and all(
-            value is None for value in block_tables.values()):
-        return True
-    return False
+
+    return isinstance(block_tables, dict) and all(
+        value is None for value in block_tables.values())
 
 
 def compute_slot_mapping_start_idx(is_prompt: bool, query_len: int,
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index d102ad4045591..08b381f352b0b 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -406,9 +406,7 @@ def all_block_ids(self) -> FrozenSet[int]:
 
     def is_block_cached(self, block: Block) -> bool:
         assert block.content_hash is not None
-        if block.content_hash in self._cached_blocks:
-            return True
-        return False
+        return block.content_hash in self._cached_blocks
 
     def promote_to_immutable_block(self, block: Block) -> BlockId:
         """Once a mutable block is full, it can be promoted to an immutable
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index b48ea1b19b82a..ced23c036c4dc 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -394,9 +394,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool:
         """
         alloc_status = self._can_swap(seq_group, Device.CPU,
                                       SequenceStatus.RUNNING)
-        if alloc_status == AllocStatus.OK:
-            return True
-        return False
+        return alloc_status == AllocStatus.OK
 
     def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         """Returns the block id mapping (from GPU to CPU) generated by
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 554dcc0ed43ed..be80d901d899d 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -67,9 +67,9 @@ def __call__(self, input_ids: List[int],
         instruction = self._guide.get_next_instruction(
             state=self._fsm_state[seq_id])
 
-        if type(instruction) == Generate:
+        if isinstance(instruction, Generate):
             allowed_tokens = instruction.tokens
-        elif type(instruction) == Write:
+        elif isinstance(instruction, Write):
             # TODO: support fast forward tokens
             allowed_tokens = [instruction.tokens[0]]
         else:
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 2cc080608c7a9..144014f397672 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -110,9 +110,9 @@ def get_scaled_act_names(self) -> List[str]:
     def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]):
         # Extract data from quant config.
         quant_method = quant_config.get("quant_method", "").lower()
-        num_bits = quant_config.get("bits", None)
-        group_size = quant_config.get("group_size", None)
-        has_zp = quant_config.get("zero_point", None)
+        num_bits = quant_config.get("bits")
+        group_size = quant_config.get("group_size")
+        has_zp = quant_config.get("zero_point")
 
         if quant_method != "awq":
             return False
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 39d00bd5733ff..ed73b00ff021c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 from pydantic import BaseModel
@@ -25,7 +25,7 @@ class CompressedTensorsConfig(QuantizationConfig):
     def __init__(self,
                  target_scheme_map: Dict[str, Any],
                  ignore: List[str],
-                 quant_format: str,
+                 quant_format: Optional[str],
                  kv_cache_scheme: Optional[Dict[str, Any]] = None):
 
         self.ignore = ignore
@@ -67,8 +67,8 @@ def get_quant_method(
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
         target_scheme_map: Dict[str, Any] = dict()
-        ignore: List[str] = config.get("ignore", None)
-        quant_format: str = config.get("format", None)
+        ignore: List[str] = config.get("ignore", [])
+        quant_format: Union[str, None] = config.get("format")
 
         # The quant_config has multiple config_groups, each containing
         # an input_activations key with details about how the activations are
@@ -169,7 +169,8 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel,
         is_symmetric_activation = input_quant.symmetric
         is_per_tensor_activation = (
             input_quant.strategy == QuantizationStrategy.TENSOR)
-        if not (is_symmetric_activation and is_per_tensor_activation):
+        if not (is_symmetric_activation  # noqa: SIM103
+                and is_per_tensor_activation):
             return False
 
         # All conditions satisfied.
@@ -191,7 +192,7 @@ def _is_fp8_w8a16(self, weight_quant: BaseModel,
         is_per_tensor_or_channel_weight = (weight_quant.strategy in [
             QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
         ])
-        if not (is_symmetric_weight and is_static_weight
+        if not (is_symmetric_weight and is_static_weight  # noqa: SIM103
                 and is_per_tensor_or_channel_weight):
             return False
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 7912cbde5721f..a9559179460a9 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -80,7 +80,7 @@ class QuantizationArgs(BaseModel):
     )
 
 
-def is_activation_quantization_format(format: str) -> bool:
+def is_activation_quantization_format(format: Optional[str]) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,
         CompressionFormat.int_quantized.value,
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 4a11b14971076..45c572c3c3e47 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -119,10 +119,10 @@ def get_scaled_act_names(self) -> List[str]:
     def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]):
         # Extract data from quant config.
         quant_method = quant_config.get("quant_method", "").lower()
-        num_bits = quant_config.get("bits", None)
-        group_size = quant_config.get("group_size", None)
-        sym = quant_config.get("sym", None)
-        desc_act = quant_config.get("desc_act", None)
+        num_bits = quant_config.get("bits")
+        group_size = quant_config.get("group_size")
+        sym = quant_config.get("sym")
+        desc_act = quant_config.get("desc_act")
 
         if quant_method != "gptq":
             return False
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index b009ad8c882d4..2025e1fdeef81 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -401,9 +401,8 @@ def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
             "inferred as vLLM models, so setting vllm_tensorized=True is "
             "only necessary for models serialized prior to this change.")
         return True
-    if (".vllm_tensorized_marker" in deserializer):
-        return True
-    return False
+
+    return ".vllm_tensorized_marker" in deserializer
 
 
 def serialize_vllm_model(
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index b76a1ab4cf243..03a6af7ddf0f6 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -241,10 +241,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
             return False
 
         # TODO: Add soft-tuning prompt adapter support
-        if self.prompt_adapter_config:
-            return False
-
-        return True
+        return not self.prompt_adapter_config
 
     @torch.inference_mode()
     def execute_model(
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 9036d117041f0..e6be820fecf43 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -102,10 +102,8 @@ def _should_collect_rejsample_metrics(self, now: float) -> bool:
         if self._rank != 0:
             return False
 
-        if (now - self._last_metrics_collect_time <
-                self._rejsample_metrics_collect_interval_s):
-            return False
-        return True
+        return (now - self._last_metrics_collect_time >=
+                self._rejsample_metrics_collect_interval_s)
 
     def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
         """Copy rejection/typical-acceptance sampling metrics
diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py
index ae00af44a048a..3c3a8492a4de5 100644
--- a/vllm/triton_utils/libentry.py
+++ b/vllm/triton_utils/libentry.py
@@ -35,9 +35,9 @@ def key(self, spec_args, dns_args, const_args):
         dns_key = [
             arg.dtype if hasattr(
                 arg, "data_ptr") else type(arg) if not isinstance(arg, int)
-            else "i32" if -(2**31) <= arg and arg <= 2**31 -
-            1 else "u64" if 2**63 <= arg and arg <= 2**64 - 1 else "i64"
-            for arg in dns_args
+            else "i32" if -(2**31) <= arg and arg <= 2**31 - 1  # noqa: SIM300
+            else "u64" if 2**63 <= arg and arg <= 2**64 -  # noqa: SIM300
+            1 else "i64" for arg in dns_args
         ]
         # const args passed by position
         return tuple(spec_key + dns_key + const_args)