
Commit a9944aa

fix: typos (#18151)
Signed-off-by: omahs <73983677+omahs@users.noreply.github.com>
1 parent a8f5aec commit a9944aa

10 files changed: 18 additions & 18 deletions


csrc/attention/attention_kernels.cuh

Lines changed: 2 additions & 2 deletions
@@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
 
   // Load the query to registers.
   // Each thread in a thread group has a different part of the query.
-  // For example, if the the thread group size is 4, then the first thread in
+  // For example, if the thread group size is 4, then the first thread in
   // the group has 0, 4, 8, ... th vectors of the query, and the second thread
   // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
   // q is split from a qkv tensor, it may not be contiguous.
@@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
 
   // Load a key to registers.
   // Each thread in a thread group has a different part of the key.
-  // For example, if the the thread group size is 4, then the first thread in
+  // For example, if the thread group size is 4, then the first thread in
   // the group has 0, 4, 8, ... th vectors of the key, and the second thread
   // has 1, 5, 9, ... th vectors of the key, and so on.
   for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
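
The comments fixed above describe a strided layout: with a thread group of size 4, thread 0 handles vectors 0, 4, 8, ..., thread 1 handles 1, 5, 9, ..., and so on. A minimal sketch of that indexing with illustrative sizes (not the kernel's actual code):

# Strided per-thread vector assignment, as described in the comments above.
# THREAD_GROUP_SIZE and NUM_VECS are illustrative values.
THREAD_GROUP_SIZE = 4
NUM_VECS = 12  # vectors per query/key

for thread_idx in range(THREAD_GROUP_SIZE):
    vecs = list(range(thread_idx, NUM_VECS, THREAD_GROUP_SIZE))
    print(f"thread {thread_idx} -> vectors {vecs}")
# thread 0 -> vectors [0, 4, 8]
# thread 1 -> vectors [1, 5, 9]
# ...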

examples/offline_inference/chat_with_tools.py

Lines changed: 2 additions & 2 deletions
@@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
             "partly cloudly, with highs in the 90's.")
 
 
-tool_funtions = {"get_current_weather": get_current_weather}
+tool_functions = {"get_current_weather": get_current_weather}
 
 tools = [{
     "type": "function",
@@ -122,7 +122,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
 # above defined function
 tool_calls = json.loads(output)
 tool_answers = [
-    tool_funtions[call['name']](**call['arguments']) for call in tool_calls
+    tool_functions[call['name']](**call['arguments']) for call in tool_calls
 ]
 
 # append the answer as a tool message and let the LLM give you an answer
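
Both hunks rename the same dispatch table. The pattern the example uses is name-based dispatch: the model's JSON tool calls are routed to Python callables through the tool_functions dict. A self-contained sketch of that pattern (the stub body and sample output are illustrative, not the example's exact code):

import json

def get_current_weather(city: str, state: str, unit: str):
    # Illustrative stub standing in for the example's weather function.
    return f"The weather in {city}, {state} is 65 degrees {unit}."

tool_functions = {"get_current_weather": get_current_weather}

# Pretend the model emitted this JSON list of tool calls.
output = ('[{"name": "get_current_weather", "arguments": '
          '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]')

tool_calls = json.loads(output)
tool_answers = [
    tool_functions[call["name"]](**call["arguments"]) for call in tool_calls
]
print(tool_answers[0])  # The weather in Dallas, TX is 65 degrees fahrenheit.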

tests/lora/test_lora_huggingface.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
 
     lora_path = get_adapter_absolute_path(lora_name)
 
-    # lora loading should work for either absolute path and hugggingface id.
+    # lora loading should work for either absolute path and huggingface id.
     peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
     lora_model = LoRAModel.from_local_checkpoint(
         lora_path,
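
The corrected comment notes that loading accepts either an absolute local path or a Hugging Face id. A hypothetical sketch of that dual behavior (resolve_adapter_path is illustrative; the test itself uses vllm's get_adapter_absolute_path):

import os
from huggingface_hub import snapshot_download

def resolve_adapter_path(name_or_path: str) -> str:
    # Hypothetical stand-in: return a local directory whether given an
    # absolute path or a Hugging Face repo id.
    if os.path.isabs(name_or_path):
        return name_or_path
    return snapshot_download(repo_id=name_or_path)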

tests/model_executor/weight_utils.py

Lines changed: 3 additions & 3 deletions
@@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
     try:
         # enable hf hub transfer if available
         import hf_transfer  # type: ignore # noqa
-        HF_TRANFER_ACTIVE = True
+        HF_TRANSFER_ACTIVE = True
     except ImportError:
-        HF_TRANFER_ACTIVE = False
+        HF_TRANSFER_ACTIVE = False
     assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
-            HF_TRANFER_ACTIVE)
+            HF_TRANSFER_ACTIVE)
 
 
 def test_download_weights_from_hf():

vllm/config.py

Lines changed: 1 addition & 1 deletion
@@ -297,7 +297,7 @@ class ModelConfig:
     - 1K -> 1024\n
     - 25.6k -> 25,600"""
     spec_target_max_model_len: Optional[int] = None
-    """Specify the the maximum length for spec decoding draft models."""
+    """Specify the maximum length for spec decoding draft models."""
     quantization: Optional[QuantizationMethods] = None
     """Method used to quantize the weights. If `None`, we first check the
     `quantization_config` attribute in the model config file. If that is
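
For context, the docstring examples above ("1K -> 1024", "25.6k -> 25,600") describe human-readable integer parsing in which an uppercase suffix is binary and a lowercase one is decimal. A minimal sketch consistent with those two examples (illustrative helper, not vLLM's actual parser):

def parse_human_readable_int(value: str) -> int:
    # Illustrative: uppercase K = 1024 (binary), lowercase k = 1000 (decimal).
    if value.endswith("K"):
        return round(float(value[:-1]) * 1024)
    if value.endswith("k"):
        return round(float(value[:-1]) * 1000)
    return int(value)

assert parse_human_readable_int("1K") == 1024
assert parse_human_readable_int("25.6k") == 25600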

vllm/lora/ops/triton_ops/lora_expand_op.py

Lines changed: 1 addition & 1 deletion
@@ -153,7 +153,7 @@ def _lora_expand(
         lora_token_start_loc (torch.Tensor): A cumulative sum of
             num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
             lora_token_start_loc[i], along with num_tokens_per_lora[i]
-            identifies the the region in token_indices_sorted_by_lora_ids that
+            identifies the region in token_indices_sorted_by_lora_ids that
             LoRA lora_ids[i] should process.
         lora_ids (torch.Tensor): LoRA ids to process.
         no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
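
A small worked example of the bookkeeping this docstring describes, with illustrative tensor values: lora_token_start_loc[i] and num_tokens_per_lora[i] together bound the slice of token_indices_sorted_by_lora_ids owned by LoRA lora_ids[i].

import torch

# Illustrative values: 3 LoRAs with 4, 2, and 3 tokens respectively.
num_tokens_per_lora = torch.tensor([4, 2, 3])
lora_token_start_loc = torch.zeros(4, dtype=torch.long)
lora_token_start_loc[1:] = torch.cumsum(num_tokens_per_lora, dim=0)
# lora_token_start_loc == tensor([0, 4, 6, 9]); element 0 is always 0.

i = 1  # the second LoRA
start = int(lora_token_start_loc[i])
count = int(num_tokens_per_lora[i])
# token_indices_sorted_by_lora_ids[start : start + count] (here 4:6) are
# the token positions that LoRA lora_ids[i] should process.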

vllm/model_executor/layers/mamba/mamba_mixer2.py

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader(
 ) -> LoaderFunction:
     """Create a weight loader for mamba v2. This ensures that the projections
     are correctly sharded so that they can be split into x, B, C. It also
-    ensures the the all the groups corresponding to a head shard is placed
+    ensures that all the groups corresponding to a head shard is placed
     together with it.
     """

vllm/model_executor/models/granite_speech.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only IBM Granite speeech model."""
+"""Inference-only IBM Granite speech model."""
 import math
 from collections.abc import Iterable, Mapping
 from typing import Optional, TypedDict, Union
@@ -626,7 +626,7 @@ def _build_input_features_mask(
         audio_embed_sizes: torch.Tensor,
     ) -> torch.Tensor:
         """Calculate the input features mask, which will generally be used
-        to mask the the padded features for all entries in the batch except
+        to mask the padded features for all entries in the batch except
         for those with the most audio features.
 
         Args:
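
A sketch of the masking idea the docstring describes, with illustrative shapes (not the model's exact implementation): padding positions are masked for every batch entry except the one with the most audio features, which has none to mask.

import torch

audio_embed_sizes = torch.tensor([3, 5, 2])        # features per batch entry
max_len = int(audio_embed_sizes.max())
positions = torch.arange(max_len).unsqueeze(0)     # shape [1, max_len]
mask = positions < audio_embed_sizes.unsqueeze(1)  # shape [batch, max_len]
# mask[i, j] is True for real features, False for padding; row 1 (5 features,
# the most in the batch) has nothing masked.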

vllm/model_executor/models/phi4mm_audio.py

Lines changed: 4 additions & 4 deletions
@@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module):
             if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
             default to False.
-        attention_innner_dim: int, optional
+        attention_inner_dim: int, optional
             if equal to -1, attention dim for linears k/q/v is
-            equal to d_model. otherwise attention_innner_dim is used.
+            equal to d_model. otherwise attention_inner_dim is used.
             default -1.
         attention_glu_type: str, optional
             activation function for glu used in the multihead attention,
@@ -148,7 +148,7 @@ def __init__(
         conv_glu_type="sigmoid",
         bias_in_glu=True,
         linear_glu_in_convm=False,
-        attention_innner_dim=-1,
+        attention_inner_dim=-1,
         attention_glu_type="swish",
         activation_checkpointing="",
         export=False,
@@ -169,7 +169,7 @@ def __init__(
             n_head,
             d_model,
             dropout_rate,
-            attention_innner_dim,
+            attention_inner_dim,
             attention_glu_type,
             bias_in_glu,
             use_pt_scaled_dot_product_attention=
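
The renamed parameter's documented fallback, sketched with illustrative values: -1 means the k/q/v linear dimension falls back to d_model, and any other value is used directly.

# Illustrative sketch of the documented attention_inner_dim fallback.
def resolve_attn_dim(d_model: int, attention_inner_dim: int = -1) -> int:
    return d_model if attention_inner_dim == -1 else attention_inner_dim

assert resolve_attn_dim(512) == 512       # -1 -> fall back to d_model
assert resolve_attn_dim(512, 256) == 256  # explicit value wins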

vllm/v1/request.py

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ def __init__(
         assert len(self.mm_inputs) == len(self.mm_hashes)
 
         # Read-only views
-        # Prevent directly appending to the these lists since
+        # Prevent directly appending to these lists since
         # they should also be updated simultaneously.
         self.output_token_ids = ConstantList(self._output_token_ids)
         self.all_token_ids = ConstantList(self._all_token_ids)
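
The comment's intent: callers read through views while only the owner mutates the backing lists, keeping output_token_ids and all_token_ids in sync. A minimal sketch of such a read-only view (ConstantList's real implementation lives in vllm; this wrapper is illustrative):

class ReadOnlyView:
    """Illustrative stand-in for ConstantList: reads pass through, no append."""

    def __init__(self, backing: list):
        self._backing = backing  # shared with the owner, never copied

    def __getitem__(self, idx):
        return self._backing[idx]

    def __len__(self):
        return len(self._backing)

tokens = [1, 2, 3]
view = ReadOnlyView(tokens)
tokens.append(4)     # only the owner mutates the backing list
assert view[3] == 4  # the view sees the update but exposes no append()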
