Skip to content

Commit

Permalink
Merge pull request vllm-project#3 from beagleski/bapatra/bugfix-longrope-type
Browse files Browse the repository at this point in the history

minor change for LongRoPE config to account for the rename of the rope-scaling type from "longrope" to "su"
  • Loading branch information
codedecde authored May 4, 2024
2 parents 63b9bb8 + e1dd365 commit 561d5a8
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
2 changes: 1 addition & 1 deletion vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,7 +972,7 @@ def _get_and_verify_max_len(
derived_max_model_len = default_max_len

rope_scaling = getattr(hf_config, "rope_scaling", None)
if rope_scaling is not None and rope_scaling["type"] != "longrope":
if rope_scaling is not None and rope_scaling["type"] not in ("longrope", "su"):
assert "factor" in rope_scaling
scaling_factor = rope_scaling["factor"]
if rope_scaling["type"] == "yarn":
Expand Down
6 changes: 5 additions & 1 deletion vllm/model_executor/models/phi3small/phi3small.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,11 @@ def compute_logits(self, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata) -> torch.Tensor:
logits = self.logits_processor(self.lm_head.weight, hidden_states,
sampling_metadata)
if self.dummy_token_indices is not None:
if self.dummy_token_indices is not None and logits is not None:
# In case of tensor-parallelism, the logit processor under the hood
# does a `tensor_model_parallel_gather`, so that the vocab multiplication
# happens only on rank 0. For all other ranks, the logits are returned as
# None. Hence only the rank with non-None logits should fill the dummy tokens with -inf.
logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
return logits

Expand Down

0 comments on commit 561d5a8

Please sign in to comment.