@@ -37,7 +37,6 @@
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     get_compressed_tensors_cache_scale)
@@ -47,14 +46,12 @@
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.platforms import current_platform
-from vllm.sequence import IntermediateTensors, PoolerOutput
+from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -497,7 +494,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
-        pooler_config = vllm_config.model_config.pooler_config
         self.config = config
         self.lora_config = lora_config
 
@@ -530,13 +526,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.sampler = get_sampler()
         else:
             self.lm_head = PPMissingLayer()
+
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.STEP,
-            normalize=False,
-            softmax=False)
 
     def _init_model(self, vllm_config: VllmConfig, prefix: str = ""):
         return LlamaModel(vllm_config=vllm_config, prefix=prefix)
@@ -567,14 +559,6 @@ def compute_logits(
                                        sampling_metadata)
         return logits
 
-    def pooler(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> Optional[PoolerOutput]:
-        logits = self.compute_logits(hidden_states, None)
-        return self._pooler(logits, pooling_metadata)
-
     def sample(self, logits: torch.Tensor,
                sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
         next_tokens = self.sampler(logits, sampling_metadata)
@@ -625,79 +609,3 @@ def permute(w: torch.Tensor, n_heads: int):
                 name = name.replace(item, mapping[item])
 
         return name, loaded_weight
-
-
-# TODO: Remove this once reward modeling is separated from LlamaForCausalLM
-class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
-    """
-    A model that uses Llama with additional embedding functionalities.
-
-    This class encapsulates the LlamaModel and provides an interface for
-    embedding operations and customized pooling functions.
-
-    Attributes:
-        model: An instance of LlamaModel used for forward operations.
-        _pooler: An instance of Pooler used for pooling operations.
-    """
-    packed_modules_mapping = {
-        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-        "gate_up_proj": ["gate_proj", "up_proj"]
-    }
-
-    # LoRA specific attributes
-    supported_lora_modules = [
-        "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens"
-    ]
-    embedding_modules = {
-        "embed_tokens": "input_embeddings",
-    }
-    embedding_padding_modules = []
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-
-        pooler_config = vllm_config.model_config.pooler_config
-
-        self.model = LlamaModel(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.LAST,
-            normalize=True,
-            softmax=False)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor],
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        return self.model(input_ids, positions, kv_caches, attn_metadata,
-                          intermediate_tensors, inputs_embeds)
-
-    def pooler(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> Optional[PoolerOutput]:
-        return self._pooler(hidden_states, pooling_metadata)
-
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-        weights = hf_to_vllm_mapper.apply(weights)
-        weights = ((name, data) for name, data in weights
-                   if not name.startswith("lm_head."))
-        self.model.load_weights(weights)
-
-    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
-        self.model.load_kv_cache_scales(quantization_param_path)
-
-    # LRUCacheWorkerLoRAManager instantiation requires model config.
-    @property
-    def config(self):
-        return self.model.config