5 | 5 | from typing import Optional |
6 | 6 |
7 | 7 | import torch |
8 | | -from torch import nn, softmax |
| 8 | +import torch.nn as nn |
9 | 9 | from torch.nn import functional as F |
10 | | -from torch.nn.functional import gumbel_softmax, pad |
11 | 10 |
12 | 11 | from vllm.model_executor.layers.layernorm import RMSNorm |
13 | 12 | from vllm.model_executor.layers.linear import ReplicatedLinear |
14 | 13 | from vllm.model_executor.layers.quantization.base_config import ( |
15 | 14 | QuantizationConfig) |
16 | | -from vllm.transformers_utils.configs.ovis2 import (AIMv2Config, |
17 | | - Aimv2VisualTokenizerConfig) |
18 | | - |
19 | | -IMAGE_INDICATOR_IDS = [-301, -302, -303, -304, |
20 | | - -305] # kept for vocab prefixed tokens |
21 | | - |
22 | | - |
23 | | -def st_argmax(y_soft: torch.Tensor, dim: int):  # straight-through argmax
24 | | - index = y_soft.max(dim, keepdim=True)[1] |
25 | | - y_hard = torch.zeros_like( |
26 | | - y_soft, memory_format=torch.legacy_contiguous_format).scatter_( |
27 | | - dim, index, 1.0) |
28 | | - ret = y_hard - y_soft.detach() + y_soft |
29 | | - return ret |
30 | | - |
31 | | - |
32 | | -class Aimv2VisualTokenizer(torch.nn.Module): |
33 | | - |
34 | | - def __init__(self, |
35 | | - config: Aimv2VisualTokenizerConfig, |
36 | | - quant_config: Optional[QuantizationConfig] = None, |
37 | | - prefix: str = "", |
38 | | - **kwargs): |
39 | | - super().__init__() |
40 | | - self.config = config |
41 | | - self.backbone = AIMv2Model( |
42 | | - config=config.backbone_config, # noqa |
43 | | - quant_config=quant_config, |
44 | | - prefix=f"{prefix}.visual_tokenizer") |
45 | | - # reserved tokens for IMAGE_INDICATORS |
46 | | - head_dim = config.vocab_size - len(IMAGE_INDICATOR_IDS) |
47 | | - self.head = torch.nn.Sequential( |
48 | | - ReplicatedLinear( |
49 | | - config.backbone_config.hidden_size * config.hidden_stride * |
50 | | - config.hidden_stride, |
51 | | - head_dim, |
52 | | - bias=False, |
53 | | - ), torch.nn.LayerNorm(head_dim)) |
54 | | - |
55 | | - @property |
56 | | - def dtype(self): |
57 | | - return self.backbone.dtype |
58 | | - |
59 | | - @property |
60 | | - def device(self): |
61 | | - return self.backbone.device |
62 | | - |
63 | | - def tokenize(self, logits): |
64 | | - if self.config.tokenize_function == 'softmax': |
65 | | - tokens = softmax(logits, dim=-1) |
66 | | - elif self.config.tokenize_function == 'gumbel_argmax': |
67 | | - tokens = gumbel_softmax(logits, tau=self.config.tau, hard=True) |
68 | | - elif self.config.tokenize_function == 'st_argmax': |
69 | | - tokens = st_argmax(logits, dim=-1) |
70 | | - else: |
71 | | - raise ValueError( |
72 | | -                'Invalid `tokenize_function`, expected softmax or gumbel_argmax '
73 | | - f'or st_argmax, but got {self.config.tokenize_function}') |
74 | | - return tokens |
75 | | - |
76 | | - def encode(self, pixel_values): |
77 | | - features = self.backbone(pixel_values) |
78 | | - if self.config.drop_cls_token: |
79 | | - features = features[:, 1:, :] |
80 | | - |
81 | | -        # merge each `hidden_stride * hidden_stride` block of hidden states together
82 | | - # to reduce token sequence length |
83 | | - # e.g., for hidden_stride=2, this leads to a token length reduction: |
84 | | - # 1024 -> 256 for aimv2 |
85 | | - if self.config.hidden_stride > 1: |
86 | | -            # this `d` may be different from the `d` above
87 | | - n, L, d = features.shape |
88 | | - sqrt_l = int(L**0.5) |
89 | | - assert sqrt_l**2 == L, ( |
90 | | - "The token sequence length should be a perfect square.") |
91 | | - features = features.reshape(n, sqrt_l, sqrt_l, d) |
92 | | - pl = (self.config.hidden_stride - |
93 | | - (sqrt_l % |
94 | | - self.config.hidden_stride)) % self.config.hidden_stride |
95 | | - features = pad(features, (0, 0, 0, pl, 0, pl), "constant", 0) |
96 | | - sqrt_l += pl |
97 | | - features = features.reshape(n, sqrt_l // self.config.hidden_stride, |
98 | | - self.config.hidden_stride, |
99 | | - sqrt_l // self.config.hidden_stride, |
100 | | - self.config.hidden_stride, d) |
101 | | - # [n, sqrt_l/hs, sqrt_l/hs, hs, hs, d] |
102 | | - features = features.permute(0, 1, 3, 2, 4, 5) |
103 | | - # [n, sqrt_l/hs, sqrt_l/hs, hs*hs*d] |
104 | | - features = features.flatten(3) |
105 | | - # [n, sqrt_l/hs*sqrt_l/hs, hs*hs*d] |
106 | | - features = features.reshape( |
107 | | - n, -1, |
108 | | - self.config.hidden_stride * self.config.hidden_stride * d) |
109 | | - |
110 | | - return features |
111 | | - |
112 | | - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: |
113 | | - """[BatchSize, ImageShape] -> [BatchSize, Token, VocabSize]""" |
114 | | - features = self.encode(pixel_values) |
115 | | - logits, _ = self.head[0]( |
116 | | -            features)  # apply the Sequential stages separately; ReplicatedLinear returns a tuple
117 | | - logits = self.head[1](logits) |
118 | | - tokens = self.tokenize(logits) |
119 | | -        # tokens' shape is [BatchSize, #Token, VocabSize-5], so pad with a
120 | | -        # zero tensor of shape [BatchSize, #Token, 5] so that tokens' shape
121 | | -        # becomes [BatchSize, #Token, VocabSize]
122 | | - batch_size, token_len, _ = tokens.shape |
123 | | - padding_tensor = torch.zeros(size=(batch_size, token_len, |
124 | | - len(IMAGE_INDICATOR_IDS)), |
125 | | - dtype=tokens.dtype, |
126 | | - device=tokens.device, |
127 | | - layout=tokens.layout, |
128 | | - requires_grad=False) |
129 | | - tokens = torch.cat((tokens, padding_tensor), dim=2) |
130 | | - return tokens |
| 15 | +from vllm.transformers_utils.configs.ovis import AIMv2Config |
131 | 16 |
132 | 17 |
133 | 18 | class AIMv2SwiGLUFFN(nn.Module): |
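For reference, the deleted `encode()` above merges each `hidden_stride * hidden_stride` block of neighbouring patch features into a single token before the visual head. A minimal standalone sketch of that merge, assuming a square patch grid (the helper name and toy shapes are illustrative, not vLLM API):

```python
import torch
import torch.nn.functional as F


def merge_hidden_stride(features: torch.Tensor, hidden_stride: int) -> torch.Tensor:
    """Collapse each hidden_stride x hidden_stride block of patch features into one token."""
    n, seq_len, d = features.shape
    side = int(seq_len ** 0.5)
    assert side * side == seq_len, "token sequence length must be a perfect square"
    grid = features.reshape(n, side, side, d)
    # pad the right/bottom edges so the grid side is divisible by hidden_stride
    pl = (hidden_stride - side % hidden_stride) % hidden_stride
    grid = F.pad(grid, (0, 0, 0, pl, 0, pl))
    side += pl
    grid = grid.reshape(n, side // hidden_stride, hidden_stride,
                        side // hidden_stride, hidden_stride, d)
    # move the two stride dims next to the channel dim so the flatten below
    # concatenates the hidden_stride**2 neighbouring patch vectors
    grid = grid.permute(0, 1, 3, 2, 4, 5)
    return grid.flatten(3).reshape(n, -1, hidden_stride * hidden_stride * d)


# e.g. aimv2 with hidden_stride=2: 1024 patch tokens -> 256 tokens of width 4*d
x = torch.randn(1, 1024, 8)
print(merge_hidden_stride(x, 2).shape)  # torch.Size([1, 256, 32])
```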
@@ -302,14 +187,6 @@ def __init__(self, |
302 | 187 | quant_config=quant_config, |
303 | 188 | prefix=f"{prefix}.trunk") |
304 | 189 |
305 | | - @property |
306 | | - def dtype(self): |
307 | | - return self.trunk.blocks[0].attn.qkv.weight.dtype |
308 | | - |
309 | | - @property |
310 | | - def device(self): |
311 | | - return self.trunk.blocks[0].attn.qkv.device |
312 | | - |
313 | 190 | def forward( |
314 | 191 | self, |
315 | 192 | pixel_values: torch.Tensor, |
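The `st_argmax` helper removed in the first hunk is the usual straight-through estimator: the forward pass emits a hard one-hot token while gradients flow back through the soft probabilities unchanged. A small runnable sketch of the trick (the demo tensors are illustrative, and the memory-format argument from the original is omitted):

```python
import torch


def st_argmax(y_soft: torch.Tensor, dim: int) -> torch.Tensor:
    # forward: one-hot at the argmax; backward: identity w.r.t. y_soft
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(y_soft).scatter_(dim, index, 1.0)
    return y_hard - y_soft.detach() + y_soft


logits = torch.randn(2, 5, requires_grad=True)
tokens = st_argmax(torch.softmax(logits, dim=-1), dim=-1)
print(tokens)  # hard one-hot rows

# gradients still reach the logits through the soft path
(tokens * torch.arange(5.0)).sum().backward()
print(logits.grad.abs().sum() > 0)  # tensor(True)
```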