
Commit 947a37e

Update recent processors for vLLM backend (#39583)

* update recent models and make sure it runs with vLLM
* delete!

1 parent 7b897fe

File tree

6 files changed: +79 -9 lines

src/transformers/models/glm4v/configuration_glm4v.py

Lines changed: 0 additions & 1 deletion
@@ -18,7 +18,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from ...configuration_utils import PretrainedConfig
 from ...modeling_rope_utils import rope_config_validation
 

src/transformers/models/glm4v/image_processing_glm4v.py

Lines changed: 2 additions & 2 deletions
@@ -454,11 +454,11 @@ def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
 
         factor = patch_size * merge_size
         resized_height, resized_width = smart_resize(
-            t=self.temporal_patch_size,
+            num_frames=self.temporal_patch_size,
             height=height,
             width=width,
             factor=factor,
-            t_factor=self.temporal_patch_size,
+            temporal_factor=self.temporal_patch_size,
         )
         grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
         return grid_h * grid_w
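For context, a minimal sketch of how the renamed keywords are used after this change. It assumes smart_resize is the module-level helper in image_processing_glm4v.py with the Qwen2-VL-style signature the diff implies; the concrete values are illustrative only:

from transformers.models.glm4v.image_processing_glm4v import smart_resize

# `t=` became `num_frames=` and `t_factor=` became `temporal_factor=`.
resized_height, resized_width = smart_resize(
    num_frames=2,       # the processor passes temporal_patch_size here
    height=480,
    width=640,
    factor=28,          # patch_size * merge_size, e.g. 14 * 2
    temporal_factor=2,  # the processor passes temporal_patch_size here too
)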

src/transformers/models/glm4v/modeling_glm4v.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import itertools
 from dataclasses import dataclass
 from typing import Any, Callable, Optional, Union
@@ -753,6 +752,7 @@ def forward(
             output_attentions=output_attentions,
             use_cache=use_cache,
             cache_position=cache_position,
+            **kwargs,
         )
 
         hidden_states = self.post_self_attn_layernorm(hidden_states)
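The added **kwargs forwarding follows the common transformers pattern of passing backend-specific arguments through the decoder layer into the attention call rather than dropping them. A self-contained sketch of that pattern (not the actual Glm4v layer):

import torch
import torch.nn as nn

class TinyDecoderLayer(nn.Module):
    """Illustrative only: forwards unknown kwargs down to attention."""

    def __init__(self):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True)

    def forward(self, hidden_states, **kwargs):
        # Without **kwargs forwarding, extra arguments an alternative attention
        # backend needs would be silently lost at the layer boundary.
        attn_out, _ = self.self_attn(hidden_states, hidden_states, hidden_states, **kwargs)
        return attn_out

layer = TinyDecoderLayer()
out = layer(torch.randn(1, 4, 8), need_weights=False)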

src/transformers/models/glm4v/modular_glm4v.py

Lines changed: 9 additions & 1 deletion
@@ -12,10 +12,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import itertools
 from typing import Callable, Optional, Union
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -822,6 +822,7 @@ def forward(
             output_attentions=output_attentions,
             use_cache=use_cache,
             cache_position=cache_position,
+            **kwargs,
         )
 
         hidden_states = self.post_self_attn_layernorm(hidden_states)
@@ -1566,6 +1567,7 @@ class Glm4vProcessorKwargs(Qwen2_5_VLProcessorKwargs):
     _defaults = {
         "text_kwargs": {
             "padding": False,
+            "return_mm_token_type_ids": False,
         },
     }
@@ -1707,9 +1709,15 @@ def __call__(
 
             text[i] = text[i].replace("<|placeholder|>", self.image_token)
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
         self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
 
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
         return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)

src/transformers/models/glm4v/processing_glm4v.py

Lines changed: 9 additions & 1 deletion
@@ -18,9 +18,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from typing import Optional, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
 from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
@@ -44,6 +45,7 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
     _defaults = {
         "text_kwargs": {
             "padding": False,
+            "return_mm_token_type_ids": False,
         },
     }
@@ -200,9 +202,15 @@ def __call__(
 
             text[i] = text[i].replace("<|placeholder|>", self.image_token)
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
         self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
 
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
         return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
 
     def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
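From the caller's side, a hedged usage sketch of the new kwarg. The checkpoint id, dummy image, and prompt format are placeholders, not taken from this commit:

from PIL import Image
import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/glm4v-checkpoint")  # placeholder id
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # dummy image

inputs = processor(
    text=["<|image|> Describe this picture."],  # image placeholder token is model-specific
    images=[image],
    return_mm_token_type_ids=True,  # new in this commit; defaults to False
)
print(inputs["mm_token_type_ids"])  # 1 at image-token positions, 0 elsewhere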

src/transformers/models/perception_lm/processing_perception_lm.py

Lines changed: 58 additions & 3 deletions
@@ -17,9 +17,11 @@
 
 from typing import Iterable, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import logging
 from ...video_utils import VideoInput
@@ -32,6 +34,7 @@ class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False):
     _defaults = {
         "text_kwargs": {
             "padding": False,
+            "return_mm_token_type_ids": False,
         },
     }
@@ -157,9 +160,17 @@ def __call__(
             prompt_strings.append(sample)
 
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
         self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"])
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
 
     def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable):
         media_count = sample.count(media_token)
@@ -183,6 +194,50 @@ def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable):
         sample += sample_splits[-1]
         return sample
 
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = PerceptionLMProcessorKwargs._defaults.get("images_kwargs", {})
+            images_kwargs.update(kwargs)
+            tile_size = images_kwargs.get("tile_size", None) or self.image_processor.tile_size
+
+            num_image_tokens = []
+            num_image_patches = []
+            for height, width in image_sizes:
+                if self.image_processor.vision_input_type == "thumb+tile":
+                    aspect_ratio = self.image_processor._fit_image_to_canvas(
+                        img_width=width, img_height=height, tile_size=tile_size
+                    )
+                    if aspect_ratio is None:
+                        aspect_ratio = self.image_processor._find_closest_aspect_ratio(
+                            img_width=width, img_height=height, tile_size=tile_size
+                        )
+                    num_tiles = aspect_ratio[0] * aspect_ratio[1] + 1  # base image and tiles
+                else:
+                    num_tiles = 1
+
+                num_image_tokens.append(
+                    (tile_size // self.patch_size // self.pooling_ratio)
+                    * (tile_size // self.patch_size // self.pooling_ratio)
+                    * num_tiles
+                )
+                num_image_patches.append(num_tiles)
+
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        return MultiModalData(**vision_data)
+
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to PerceptionLMTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
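The token-count arithmetic in _get_num_multimodal_tokens is easy to check by hand. A sketch with made-up values; PerceptionLM's real tile_size, patch_size, and pooling_ratio may differ:

# Illustrative values only, not PerceptionLM's actual configuration.
tile_size, patch_size, pooling_ratio = 448, 14, 2
num_tiles = 2 * 2 + 1  # a 2x2 "thumb+tile" layout plus the base thumbnail
tokens_per_tile = (tile_size // patch_size // pooling_ratio) ** 2  # (448 // 14 // 2)**2 = 256
num_image_tokens = tokens_per_tile * num_tiles  # 256 * 5 = 1280
print(num_image_tokens)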
