-
Notifications
You must be signed in to change notification settings - Fork 27.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
VLM: fixes after refactor #32907
VLM: fixes after refactor #32907
Changes from all commits
70a70b1
4b1179d
4fc07d0
a1d4434
255de78
41dc04d
cb787c9
cf35b01
6a7a40e
ead33ff
663a354
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,30 +140,29 @@ def __call__( | |
elif not isinstance(text, list) and not isinstance(text[0], str): | ||
raise ValueError("Invalid input text. Please provide a string, or a list of strings") | ||
|
||
if self.patch_size is None or self.vision_feature_select_strategy is None: | ||
prompt_strings = text | ||
logger.warning_once( | ||
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " | ||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " | ||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " | ||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47." | ||
) | ||
# cannot infer image expansion length if no images are found | ||
elif not image_inputs: | ||
prompt_strings = text | ||
else: | ||
image_sizes = image_inputs["image_sizes"] | ||
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0])) | ||
prompt_strings = [] | ||
for image_size, sample in zip(image_sizes, text): | ||
# Replace the image token with the expanded image token sequence | ||
orig_height, orig_width = image_size | ||
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) | ||
if self.vision_feature_select_strategy == "default": | ||
num_image_tokens -= 1 | ||
|
||
sample = sample.replace(self.image_token, self.image_token * num_image_tokens) | ||
prompt_strings.append(sample) | ||
prompt_strings = text | ||
if image_inputs: | ||
if self.patch_size is None or self.vision_feature_select_strategy is None: | ||
logger.warning_once( | ||
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " | ||
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " | ||
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " | ||
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47." | ||
) | ||
else: | ||
image_sizes = iter(image_inputs["image_sizes"]) | ||
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0])) | ||
prompt_strings = [] | ||
for sample in text: | ||
while self.image_token in sample: | ||
image_size = next(image_sizes) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this right? The previous logic implies — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I verified twice to be sure. The number of images == number of image sizes. The newly added test — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @amyeroberts do you have any other concerns regarding this PR? |
||
orig_height, orig_width = image_size | ||
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) | ||
if self.vision_feature_select_strategy == "default": | ||
num_image_tokens -= 1 | ||
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1) | ||
prompt_strings.append(sample) | ||
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings] | ||
|
||
text_inputs = self.tokenizer( | ||
prompt_strings, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was needed for cases with multi-image inputs where we cannot be sure that the number of image sizes is the same as the number of texts. For example, one text and two images.