Skip to content

Commit

Permalink
Some fixes. Many tests are still failing but some are passing now.
Browse files Browse the repository at this point in the history
- I have added TODOs for some of the hacks I made to unblock myself,
  and I will address them soon
- I have the processing_idefics.py hacked in my view to support TF temporarily
  • Loading branch information
a8nova committed Nov 21, 2023
1 parent 335e47d commit b42fe29
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 25 deletions.
3 changes: 2 additions & 1 deletion src/transformers/models/idefics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
"TFIdeficsForVisionText2Text",
"TFIdeficsModel",
"TFIdeficsPreTrainedModel",
"TFIdeficsProcessor"
]

if TYPE_CHECKING:
Expand Down Expand Up @@ -85,7 +86,7 @@

try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
Expand Down
68 changes: 54 additions & 14 deletions src/transformers/models/idefics/modeling_tf_idefics.py
Original file line number Diff line number Diff line change
Expand Up @@ -985,7 +985,8 @@ def _init_weights(self, module):
module.embeddings = tf.random.normal(shape=module.embeddings.shape, mean=0.0, stddev=std)

def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, TFIdeficsModel):
# TODO: Alazar, should below be TFIdeficsModel instead?
if isinstance(module, TFIdeficsMainLayer):
module.gradient_checkpointing = value


Expand Down Expand Up @@ -1055,16 +1056,16 @@ def _set_gradient_checkpointing(self, module, value=False):
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
LLAMA_START_DOCSTRING,
)
class TFIdeficsModel(TFIdeficsPreTrainedModel):
class TFIdeficsMainLayer(tf.keras.layers.Layer):
"""
Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]
Args:
config: IdeficsConfig
"""

def __init__(self, config: IdeficsConfig, **kwargs):
super().__init__(config, **kwargs)
def __init__(self, config: IdeficsConfig, add_pooling_year: bool = True, **kwargs):
super().__init__(**kwargs)
self.config = config
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
Expand Down Expand Up @@ -1094,7 +1095,7 @@ def __init__(self, config: IdeficsConfig, **kwargs):
name="perceiver_resampler",
)

self.layers = [TFIdeficsDecoderLayer(config, name=f"layers_{i}") for i in range(config.num_hidden_layers)]
self.decoder_layers = [TFIdeficsDecoderLayer(config, name=f"layers_{i}") for i in range(config.num_hidden_layers)]

self.cross_layer_interval = config.cross_layer_interval
num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
Expand All @@ -1107,10 +1108,8 @@ def __init__(self, config: IdeficsConfig, **kwargs):
self.norm = TFIdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="norm")

self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()

self.freeze_relevant_params(config)
# TODO: Alazar
#self.freeze_relevant_params(config)

def freeze_relevant_params(self, config=None):
if config is None:
Expand All @@ -1123,7 +1122,7 @@ def freeze_relevant_params(self, config=None):
freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)

def freeze_text_layers(self, module_exceptions=[]):
for module in [self.layers, self.norm]:
for module in [self.decoder_layers, self.norm]:
freeze_model(module, module_exceptions=module_exceptions)

def freeze_vision_layers(self, module_exceptions=[]):
Expand Down Expand Up @@ -1218,7 +1217,7 @@ def call(
no_images = tf.reduce_sum(tf.cast(pixel_values, dtype=tf.int32)) == 0
pixel_values = tf.cast(pixel_values, dtype=self.dtype) # fp16 compatibility
batch_size, num_images = shape_list(pixel_values)[:2]
pixel_values = tf.reshape(pixel_values, (batch_size * num_images, *shape_list(pixel_values)[2:]))
pixel_values = tf.reshape(pixel_values, [batch_size * num_images, *pixel_values.shape[2:]])

# Get sequence from the vision encoder
image_hidden_states = self.vision_model(
Expand Down Expand Up @@ -1407,14 +1406,57 @@ def vblock(
image_hidden_states=image_hidden_states,
)

class TFIdeficsModel(TFIdeficsPreTrainedModel):
    """Pretrained-model wrapper that delegates its forward pass to a `TFIdeficsMainLayer`.

    The main layer is constructed from `config` with the Keras name "idefics" and is
    stored on the instance as `self.model`.
    """

    def __init__(self, config: IdeficsConfig, **kwargs):
        """Initialise the pretrained-model base class, then create the wrapped main layer."""
        super().__init__(config, **kwargs)

        self.model = TFIdeficsMainLayer(config, name="idefics")

    def call(
        self,
        input_ids: tf.Tensor = None,
        attention_mask: Optional[tf.Tensor] = None,
        position_ids: Optional[tf.Tensor] = None,
        past_key_values: Optional[List[tf.Tensor]] = None,
        inputs_embeds: Optional[tf.Tensor] = None,
        pixel_values: Optional[tf.Tensor] = None,
        image_encoder_embeddings: Optional[tf.Tensor] = None,
        perceiver_embeddings: Optional[tf.Tensor] = None,
        image_attention_mask: Optional[tf.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFIdeficsBaseModelOutputWithPast]:
        """Run the wrapped main layer and return whatever it returns.

        Every argument is passed through unchanged, by keyword, to `self.model`;
        no pre- or post-processing happens at this level.
        """
        # Gather the pass-through arguments once so the delegation is a single call.
        forwarded = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "inputs_embeds": inputs_embeds,
            "pixel_values": pixel_values,
            "image_encoder_embeddings": image_encoder_embeddings,
            "perceiver_embeddings": perceiver_embeddings,
            "image_attention_mask": image_attention_mask,
            "use_cache": use_cache,
            "output_attentions": output_attentions,
            "output_hidden_states": output_hidden_states,
            "interpolate_pos_encoding": interpolate_pos_encoding,
            "return_dict": return_dict,
            "training": training,
        }
        return self.model(**forwarded)


class TFIdeficsForVisionText2Text(TFPreTrainedModel):
_keys_to_ignore_on_load_missing = [r"lm_head.weight"]
_tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]

def __init__(self, config, vision_model=None, **kwargs):
super().__init__(config, **kwargs)
self.model = TFIdeficsModel(config)
self.model = TFIdeficsMainLayer(config)

self.lm_head = TFIdeficsDecoupledLinear(
config.hidden_size,
Expand All @@ -1424,8 +1466,6 @@ def __init__(self, config, vision_model=None, **kwargs):
partially_freeze=config.freeze_lm_head,
)

# Initialize weights and apply final processing
self.post_init()

def get_input_embeddings(self):
return self.model.embed_tokens
Expand Down
13 changes: 8 additions & 5 deletions src/transformers/models/idefics/vision_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

import tensorflow as tf

from ...activations import ACT2FN
from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import shape_list, TFPreTrainedModel
from ...utils import ModelOutput, logging
Expand Down Expand Up @@ -77,7 +77,10 @@ def __init__(self, config: IdeficsVisionConfig, **kwargs):
kernel_size=self.patch_size,
strides=self.patch_size,
use_bias=False,
data_format="channels_last",
# TODO: Alazar, channel_first data format isn't supported on CPU
# but I was getting a weird crash when it is set to channels_last
# I will investigate later, just a temporary hack
data_format="channels_first",
name="patch_embedding",
)

Expand Down Expand Up @@ -119,15 +122,15 @@ def interpolate_pos_encoding(self, embeddings: tf.Tensor, height: int, width: in
return tf.concat((class_pos_embed[tf.newaxis, :], patch_pos_embed), axis=1)

def call(self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False) -> tf.Tensor:
batch_size, height, width, num_channels = shape_list(pixel_values)
batch_size, num_channels, height, width = shape_list(pixel_values)
if not interpolate_pos_encoding:
if height != self.image_size or width != self.image_size:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model"
f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
)

pixel_values = tf.transpose(pixel_values, perm=[0, 3, 1, 2])
#pixel_values = tf.transpose(pixel_values, perm=[0, 3, 1, 2])
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]

patch_embeds = tf.reshape(patch_embeds, [batch_size, self.num_patches, -1])
Expand Down Expand Up @@ -254,7 +257,7 @@ class TFIdeficsVisionMLP(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.activation_fn = get_tf_activation(config.hidden_act)
self.fc1 = tf.keras.layers.Dense(config.intermediate_size, name="fc1")
self.fc2 = tf.keras.layers.Dense(config.hidden_size, name="fc2")

Expand Down
10 changes: 5 additions & 5 deletions tests/models/idefics/test_modeling_tf_idefics.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
if is_tf_available():
import tensorflow as tf

from transformers import TFIdeficsForVisionText2Text, TFIdeficsModel, TFIdeficsProcessor
from transformers.models.idefics.configuration_idefics import TFIdeficsPerceiverConfig, TFIdeficsVisionConfig
from transformers import TFIdeficsForVisionText2Text, TFIdeficsModel, IdeficsProcessor
from transformers.models.idefics.configuration_idefics import IdeficsPerceiverConfig, IdeficsVisionConfig
from transformers.models.idefics.modeling_idefics import IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST

if is_vision_available():
Expand Down Expand Up @@ -279,7 +279,7 @@ def test_model_outputs_equivalence(self):

def setUp(self):
self.model_tester = IdeficsModelTester(self)
self.config_tester = ConfigTester(self, config_class=TFIdeficsConfig, hidden_size=37)
self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37)

def test_config(self):
self.config_tester.run_common_tests()
Expand Down Expand Up @@ -335,7 +335,7 @@ def test_training(self):
for model_class in self.all_model_classes:
# IdeficsModel does not support training, users should use
# IdeficsForVisionText2Text for this purpose
if model_class == IdeficsModel:
if model_class == TFIdeficsModel:
return

config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
Expand All @@ -354,7 +354,7 @@ def test_training_gradient_checkpointing(self):
for model_class in self.all_model_classes:
# IdeficsModel does not support training, users should use
# IdeficsForVisionText2Text for this purpose
if model_class == IdeficsModel:
if model_class == TFIdeficsModel:
return

config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
Expand Down

0 comments on commit b42fe29

Please sign in to comment.