2 changes: 1 addition & 1 deletion src/transformers/models/bamba/modular_bamba.py
@@ -708,7 +708,7 @@ class BambaRMSNorm(LlamaRMSNorm):
 
 class BambaDecoderLayer(JambaAttentionDecoderLayer):
     def __init__(self, config: BambaConfig, layer_idx: int, layer_type: str = "mamba"):
-        super().__init__()
+        super().__init__(config, layer_idx)
 
         del self.self_attn
 
2 changes: 1 addition & 1 deletion src/transformers/models/cohere2/modular_cohere2.py
@@ -271,7 +271,7 @@ class Cohere2LayerNorm(CohereLayerNorm):
     pass
 
 
-class Cohere2Attention(CohereAttention, nn.Module):
+class Cohere2Attention(CohereAttention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(self, config: Cohere2Config, layer_idx: Optional[int] = None):
2 changes: 1 addition & 1 deletion src/transformers/models/d_fine/modular_d_fine.py
@@ -898,7 +898,7 @@ def __init__(self, config: DFineConfig):
 
 class DFineForObjectDetection(RTDetrForObjectDetection, DFinePreTrainedModel):
     def __init__(self, config: DFineConfig):
-        DFinePreTrainedModel.__init__(config)
+        DFinePreTrainedModel.__init__(self, config)
 
         # D-FINE encoder-decoder model
         self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx
6 changes: 3 additions & 3 deletions src/transformers/models/data2vec/modular_data2vec_audio.py
@@ -112,7 +112,7 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-class Data2VecAudioFeatureEncoder(Wav2Vec2FeatureEncoder, nn.Module):
+class Data2VecAudioFeatureEncoder(Wav2Vec2FeatureEncoder):
     def __init__(self, config):
         nn.Module.__init__(self)
         self.conv_layers = nn.ModuleList(
@@ -183,7 +183,7 @@ def load_adapter(self):
 
 class Data2VecAudioModel(Data2VecAudioPreTrainedModel, Wav2Vec2Model):
     def __init__(self, config: Data2VecAudioConfig):
-        Data2VecAudioPreTrainedModel.__init__(config)
+        Data2VecAudioPreTrainedModel.__init__(self, config)
         self.config = config
         self.feature_extractor = Data2VecAudioFeatureEncoder(config)
         self.feature_projection = Data2VecAudioFeatureProjection(config)
@@ -215,7 +215,7 @@ def forward(self, **super_kwargs):
 
 class Data2VecAudioForCTC(Data2VecAudioPreTrainedModel, Wav2Vec2ForCTC):
     def __init__(self, config):
-        Data2VecAudioPreTrainedModel.__init__(config)
+        Data2VecAudioPreTrainedModel.__init__(self, config)
 
         self.data2vec_audio = Data2VecAudioModel(config)
         self.dropout = nn.Dropout(config.final_dropout)
2 changes: 1 addition & 1 deletion src/transformers/models/dia/modular_dia.py
@@ -107,7 +107,7 @@ class DiaRotaryEmbedding(LlamaRotaryEmbedding):
     pass
 
 
-class DiaSelfAttention(LlamaAttention, nn.Module):
+class DiaSelfAttention(LlamaAttention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
     def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
2 changes: 1 addition & 1 deletion src/transformers/models/dots1/modular_dots1.py
@@ -62,7 +62,7 @@ class Dots1TopkRouter(DeepseekV3TopkRouter):
 
 class Dots1DecoderLayer(DeepseekV3DecoderLayer):
     def __init__(self, config: Dots1Config, layer_idx: int):
-        super().__init__()
+        super().__init__(config, layer_idx)
         self.attention_type = config.layer_types[layer_idx]
 
 
2 changes: 1 addition & 1 deletion src/transformers/models/ernie4_5/modular_ernie4_5.py
@@ -84,7 +84,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
 
 class Ernie4_5MLP(LlamaMLP):
     def __init__(self, config: Ernie4_5Config):
-        super().__init__()
+        super().__init__(config)
 
         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias)
@@ -541,7 +541,6 @@ def make_bond_key(atom1_name: str, atom2_name: str) -> str:
 
 # A compact atom encoding with 14 columns
 # pylint: disable=line-too-long
-# pylint: disable=bad-whitespace
 restype_name_to_atom14_names: dict[str, list[str]] = {
     "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""],
     "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2", "", "", ""],
@@ -566,7 +565,6 @@ def make_bond_key(atom1_name: str, atom2_name: str) -> str:
     "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
 }
 # pylint: enable=line-too-long
-# pylint: enable=bad-whitespace
 
 
 # This is the standard residue order when coding AA type as a number.
4 changes: 2 additions & 2 deletions src/transformers/models/evolla/modular_evolla.py
@@ -65,7 +65,7 @@
 
 class EvollaSaProtEmbeddings(EsmEmbeddings):
     def __init__(self, config):
-        super().__init__()
+        super().__init__(config)
         # remove the position_ids in EsmEmbeddings
         self.position_ids = None
 
@@ -127,7 +127,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch
         )
 
 
-class EvollaSaProtSelfAttention(EsmSelfAttention, nn.Module):
+class EvollaSaProtSelfAttention(EsmSelfAttention):
     def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cross_attention=False):
         nn.Module.__init__(self)
         self.config = config
2 changes: 1 addition & 1 deletion src/transformers/models/falcon_h1/modeling_falcon_h1.py
@@ -1007,7 +1007,7 @@ def forward(
 
 
 class FalconH1MLP(nn.Module):
-    def __init__(self, config: FalconH1Config = None):
+    def __init__(self, config: FalconH1Config):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
6 changes: 3 additions & 3 deletions src/transformers/models/falcon_h1/modular_falcon_h1.py
@@ -252,7 +252,7 @@ def forward(
 
 class FalconH1RMSNormGated(MambaRMSNormGated):
     def __init__(self, hidden_size, eps=1e-6, n_groups=1, norm_before_gate=True):
-        super().__init__()
+        super().__init__(hidden_size=hidden_size, eps=eps)
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
         self.n_groups = n_groups
@@ -812,8 +812,8 @@ def forward(
 
 
 class FalconH1MLP(LlamaMLP):
-    def __init__(self, config: FalconH1Config = None):
-        super().__init__()
+    def __init__(self, config: FalconH1Config):
+        super().__init__(config)
         self.gate_multiplier, self.down_multiplier = config.mlp_multipliers
 
     def forward(self, x):
@@ -517,7 +517,7 @@ class FalconMambaCausalLMOutput(MambaCausalLMOutput):
 
 class FalconMambaModel(MambaModel, FalconMambaPreTrainedModel):
     def __init__(self, config):
-        FalconMambaPreTrainedModel.__init__(config)
+        FalconMambaPreTrainedModel.__init__(self, config)
 
         self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
         self.layers = nn.ModuleList(
2 changes: 1 addition & 1 deletion src/transformers/models/florence2/modular_florence2.py
@@ -1065,7 +1065,7 @@ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
 
 class Florence2VisionMLP(Llama4VisionMLP):
     def __init__(self, config: Florence2VisionConfig, stage_idx: int):
-        super().__init__()
+        super().__init__(config)
         self.fc1 = nn.Linear(config.embed_dim[stage_idx], int(config.embed_dim[stage_idx] * config.mlp_ratio))
         self.activation_fn = ACT2FN[config.activation_function]
         self.fc2 = nn.Linear(int(config.embed_dim[stage_idx] * config.mlp_ratio), config.embed_dim[stage_idx])
2 changes: 1 addition & 1 deletion src/transformers/models/gemma/modular_gemma.py
@@ -360,7 +360,7 @@ def extra_repr(self):
 
 class GemmaMLP(LlamaMLP):
     def __init__(self, config):
-        super().__init__()
+        super().__init__(config)
         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
2 changes: 1 addition & 1 deletion src/transformers/models/gemma2/modular_gemma2.py
@@ -209,7 +209,7 @@ class Gemma2RMSNorm(GemmaRMSNorm):
 
 class Gemma2MLP(GemmaMLP):
     def __init__(self, config):
-        super().__init__()
+        super().__init__(config)
         self.act_fn = ACT2FN[config.hidden_activation]
 
 
4 changes: 2 additions & 2 deletions src/transformers/models/gemma3/modular_gemma3.py
@@ -383,7 +383,7 @@ def __init__(self, config: Gemma3TextConfig):
 
 class Gemma3RMSNorm(Gemma2RMSNorm):
     def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
+        super().__init__(dim=dim, eps=eps)
 
 
 class Gemma3RotaryEmbedding(Gemma2RotaryEmbedding):
@@ -396,7 +396,7 @@ class Gemma3Attention(Gemma2Attention):
     def __init__(self, config: Gemma3TextConfig, layer_idx: int):
         self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
 
-        super().__init__()
+        super().__init__(config, layer_idx)
         self.sliding_window = config.sliding_window if self.is_sliding else None
 
         self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
4 changes: 2 additions & 2 deletions src/transformers/models/gemma3n/modular_gemma3n.py
@@ -1739,7 +1739,7 @@ def apply_rotary_pos_emb(
 
 class Gemma3nTextAttention(Gemma3Attention):
     def __init__(self, config: Gemma3nTextConfig, layer_idx: int):
-        super().__init__()
+        super().__init__(config, layer_idx)
         del self.attn_logit_softcapping
         del self.scaling
         self.v_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps, with_scale=False)
@@ -2234,7 +2234,7 @@ class Gemma3nModel(PaliGemmaModel):
     _checkpoint_conversion_mapping = {}
 
     def __init__(self, config: Gemma3nConfig):
-        super().__init__()
+        super().__init__(config)
         del self.multi_modal_projector  # Replaced by Gemma3nVisionEmbedder
         self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input
         self.audio_tower = AutoModel.from_config(config.audio_config)
4 changes: 2 additions & 2 deletions src/transformers/models/glm4_moe/modular_glm4_moe.py
@@ -255,7 +255,7 @@ def __init__(
         )
 
 
-class Glm4MoeAttention(CohereAttention, nn.Module):
+class Glm4MoeAttention(CohereAttention):
     def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None):
         nn.Module.__init__(self)
         self.config = config
@@ -287,7 +287,7 @@ class Glm4MoeMLP(DeepseekV3MLP):
     pass
 
 
-class Glm4MoeTopkRouter(DeepseekV3TopkRouter, nn.Module):
+class Glm4MoeTopkRouter(DeepseekV3TopkRouter):
     def __init__(self, config: Glm4MoeConfig):
         nn.Module.__init__(self)
         self.config = config
4 changes: 2 additions & 2 deletions src/transformers/models/glm4v/modular_glm4v.py
@@ -507,15 +507,15 @@ def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torc
 
 class Glm4vVisionAttention(Qwen2_5_VLVisionAttention):
     def __init__(self, config: Glm4vVisionConfig) -> None:
-        super().__init__()
+        super().__init__(config)
         self.attention_dropout = config.attention_dropout
         self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
         self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
 
 
 class Glm4vVisionBlock(Qwen2_5_VLVisionBlock):
     def __init__(self, config) -> None:
-        super().__init__()
+        super().__init__(config)
         self.norm1 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.norm2 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.attn = Glm4vVisionAttention(config)
2 changes: 1 addition & 1 deletion src/transformers/models/got_ocr2/modular_got_ocr2.py
@@ -240,7 +240,7 @@ class GotOcr2VisionAttention(SamVisionAttention):
 
 class GotOcr2VisionLayer(SamVisionLayer):
     def __init__(self, config, window_size):
-        super().__init__()
+        super().__init__(config, window_size)
         self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.attn = GotOcr2VisionAttention(config, window_size)
         self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
2 changes: 1 addition & 1 deletion src/transformers/models/helium/modular_helium.py
@@ -104,7 +104,7 @@ def __init__(self, config: HeliumConfig, layer_idx: Optional[int] = None):
 
 class HeliumDecoderLayer(LlamaDecoderLayer):
     def __init__(self, config: HeliumConfig, layer_idx: Optional[int] = None):
-        super().__init__()
+        super().__init__(config, layer_idx)
 
         self.mlp = HeliumMLP(config)
         self.input_layernorm = HeliumRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -114,7 +114,7 @@ def forward(
 
 class HunYuanDenseV1DecoderLayer(LlamaDecoderLayer):
     def __init__(self, config: HunYuanDenseV1Config, layer_idx: int):
-        super().__init__()
+        super().__init__(config, layer_idx)
         self.layer_idx = layer_idx
 
 
@@ -187,7 +187,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):
     def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
-        super().__init__()
+        super().__init__(config, layer_idx)
         self.hidden_size = config.hidden_size
         self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx)
         self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx)
2 changes: 1 addition & 1 deletion src/transformers/models/internvl/modular_internvl.py
@@ -79,7 +79,7 @@ class InternVLVisionRMSNorm(LlamaRMSNorm):
 
 class InternVLVisionAttention(JanusVisionAttention):
     def __init__(self, config: InternVLVisionConfig):
-        super().__init__()
+        super().__init__(config)
         del self.num_key_value_groups
 
         # Needed for flash attention
2 changes: 1 addition & 1 deletion src/transformers/models/janus/modular_janus.py
@@ -536,7 +536,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 class JanusVisionEncoderLayer(SiglipEncoderLayer):
     def __init__(self, config: JanusVisionConfig):
-        super().__init__()
+        super().__init__(config)
         self.config = config
         self.embed_dim = config.hidden_size
         self.self_attn = JanusVisionAttention(config)
2 changes: 1 addition & 1 deletion src/transformers/models/mistral/modular_mistral.py
@@ -44,7 +44,7 @@ def __init__(self, config):
 
 class MistralAttention(LlamaAttention):
     def __init__(self, config: MistralConfig, layer_idx: int):
-        super().__init__()
+        super().__init__(config, layer_idx)
         self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
         self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
         self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
@@ -337,7 +337,7 @@ class MMGroundingDinoDecoder(GroundingDinoDecoder):
 
 class MMGroundingDinoModel(GroundingDinoModel, MMGroundingDinoPreTrainedModel):
     def __init__(self, config: MMGroundingDinoConfig):
-        MMGroundingDinoPreTrainedModel.__init__(config)
+        MMGroundingDinoPreTrainedModel.__init__(self, config)
 
         # Create backbone + positional encoding
         backbone = MMGroundingDinoConvEncoder(config)
@@ -400,7 +400,7 @@ class MMGroundingDinoForObjectDetection(GroundingDinoForObjectDetection, MMGroun
     ]
 
     def __init__(self, config: MMGroundingDinoConfig):
-        MMGroundingDinoPreTrainedModel.__init__(config)
+        MMGroundingDinoPreTrainedModel.__init__(self, config)
 
         self.model = MMGroundingDinoModel(config)
 
4 changes: 2 additions & 2 deletions src/transformers/models/ovis2/modular_ovis2.py
@@ -60,7 +60,7 @@ class Ovis2VisionMLP(LlamaMLP):
 
 class Ovis2VisionEmbeddings(SiglipVisionEmbeddings):
     def __init__(self, config: Ovis2VisionConfig):
-        super().__init__()
+        super().__init__(config)
         self.rms_norm = Ovis2RMSNorm(config.hidden_size, config.rms_norm_eps)
 
     def interpolate_pos_encoding(self):
@@ -87,7 +87,7 @@ class Ovis2VisionEncoderLayer(Aimv2EncoderLayer):
 
 class Ovis2VisionEncoder(SiglipEncoder):
     def __init__(self, config: Ovis2VisionConfig):
-        super().__init__()
+        super().__init__(config)
         self.layers = nn.ModuleList([Ovis2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
 
 
@@ -528,7 +528,7 @@ def __init__(self, config: Phi4MultimodalVisionConfig):
 
 class Phi4MultimodalVisionEncoder(SiglipEncoder):
     def __init__(self, config: Phi4MultimodalVisionConfig):
-        super().__init__()
+        super().__init__(config)
         self.layers = nn.ModuleList(
             [Phi4MultimodalVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]
         )
@@ -582,7 +582,7 @@ def _init_weights(self, module):
             module.weight.data.fill_(1.0)
 
 
-class Phi4MultimodalVisionEmbeddings(SiglipVisionEmbeddings, nn.Module):
+class Phi4MultimodalVisionEmbeddings(SiglipVisionEmbeddings):
     def __init__(self, config: Phi4MultimodalVisionConfig):
         nn.Module.__init__(self)
         self.config = config
@@ -1455,7 +1455,7 @@ def _init_weights(self, module):
             module.sub_img_feature_extensor.data.zero_()
 
 
-class Phi4MultimodalModel(Phi3Model, nn.Module):
+class Phi4MultimodalModel(Phi3Model):
     def __init__(self, config: Phi4MultimodalConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
@@ -1570,7 +1570,7 @@ def forward(
         )
 
 
-class Phi4MultimodalForCausalLM(Phi3ForCausalLM, nn.Module):
+class Phi4MultimodalForCausalLM(Phi3ForCausalLM):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
@@ -2061,7 +2061,7 @@ def __init__(self, config: Qwen2_5OmniThinkerConfig, device=None):
 
 # It's same as `Qwen2_5_VLAttention`, but talker model's hidden_size isn't divisible by num_heads.
 # Removes the value error as a workaround.
-class Qwen2_5OmniAttention(Qwen2_5_VLAttention, nn.Module):
+class Qwen2_5OmniAttention(Qwen2_5_VLAttention):
     def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None):
         nn.Module.__init__(self)
         self.config = config