@@ -97,39 +97,39 @@ def __init__(
9797 projector_config = {
9898 "attention_probs_dropout_prob" : 0.1 ,
9999 "cross_attention_frequency" : 1 ,
100- "downsample_rate" : 5 ,
101100 "encoder_hidden_size" : 32 ,
102101 "hidden_act" : "gelu" ,
103102 "hidden_dropout_prob" : 0.1 ,
104103 "hidden_size" : 32 ,
105104 "initializer_range" : 0.02 ,
106105 "intermediate_size" : 256 ,
107106 "layer_norm_eps" : 1e-12 ,
108- "llm_dim" : 32 ,
109107 "max_position_embeddings" : 2048 ,
110- "model_type" : "granite_speech_qformer " ,
108+ "model_type" : "blip_2_qformer " ,
111109 "num_attention_heads" : 4 ,
112110 "num_hidden_layers" : 2 ,
113111 "position_embedding_type" : "absolute" ,
114112 "use_qformer_text_input" : False ,
115113 "vocab_size" : 30522 ,
116- "window_size" : 15 ,
117114 },
118115 audio_token_index = 0 ,
119116 tie_word_embeddings = True ,
120117 initializer_range = 0.02 ,
121118 has_lora_adapter = True ,
119+ downsample_rate = 5 ,
120+ window_size = 15 ,
122121 is_training = True ,
123122 ):
124123 self .parent = parent
125- self .projector_config = None
126124 self .encoder_config = encoder_config
127125 self .text_config = text_config
128126 self .projector_config = projector_config
129127 self .audio_token_index = audio_token_index
130128 self .tie_word_embeddings = tie_word_embeddings
131129 self .initializer_range = initializer_range
132130 self .has_lora_adapater = has_lora_adapter
131+ self .downsample_rate = downsample_rate
132+ self .window_size = window_size
133133 self .is_training = is_training
134134
135135 # Dims for audio features
0 commit comments