Skip to content

Commit

Permalink
Feature to convert videomae huge and small finetuned on kinetics and …
Browse files Browse the repository at this point in the history
…ssv2 added to the videomae to pytorch converter (huggingface#22788)

* Add conversion support for VideoMAE huge (fine-tuned on Kinetics) and VideoMAE small (fine-tuned on Kinetics and SSv2) to the VideoMAE-to-PyTorch converter

* Reformat convert_videomae_to_pytorch using black

* Raise a ValueError when the model name does not match any of the supported VideoMAE architectures
  • Loading branch information
sandstorm12 authored and novice03 committed Jun 23, 2023
1 parent b7255e2 commit 9c26457
Showing 1 changed file with 46 additions and 10 deletions.
56 changes: 46 additions & 10 deletions src/transformers/models/videomae/convert_videomae_to_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,7 @@
def get_videomae_config(model_name):
config = VideoMAEConfig()

if "large" in model_name:
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16
config.decoder_num_hidden_layers = 12
config.decoder_num_attention_heads = 8
config.decoder_hidden_size = 512
config.decoder_intermediate_size = 2048
set_architecture_configs(model_name, config)

if "finetuned" not in model_name:
config.use_mean_pooling = False
Expand All @@ -64,6 +56,38 @@ def get_videomae_config(model_name):
return config


def set_architecture_configs(model_name, config):
    """Apply size-specific architecture hyperparameters to `config` in place.

    The variant is selected by substring match on `model_name`, tested in the
    order "small", "large", "huge"; the first tag found wins. A name that
    contains "base" (and none of the tags above) leaves `config` untouched.

    Args:
        model_name: Checkpoint name, e.g. "videomae-small-finetuned-kinetics".
        config: Configuration object whose attributes are overwritten.

    Raises:
        ValueError: If `model_name` contains none of "small", "base",
            "large", or "huge".
    """
    # Ordered (tag, overrides) pairs; order matters because the first tag
    # found in the name decides which preset is applied.
    presets = (
        (
            "small",
            {
                "hidden_size": 384,
                "intermediate_size": 1536,
                "num_hidden_layers": 12,
                "num_attention_heads": 16,
                "decoder_num_hidden_layers": 12,
                "decoder_num_attention_heads": 3,
                "decoder_hidden_size": 192,
                "decoder_intermediate_size": 768,
            },
        ),
        (
            "large",
            {
                "hidden_size": 1024,
                "intermediate_size": 4096,
                "num_hidden_layers": 24,
                "num_attention_heads": 16,
                "decoder_num_hidden_layers": 12,
                "decoder_num_attention_heads": 8,
                "decoder_hidden_size": 512,
                "decoder_intermediate_size": 2048,
            },
        ),
        (
            "huge",
            {
                "hidden_size": 1280,
                "intermediate_size": 5120,
                "num_hidden_layers": 32,
                "num_attention_heads": 16,
                "decoder_num_hidden_layers": 12,
                "decoder_num_attention_heads": 8,
                "decoder_hidden_size": 640,
                "decoder_intermediate_size": 2560,
            },
        ),
    )

    for size_tag, overrides in presets:
        if size_tag in model_name:
            for attr_name, attr_value in overrides.items():
                setattr(config, attr_name, attr_value)
            return

    # No preset matched: only a "base" name is acceptable, and it gets no overrides.
    if "base" not in model_name:
        raise ValueError('Model name should include either "small", "base", "large", or "huge"')


def rename_key(name):
if "encoder." in name:
name = name.replace("encoder.", "")
Expand Down Expand Up @@ -186,13 +210,16 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
logits = outputs.logits

model_names = [
"videomae-small-finetuned-kinetics",
"videomae-small-finetuned-ssv2",
# Kinetics-400 checkpoints (short = pretrained only for 800 epochs instead of 1600)
"videomae-base-short",
"videomae-base-short-finetuned-kinetics",
"videomae-base",
"videomae-base-finetuned-kinetics",
"videomae-large",
"videomae-large-finetuned-kinetics",
"videomae-huge-finetuned-kinetics",
# Something-Something-v2 checkpoints (short = pretrained only for 800 epochs instead of 2400)
"videomae-base-short-ssv2",
"videomae-base-short-finetuned-ssv2",
Expand All @@ -201,7 +228,13 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
]

# NOTE: logits were tested with image_mean and image_std equal to [0.5, 0.5, 0.5] and [0.5, 0.5, 0.5]
if model_name == "videomae-base":
if model_name == "videomae-small-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([-0.9291, -0.4061, -0.9307])
elif model_name == "videomae-small-finetuned-ssv2":
expected_shape = torch.Size([1, 174])
expected_slice = torch.tensor([0.2671, -0.4689, -0.8235])
elif model_name == "videomae-base":
expected_shape = torch.Size([1, 1408, 1536])
expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089], [0.6701, 0.7487, 0.6209], [0.4287, 0.5158, 0.4773]])
elif model_name == "videomae-base-short":
Expand All @@ -215,6 +248,9 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_
elif model_name == "videomae-large-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([0.0771, 0.0011, -0.3625])
elif model_name == "videomae-huge-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([0.2433, 0.1632, -0.4894])
elif model_name == "videomae-base-short-finetuned-kinetics":
expected_shape = torch.Size([1, 400])
expected_slice = torch.tensor([0.6588, 0.0990, -0.2493])
Expand Down

0 comments on commit 9c26457

Please sign in to comment.