diff --git a/configs/datasets/high-quality-fall_runner_k400-hyperparams.py b/configs/datasets/high-quality-fall_runner_k400-hyperparams.py
new file mode 100644
index 0000000..29bddfd
--- /dev/null
+++ b/configs/datasets/high-quality-fall_runner_k400-hyperparams.py
@@ -0,0 +1,113 @@
+"""Base `Runner` config for high-quality-fall dataset."""
+
+dataset_type = "HighQualityFallDataset" # Dataset `type` shared by the train/val/test dataloaders below
+
+label_strategy = dict( # Labeling config passed unchanged to all three datasets below
+ type="PriorityLabel",
+ label_description=dict(
+ names=["fall", "lying", "other"], # Three classes; matches num_classes=3 in the datasets below
+ start_timestamp_names=["fall_start", "lying_start"], # presumably annotation column names - TODO confirm
+ end_timestamp_names=["fall_end", "lying_end"], # presumably annotation column names - TODO confirm
+ visible_names=["fall_visible", "lying_visible"], # presumably annotation column names - TODO confirm
+ other_class=2, # Index of "other" in `names`
+ ),
+)
+
+sampling_strategy = dict(type="UniformSampling", clip_len=10) # Default clip sampling for all splits; clip_len unit not visible here - TODO confirm
+
+
+# TRAIN
+ann_file_train = "data/Fall_Simulation_Data/annotations_train.csv"
+
+# TODO: Add shape comments
+# TODO: Think about augmentation steps
+train_pipeline = [
+ dict(type="DecordInit"),
+ dict(type="ClipVideo"), # Project transform; presumably restricts the video to the sampled clip - TODO confirm
+ dict(type="SampleFrames", clip_len=16, frame_interval=4, num_clips=1), # 16 frames, every 4th frame, one clip per sample
+ dict(type="DecordDecode"),
+ dict(type="Resize", scale=(-1, 224)),
+ dict(type="RandomCrop", size=224),
+ dict(type="Resize", scale=(224, 224), keep_ratio=False), # NOTE(review): looks redundant after the 224 crop - confirm
+ dict(type="Flip", flip_ratio=0.5), # Flip applied with probability 0.5
+ dict(type="FormatShape", input_format="NCTHW"), # Same layout as data_preprocessor format_shape in the model config
+ dict(type="PackActionInputs"),
+]
+
+train_dataloader = dict(
+ batch_size=12, # From VideoMAEv2 repo
+ num_workers=8,
+ persistent_workers=False,
+ sampler=dict(type="DefaultSampler", shuffle=True), # Only the train split shuffles
+ dataset=dict(
+ type=dataset_type,
+ sampling_strategy=sampling_strategy,
+ label_strategy=label_strategy,
+ ann_file=ann_file_train,
+ pipeline=train_pipeline,
+ num_classes=3, # Matches the three names in label_strategy
+ # indices=100,
+ ),
+)
+
+# VALIDATION
+ann_file_val = "data/Fall_Simulation_Data/annotations_val.csv"
+
+val_pipeline = [ # Same as train_pipeline minus augmentation: test_mode sampling, center crop, no flip
+ dict(type="DecordInit"),
+ dict(type="ClipVideo"), # Project transform; presumably restricts the video to the sampled clip - TODO confirm
+ dict(
+ type="SampleFrames", clip_len=16, frame_interval=4, num_clips=1, test_mode=True
+ ),
+ dict(type="DecordDecode"),
+ dict(type="Resize", scale=(-1, 224)),
+ dict(type="CenterCrop", crop_size=224), # From VideoMAEv2 repo
+ dict(type="FormatShape", input_format="NCTHW"),
+ dict(type="PackActionInputs"),
+]
+
+# val_dataloader = train_dataloader
+val_dataloader = dict(
+ batch_size=12, # From VideoMAEv2 repo
+ num_workers=8,
+ persistent_workers=False,
+ sampler=dict(type="DefaultSampler", shuffle=False), # Fixed order for evaluation
+ dataset=dict(
+ type=dataset_type,
+ sampling_strategy=sampling_strategy,
+ label_strategy=label_strategy,
+ ann_file=ann_file_val,
+ pipeline=val_pipeline,
+ num_classes=3, # Matches the three names in label_strategy
+ ),
+)
+
+# TEST
+ann_file_test = "data/Fall_Simulation_Data/annotations_test.csv"
+
+test_pipeline = [ # 5 clips x ThreeCrop per sample
+ dict(type="DecordInit"), # NOTE(review): no ClipVideo step after this, unlike train/val pipelines - confirm intended
+ dict(
+ type="SampleFrames", clip_len=16, frame_interval=4, num_clips=5, test_mode=True
+ ), # From VideoMAEv2 repo
+ dict(type="DecordDecode"),
+ dict(type="Resize", scale=(-1, 224)),
+ dict(type="ThreeCrop", crop_size=224), # From VideoMAEv2 repo
+ dict(type="FormatShape", input_format="NCTHW"),
+ dict(type="PackActionInputs"),
+]
+
+test_dataloader = dict(
+ batch_size=1, # From VideoMAEv2 repo
+ num_workers=8,
+ persistent_workers=False,
+ sampler=dict(type="DefaultSampler", shuffle=False), # Fixed order for evaluation
+ dataset=dict(
+ type=dataset_type,
+ sampling_strategy=sampling_strategy,
+ label_strategy=label_strategy,
+ ann_file=ann_file_test,
+ pipeline=test_pipeline,
+ num_classes=3, # Matches the three names in label_strategy
+ ),
+)
diff --git a/configs/experiments/overfitting_run.py b/configs/experiments/overfitting_run.py
index 06c3afc..06800f4 100644
--- a/configs/experiments/overfitting_run.py
+++ b/configs/experiments/overfitting_run.py
@@ -1,5 +1,5 @@
_base_ = [
- "../models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py"
+ "../models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_k400-hyperparams.py"
]
EXPERIMENT_NAME = "overfitting_run"
@@ -10,6 +10,7 @@
# Overrides
train_dataloader = dict(
+ batch_size=1,
sampler=dict(type="DefaultSampler", shuffle=False),
dataset=dict(
indices=100,
@@ -19,6 +20,9 @@
ann_file_val = "data/Fall_Simulation_Data/annotations_train.csv"
val_dataloader = dict(
+ num_workers=0,
+ persistent_workers=False,
+ batch_size=1,
dataset=dict(
ann_file=ann_file_val,
indices=100,
@@ -26,4 +30,4 @@
)
default_hooks = dict(checkpoint=dict(interval=0))
-custom_hooks = [dict(type="CustomVisualizationHook", enable=True, interval=10)]
+custom_hooks = [dict(type="CustomVisualizationHook", enable=True, interval=1)]
diff --git a/configs/experiments/vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py b/configs/experiments/vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py
new file mode 100644
index 0000000..a1bf91a
--- /dev/null
+++ b/configs/experiments/vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py
@@ -0,0 +1,78 @@
+_base_ = [
+ "../models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_k400-hyperparams.py"
+]
+
+EXPERIMENT_NAME = "vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams"
+visualizer = dict(
+ vis_backends=dict(save_dir=f"experiments/tensorboard/{EXPERIMENT_NAME}/")
+)
+work_dir = f"experiments/{EXPERIMENT_NAME}"
+
+# Overrides
+default_hooks = dict(checkpoint=dict(interval=1))
+
+# 1487 samples in val -> 92 batches per node -> We want around 10 images
+custom_hooks = [dict(type="CustomVisualizationHook", enable=True, interval=300)]
+
+# Use ViT-B/16
+model = dict(
+ backbone=dict(embed_dims=768, depth=12, num_heads=12),
+ cls_head=dict(in_channels=768),
+)
+load_from = "weights/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth"
+
+# Use frame_interval 8
+train_pipeline = [
+ dict(type="DecordInit"),
+ dict(type="ClipVideo"),
+ dict(
+ type="SampleFrames", clip_len=16, frame_interval=8, num_clips=1
+ ), # This has changed
+ dict(type="DecordDecode"),
+ dict(type="Resize", scale=(-1, 224)),
+ dict(type="RandomCrop", size=224),
+ dict(type="Resize", scale=(224, 224), keep_ratio=False),
+ dict(type="Flip", flip_ratio=0.5),
+ dict(type="FormatShape", input_format="NCTHW"),
+ dict(type="PackActionInputs"),
+]
+
+
+# Use Gaussian sampling
+train_dataloader = dict(
+ dataset=dict(
+ sampling_strategy=dict(
+ type="GaussianSampling",
+ clip_len=5,
+ fallback_sampler=dict(
+ type="UniformSampling", clip_len=5, stride=5, overlap=False
+ ),
+ ),
+ drop_ratios=[0.0, 0.0, 0.30],
+ pipeline=train_pipeline,
+ )
+)
+# Val/test keep uniform (label-free) sampling: Gaussian sampling requires labels and
+# validation would be invalid with them; val only gets clip_len=5 and frame_interval=8
+
+val_pipeline = [
+ dict(type="DecordInit"),
+ dict(type="ClipVideo"),
+ dict(
+ type="SampleFrames", clip_len=16, frame_interval=8, num_clips=1, test_mode=True
+ ),
+ dict(type="DecordDecode"),
+ dict(type="Resize", scale=(-1, 224)),
+ dict(type="CenterCrop", crop_size=224), # From VideoMAEv2 repo
+ dict(type="FormatShape", input_format="NCTHW"),
+ dict(type="PackActionInputs"),
+]
+
+val_dataloader = dict( # Partial override of the base val dataloader: sampling strategy and pipeline only
+ dataset=dict(
+ sampling_strategy=dict( # NOTE(review): stride=0 here, but the train fallback sampler uses stride=5 - confirm intended
+ type="UniformSampling", clip_len=5, stride=0, overlap=False
+ ),
+ pipeline=val_pipeline, # frame_interval=8 variant defined above
+ ),
+)
diff --git a/configs/experiments/vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py b/configs/experiments/vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py
new file mode 100644
index 0000000..b1d81d7
--- /dev/null
+++ b/configs/experiments/vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py
@@ -0,0 +1,47 @@
+_base_ = [
+ "../models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_k400-hyperparams.py"
+]
+
+EXPERIMENT_NAME = (
+ "vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams"
+)
+visualizer = dict(
+ vis_backends=dict(save_dir=f"experiments/tensorboard/{EXPERIMENT_NAME}/")
+)
+work_dir = f"experiments/{EXPERIMENT_NAME}"
+
+# Overrides
+default_hooks = dict(checkpoint=dict(interval=1))
+
+# 1487 samples in val -> 92 batches per node -> We want around 10 images
+custom_hooks = [dict(type="CustomVisualizationHook", enable=True, interval=300)]
+
+# Use ViT-B/16
+model = dict(
+ backbone=dict(embed_dims=768, depth=12, num_heads=12),
+ cls_head=dict(in_channels=768),
+)
+load_from = "weights/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth"
+
+# Use Gaussian sampling
+train_dataloader = dict(
+ dataset=dict(
+ sampling_strategy=dict(
+ type="GaussianSampling",
+ clip_len=5,
+ fallback_sampler=dict(
+ type="UniformSampling", clip_len=5, stride=5, overlap=False
+ ),
+ ),
+ drop_ratios=[0.0, 0.0, 0.30],
+ )
+)
+# Val/test keep uniform (label-free) sampling: Gaussian sampling requires labels and
+# validation would be invalid with them; val only switches to clip_len=5 clips
+val_dataloader = dict( # Partial override of the base val dataloader: sampling strategy only
+ dataset=dict(
+ sampling_strategy=dict( # NOTE(review): stride=0 here, but the train fallback sampler uses stride=5 - confirm intended
+ type="UniformSampling", clip_len=5, stride=0, overlap=False
+ ),
+ ),
+)
diff --git a/configs/experiments/vit-b_gaussian-sampling_priority-labeling_k400-hyperparams.py b/configs/experiments/vit-b_gaussian-sampling_priority-labeling_k400-hyperparams.py
new file mode 100644
index 0000000..3cc96a5
--- /dev/null
+++ b/configs/experiments/vit-b_gaussian-sampling_priority-labeling_k400-hyperparams.py
@@ -0,0 +1,29 @@
+_base_ = [
+ "../models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_k400-hyperparams.py"
+]
+
+EXPERIMENT_NAME = "vit-b_gaussian-sampling_priority-labeling_k400-hyperparams"
+visualizer = dict(
+ vis_backends=dict(save_dir=f"experiments/tensorboard/{EXPERIMENT_NAME}/")
+)
+work_dir = f"experiments/{EXPERIMENT_NAME}"
+
+# Overrides
+default_hooks = dict(checkpoint=dict(interval=1))
+
+# 1487 samples in val -> 92 batches per node -> We want around 10 images
+custom_hooks = [dict(type="CustomVisualizationHook", enable=True, interval=150)]
+
+# Use ViT-B/16
+model = dict(
+ backbone=dict(embed_dims=768, depth=12, num_heads=12),
+ cls_head=dict(in_channels=768),
+)
+load_from = "weights/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth"
+
+# Use Gaussian sampling
+train_dataloader = dict(
+ dataset=dict(sampling_strategy=dict(type="GaussianSampling", clip_len=10))
+)
+# We are not changing the val/test dataloaders since gaussian sampling requires labels
+# and we cannot have a valid validation if we use labels in the preprocessing
diff --git a/configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams.py b/configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams.py
new file mode 100644
index 0000000..03352ac
--- /dev/null
+++ b/configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams.py
@@ -0,0 +1,30 @@
+_base_ = [
+ "../models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py"
+]
+
+EXPERIMENT_NAME = "vit-b_gaussian-sampling_priority-labeling_paper-hyperparams"
+visualizer = dict(
+ vis_backends=dict(save_dir=f"experiments/tensorboard/{EXPERIMENT_NAME}/")
+)
+work_dir = f"experiments/{EXPERIMENT_NAME}"
+
+# Overrides
+default_hooks = dict(checkpoint=dict(interval=3))
+
+# 1487 samples in val -> 372 per node -> 124 batches per node -> We want around 10 images
+# -> Interval = 124 / 10 ~ 12 (configured as 10 below)
+custom_hooks = [dict(type="CustomVisualizationHook", enable=True, interval=10)]
+
+# Use ViT-B/16
+model = dict(
+ backbone=dict(embed_dims=768, depth=12, num_heads=12),
+ cls_head=dict(in_channels=768),
+)
+load_from = "weights/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth"
+
+# Use Gaussian sampling
+train_dataloader = dict(
+ dataset=dict(sampling_strategy=dict(type="GaussianSampling", clip_len=10))
+)
+# We are not changing the val/test dataloaders since gaussian sampling requires labels
+# and we cannot have a valid validation if we use labels in the preprocessing
diff --git a/configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss.py b/configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss.py
new file mode 100644
index 0000000..e892cf2
--- /dev/null
+++ b/configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss.py
@@ -0,0 +1,40 @@
+_base_ = [
+ "../models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py"
+]
+
+EXPERIMENT_NAME = (
+ "vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss"
+)
+visualizer = dict(
+ vis_backends=dict(save_dir=f"experiments/tensorboard/{EXPERIMENT_NAME}/")
+)
+work_dir = f"experiments/{EXPERIMENT_NAME}"
+
+# Overrides
+default_hooks = dict(checkpoint=dict(interval=3))
+
+# 1487 samples in val -> 372 per node -> 124 batches per node -> We want around 10 images
+# -> Interval = 124 / 10 ~ 12 (configured as 10 below)
+custom_hooks = [dict(type="CustomVisualizationHook", enable=True, interval=10)]
+
+# Use ViT-B/16
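+# Add weighted CE loss; stated formula: weight_i = total_samples / (num_samples_in_class_i * num_classes)
+# NOTE(review): 1/weight sums to 1/3 below, i.e. values are num_classes * total / n_i (9x the formula, same ratios) - confirm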
+model = dict(
+ backbone=dict(embed_dims=768, depth=12, num_heads=12),
+ cls_head=dict(
+ in_channels=768,
+ loss_cls=dict(
+ type="CrossEntropyLoss",
+ class_weight=[26.38235294117647, 37.901408450704224, 3.7168508287292816],
+ ),
+ ),
+)
+load_from = "weights/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth"
+
+# Use Gaussian sampling
+train_dataloader = dict(
+ dataset=dict(sampling_strategy=dict(type="GaussianSampling", clip_len=10))
+)
+# We are not changing the val/test dataloaders since gaussian sampling requires labels
+# and we cannot have a valid validation if we use labels in the preprocessing
diff --git a/configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_k400-hyperparams.py b/configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_k400-hyperparams.py
new file mode 100644
index 0000000..cbcf715
--- /dev/null
+++ b/configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_k400-hyperparams.py
@@ -0,0 +1,416 @@
+_base_ = [
+ "../default_runtime.py",
+ "../datasets/high-quality-fall_runner_k400-hyperparams.py",
+]
+
+# Finetuning parameters are from VideoMAEv2 repo
+# https://github.com/OpenGVLab/VideoMAEv2/blob/master/scripts/finetune/vit_b_k400_ft.sh
+
+
+# ViT-S-P16 (experiment configs override embed_dims/num_heads/in_channels to get ViT-B)
+model = dict(
+ type="Recognizer3D",
+ backbone=dict(
+ type="VisionTransformer",
+ img_size=224, # Matches the 224 crops in the dataset pipelines
+ patch_size=16,
+ embed_dims=384, # Must equal cls_head.in_channels
+ depth=12, # The layer-decay custom keys below cover blocks 0-11
+ num_heads=6,
+ mlp_ratio=4,
+ qkv_bias=True,
+ num_frames=16, # Matches SampleFrames clip_len=16 in the dataset pipelines
+ norm_cfg=dict(type="LN", eps=1e-6),
+ drop_path_rate=0.3, # From VideoMAEv2 repo
+ ),
+ cls_head=dict(
+ type="TimeSformerHead",
+ num_classes=3, # Matches num_classes=3 in the dataset config
+ in_channels=384, # Must equal backbone.embed_dims
+ average_clips="prob",
+ topk=(1,),
+ ),
+ data_preprocessor=dict(
+ type="ActionDataPreprocessor",
+ mean=[102.17311096191406, 98.78225708007812, 92.68714141845703], # presumably dataset-specific per-channel stats (0-255 scale) - TODO confirm
+ std=[58.04566192626953, 57.004024505615234, 57.3704948425293], # presumably dataset-specific per-channel stats (0-255 scale) - TODO confirm
+ format_shape="NCTHW", # Same layout as FormatShape in the pipelines
+ ),
+)
+
+# Loading weights
+load_from = "weights/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth"
+
+# TRAINING CONFIG
+train_cfg = dict(type="EpochBasedTrainLoop", max_epochs=90, val_interval=1)
+
+# TODO: Think about fine-tuning param scheduler
+param_scheduler = [
+ dict(
+ type="LinearLR",
+ by_epoch=True,
+ convert_to_iter_based=True,
+ start_factor=1e-3,
+ end_factor=1,
+ begin=0,
+ end=5,
+ ), # From VideoMAEv2 repo - Warmup
+ dict(
+ type="CosineAnnealingLR",
+ by_epoch=True,
+ convert_to_iter_based=True,
+ eta_min=1e-6,
+ begin=5,
+ end=35,
+ ),
+]
+
+auto_scale_lr = dict(enable=True, base_batch_size=256)
+
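+# Layer Decay (lr_mult = 0.75 ** k: patch_embed k=13, block i k=12-i, fc_norm/cls_head k=0) and Weight Decay (decay_mult=0 for biases and norms, 1 for weights) module configs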
+vit_b_layer_decay_75_custom_keys = {
+ "backbone.patch_embed.projection.weight": {
+ "lr_mult": 0.023757264018058777,
+ "decay_mult": 1,
+ },
+ "backbone.patch_embed.projection.bias": {
+ "lr_mult": 0.023757264018058777,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.0.norm1.weight": {"lr_mult": 0.03167635202407837, "decay_mult": 0},
+ "backbone.blocks.0.norm1.bias": {"lr_mult": 0.03167635202407837, "decay_mult": 0},
+ "backbone.blocks.0.attn.q_bias": {"lr_mult": 0.03167635202407837, "decay_mult": 0},
+ "backbone.blocks.0.attn.v_bias": {"lr_mult": 0.03167635202407837, "decay_mult": 0},
+ "backbone.blocks.0.attn.proj.bias": {
+ "lr_mult": 0.03167635202407837,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.0.norm2.weight": {"lr_mult": 0.03167635202407837, "decay_mult": 0},
+ "backbone.blocks.0.norm2.bias": {"lr_mult": 0.03167635202407837, "decay_mult": 0},
+ "backbone.blocks.0.mlp.layers.0.0.bias": {
+ "lr_mult": 0.03167635202407837,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.0.mlp.layers.1.bias": {
+ "lr_mult": 0.03167635202407837,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.0.attn.qkv.weight": {
+ "lr_mult": 0.03167635202407837,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.0.attn.proj.weight": {
+ "lr_mult": 0.03167635202407837,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.0.mlp.layers.0.0.weight": {
+ "lr_mult": 0.03167635202407837,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.0.mlp.layers.1.weight": {
+ "lr_mult": 0.03167635202407837,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.1.norm1.weight": {"lr_mult": 0.04223513603210449, "decay_mult": 0},
+ "backbone.blocks.1.norm1.bias": {"lr_mult": 0.04223513603210449, "decay_mult": 0},
+ "backbone.blocks.1.attn.q_bias": {"lr_mult": 0.04223513603210449, "decay_mult": 0},
+ "backbone.blocks.1.attn.v_bias": {"lr_mult": 0.04223513603210449, "decay_mult": 0},
+ "backbone.blocks.1.attn.proj.bias": {
+ "lr_mult": 0.04223513603210449,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.1.norm2.weight": {"lr_mult": 0.04223513603210449, "decay_mult": 0},
+ "backbone.blocks.1.norm2.bias": {"lr_mult": 0.04223513603210449, "decay_mult": 0},
+ "backbone.blocks.1.mlp.layers.0.0.bias": {
+ "lr_mult": 0.04223513603210449,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.1.mlp.layers.1.bias": {
+ "lr_mult": 0.04223513603210449,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.1.attn.qkv.weight": {
+ "lr_mult": 0.04223513603210449,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.1.attn.proj.weight": {
+ "lr_mult": 0.04223513603210449,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.1.mlp.layers.0.0.weight": {
+ "lr_mult": 0.04223513603210449,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.1.mlp.layers.1.weight": {
+ "lr_mult": 0.04223513603210449,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.2.norm1.weight": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.2.norm1.bias": {"lr_mult": 0.056313514709472656, "decay_mult": 0},
+ "backbone.blocks.2.attn.q_bias": {"lr_mult": 0.056313514709472656, "decay_mult": 0},
+ "backbone.blocks.2.attn.v_bias": {"lr_mult": 0.056313514709472656, "decay_mult": 0},
+ "backbone.blocks.2.attn.proj.bias": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.2.norm2.weight": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.2.norm2.bias": {"lr_mult": 0.056313514709472656, "decay_mult": 0},
+ "backbone.blocks.2.mlp.layers.0.0.bias": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.2.mlp.layers.1.bias": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.2.attn.qkv.weight": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.2.attn.proj.weight": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.2.mlp.layers.0.0.weight": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.2.mlp.layers.1.weight": {
+ "lr_mult": 0.056313514709472656,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.3.norm1.weight": {"lr_mult": 0.07508468627929688, "decay_mult": 0},
+ "backbone.blocks.3.norm1.bias": {"lr_mult": 0.07508468627929688, "decay_mult": 0},
+ "backbone.blocks.3.attn.q_bias": {"lr_mult": 0.07508468627929688, "decay_mult": 0},
+ "backbone.blocks.3.attn.v_bias": {"lr_mult": 0.07508468627929688, "decay_mult": 0},
+ "backbone.blocks.3.attn.proj.bias": {
+ "lr_mult": 0.07508468627929688,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.3.norm2.weight": {"lr_mult": 0.07508468627929688, "decay_mult": 0},
+ "backbone.blocks.3.norm2.bias": {"lr_mult": 0.07508468627929688, "decay_mult": 0},
+ "backbone.blocks.3.mlp.layers.0.0.bias": {
+ "lr_mult": 0.07508468627929688,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.3.mlp.layers.1.bias": {
+ "lr_mult": 0.07508468627929688,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.3.attn.qkv.weight": {
+ "lr_mult": 0.07508468627929688,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.3.attn.proj.weight": {
+ "lr_mult": 0.07508468627929688,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.3.mlp.layers.0.0.weight": {
+ "lr_mult": 0.07508468627929688,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.3.mlp.layers.1.weight": {
+ "lr_mult": 0.07508468627929688,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.4.norm1.weight": {"lr_mult": 0.1001129150390625, "decay_mult": 0},
+ "backbone.blocks.4.norm1.bias": {"lr_mult": 0.1001129150390625, "decay_mult": 0},
+ "backbone.blocks.4.attn.q_bias": {"lr_mult": 0.1001129150390625, "decay_mult": 0},
+ "backbone.blocks.4.attn.v_bias": {"lr_mult": 0.1001129150390625, "decay_mult": 0},
+ "backbone.blocks.4.attn.proj.bias": {
+ "lr_mult": 0.1001129150390625,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.4.norm2.weight": {"lr_mult": 0.1001129150390625, "decay_mult": 0},
+ "backbone.blocks.4.norm2.bias": {"lr_mult": 0.1001129150390625, "decay_mult": 0},
+ "backbone.blocks.4.mlp.layers.0.0.bias": {
+ "lr_mult": 0.1001129150390625,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.4.mlp.layers.1.bias": {
+ "lr_mult": 0.1001129150390625,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.4.attn.qkv.weight": {
+ "lr_mult": 0.1001129150390625,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.4.attn.proj.weight": {
+ "lr_mult": 0.1001129150390625,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.4.mlp.layers.0.0.weight": {
+ "lr_mult": 0.1001129150390625,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.4.mlp.layers.1.weight": {
+ "lr_mult": 0.1001129150390625,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.5.norm1.weight": {"lr_mult": 0.13348388671875, "decay_mult": 0},
+ "backbone.blocks.5.norm1.bias": {"lr_mult": 0.13348388671875, "decay_mult": 0},
+ "backbone.blocks.5.attn.q_bias": {"lr_mult": 0.13348388671875, "decay_mult": 0},
+ "backbone.blocks.5.attn.v_bias": {"lr_mult": 0.13348388671875, "decay_mult": 0},
+ "backbone.blocks.5.attn.proj.bias": {"lr_mult": 0.13348388671875, "decay_mult": 0},
+ "backbone.blocks.5.norm2.weight": {"lr_mult": 0.13348388671875, "decay_mult": 0},
+ "backbone.blocks.5.norm2.bias": {"lr_mult": 0.13348388671875, "decay_mult": 0},
+ "backbone.blocks.5.mlp.layers.0.0.bias": {
+ "lr_mult": 0.13348388671875,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.5.mlp.layers.1.bias": {
+ "lr_mult": 0.13348388671875,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.5.attn.qkv.weight": {"lr_mult": 0.13348388671875, "decay_mult": 1},
+ "backbone.blocks.5.attn.proj.weight": {
+ "lr_mult": 0.13348388671875,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.5.mlp.layers.0.0.weight": {
+ "lr_mult": 0.13348388671875,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.5.mlp.layers.1.weight": {
+ "lr_mult": 0.13348388671875,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.6.norm1.weight": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.norm1.bias": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.attn.q_bias": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.attn.v_bias": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.attn.proj.bias": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.norm2.weight": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.norm2.bias": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.mlp.layers.0.0.bias": {
+ "lr_mult": 0.177978515625,
+ "decay_mult": 0,
+ },
+ "backbone.blocks.6.mlp.layers.1.bias": {"lr_mult": 0.177978515625, "decay_mult": 0},
+ "backbone.blocks.6.attn.qkv.weight": {"lr_mult": 0.177978515625, "decay_mult": 1},
+ "backbone.blocks.6.attn.proj.weight": {"lr_mult": 0.177978515625, "decay_mult": 1},
+ "backbone.blocks.6.mlp.layers.0.0.weight": {
+ "lr_mult": 0.177978515625,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.6.mlp.layers.1.weight": {
+ "lr_mult": 0.177978515625,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.7.norm1.weight": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.norm1.bias": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.attn.q_bias": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.attn.v_bias": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.attn.proj.bias": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.norm2.weight": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.norm2.bias": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.mlp.layers.0.0.bias": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.mlp.layers.1.bias": {"lr_mult": 0.2373046875, "decay_mult": 0},
+ "backbone.blocks.7.attn.qkv.weight": {"lr_mult": 0.2373046875, "decay_mult": 1},
+ "backbone.blocks.7.attn.proj.weight": {"lr_mult": 0.2373046875, "decay_mult": 1},
+ "backbone.blocks.7.mlp.layers.0.0.weight": {
+ "lr_mult": 0.2373046875,
+ "decay_mult": 1,
+ },
+ "backbone.blocks.7.mlp.layers.1.weight": {"lr_mult": 0.2373046875, "decay_mult": 1},
+ "backbone.blocks.8.norm1.weight": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.norm1.bias": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.attn.q_bias": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.attn.v_bias": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.attn.proj.bias": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.norm2.weight": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.norm2.bias": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.mlp.layers.0.0.bias": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.mlp.layers.1.bias": {"lr_mult": 0.31640625, "decay_mult": 0},
+ "backbone.blocks.8.attn.qkv.weight": {"lr_mult": 0.31640625, "decay_mult": 1},
+ "backbone.blocks.8.attn.proj.weight": {"lr_mult": 0.31640625, "decay_mult": 1},
+ "backbone.blocks.8.mlp.layers.0.0.weight": {"lr_mult": 0.31640625, "decay_mult": 1},
+ "backbone.blocks.8.mlp.layers.1.weight": {"lr_mult": 0.31640625, "decay_mult": 1},
+ "backbone.blocks.9.norm1.weight": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.norm1.bias": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.attn.q_bias": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.attn.v_bias": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.attn.proj.bias": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.norm2.weight": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.norm2.bias": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.mlp.layers.0.0.bias": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.mlp.layers.1.bias": {"lr_mult": 0.421875, "decay_mult": 0},
+ "backbone.blocks.9.attn.qkv.weight": {"lr_mult": 0.421875, "decay_mult": 1},
+ "backbone.blocks.9.attn.proj.weight": {"lr_mult": 0.421875, "decay_mult": 1},
+ "backbone.blocks.9.mlp.layers.0.0.weight": {"lr_mult": 0.421875, "decay_mult": 1},
+ "backbone.blocks.9.mlp.layers.1.weight": {"lr_mult": 0.421875, "decay_mult": 1},
+ "backbone.blocks.10.norm1.weight": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.norm1.bias": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.attn.q_bias": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.attn.v_bias": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.attn.proj.bias": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.norm2.weight": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.norm2.bias": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.mlp.layers.0.0.bias": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.mlp.layers.1.bias": {"lr_mult": 0.5625, "decay_mult": 0},
+ "backbone.blocks.10.attn.qkv.weight": {"lr_mult": 0.5625, "decay_mult": 1},
+ "backbone.blocks.10.attn.proj.weight": {"lr_mult": 0.5625, "decay_mult": 1},
+ "backbone.blocks.10.mlp.layers.0.0.weight": {"lr_mult": 0.5625, "decay_mult": 1},
+ "backbone.blocks.10.mlp.layers.1.weight": {"lr_mult": 0.5625, "decay_mult": 1},
+ "backbone.blocks.11.norm1.weight": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.norm1.bias": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.attn.q_bias": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.attn.v_bias": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.attn.proj.bias": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.norm2.weight": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.norm2.bias": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.mlp.layers.0.0.bias": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.mlp.layers.1.bias": {"lr_mult": 0.75, "decay_mult": 0},
+ "backbone.blocks.11.attn.qkv.weight": {"lr_mult": 0.75, "decay_mult": 1},
+ "backbone.blocks.11.attn.proj.weight": {"lr_mult": 0.75, "decay_mult": 1},
+ "backbone.blocks.11.mlp.layers.0.0.weight": {"lr_mult": 0.75, "decay_mult": 1},
+ "backbone.blocks.11.mlp.layers.1.weight": {"lr_mult": 0.75, "decay_mult": 1},
+ "backbone.fc_norm.weight": {"lr_mult": 1.0, "decay_mult": 0},
+ "backbone.fc_norm.bias": {"lr_mult": 1.0, "decay_mult": 0},
+ "cls_head.fc_cls.bias": {"lr_mult": 1.0, "decay_mult": 0},
+ "cls_head.fc_cls.weight": {"lr_mult": 1.0, "decay_mult": 1},
+}
+
+
+optim_wrapper = dict(
+ type="AmpOptimWrapper", # Automatic Mixed Precision may speed up training
+ optimizer=dict(
+ type="AdamW", # From VideoMAEv2 repo
+ lr=7e-4, # From VideoMAEv2 repo; auto_scale_lr above is enabled with base_batch_size=256
+ weight_decay=0.05, # From VideoMAEv2 repo
+ betas=(0.9, 0.999), # From VideoMAEv2 repo
+ ),
+ paramwise_cfg=dict(custom_keys=vit_b_layer_decay_75_custom_keys), # Per-parameter lr_mult/decay_mult; named vit_b but defined in this ViT-S config with keys for blocks 0-11
+ # clip_grad=dict(max_norm=5, norm_type=2), # From VideoMAEv2 repo
+)
+
+# VALIDATION CONFIG
+val_evaluator = dict(
+ type="AddAccMetric",
+ metric_list=(
+ "unweighted_average_f1",
+ "per_class_f1",
+ "per_class_precision",
+ "per_class_recall",
+ ),
+)
+val_cfg = dict(type="ValLoop")
+
+
+# TEST CONFIG
+test_evaluator = dict(
+ type="AddAccMetric",
+ metric_list=(
+ "unweighted_average_f1",
+ "per_class_f1",
+ "per_class_precision",
+ "per_class_recall",
+ ),
+)
+test_cfg = dict(type="TestLoop")
diff --git a/experiments.dvc b/experiments.dvc
index 98f6b2a..ff1b824 100644
--- a/experiments.dvc
+++ b/experiments.dvc
@@ -1,6 +1,6 @@
outs:
-- md5: 004d25dfcdbec8b9a95e429079227b93.dir
- size: 1022832432
- nfiles: 9
+- md5: 55075530cd6a7d51a35547b6eebafda0.dir
+ size: 21652483819
+ nfiles: 97
hash: md5
path: experiments
diff --git a/job_scripts/vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.sh b/job_scripts/vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.sh
new file mode 100644
index 0000000..07c096e
--- /dev/null
+++ b/job_scripts/vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+#SBATCH -A NAISS2023-22-1160 -p alvis
+#SBATCH -N 1 --gpus-per-node=A40:1
+#SBATCH --time=48:00:00
+
+apptainer exec \
+ --env PYTHONPATH=$(pwd) \
+ containers/c3se_job_container.sif \
+ python mmaction2/tools/train.py \
+ configs/experiments/vit-b_frame-int-8_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py
\ No newline at end of file
diff --git a/job_scripts/vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.sh b/job_scripts/vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.sh
new file mode 100644
index 0000000..0fafc47
--- /dev/null
+++ b/job_scripts/vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+#SBATCH -A NAISS2023-22-1160 -p alvis
+#SBATCH -N 1 --gpus-per-node=A40:1
+#SBATCH --time=48:00:00
+# Train inside the job container; cwd is exported as PYTHONPATH (quoted so paths with spaces survive).
+apptainer exec \
+    --env "PYTHONPATH=$(pwd)" \
+    containers/c3se_job_container.sif \
+    python mmaction2/tools/train.py \
+    configs/experiments/vit-b_gaussian-sampling-5s-clips-30-drop_priority-labeling_k400-hyperparams.py
\ No newline at end of file
diff --git a/job_scripts/vit-b_gaussian-sampling_priority-labeling_k400-hyperparams.sh b/job_scripts/vit-b_gaussian-sampling_priority-labeling_k400-hyperparams.sh
new file mode 100644
index 0000000..fc47fdd
--- /dev/null
+++ b/job_scripts/vit-b_gaussian-sampling_priority-labeling_k400-hyperparams.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+#SBATCH -A NAISS2023-22-1160 -p alvis
+#SBATCH -N 1 --gpus-per-node=A40:1
+#SBATCH --time=24:00:00
+# Train inside the job container with --resume auto; cwd is exported as PYTHONPATH (quoted so paths with spaces survive).
+apptainer exec \
+    --env "PYTHONPATH=$(pwd)" \
+    containers/c3se_job_container.sif \
+    python mmaction2/tools/train.py \
+    configs/experiments/vit-b_gaussian-sampling_priority-labeling_k400-hyperparams.py \
+    --resume auto
\ No newline at end of file
diff --git a/job_scripts/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams.sh b/job_scripts/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams.sh
new file mode 100644
index 0000000..ce03cdb
--- /dev/null
+++ b/job_scripts/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#SBATCH -A NAISS2023-22-1160 -p alvis
+#SBATCH -N 1 --gpus-per-node=A40:4
+#SBATCH --time=24:00:00
+# 4-process distributed training inside the job container; cwd is exported as PYTHONPATH (quoted). NOTE(review): torch.distributed.launch is deprecated upstream in favour of torchrun.
+apptainer exec \
+    --env "PYTHONPATH=$(pwd)" \
+    containers/c3se_job_container.sif \
+    python -m torch.distributed.launch --nproc_per_node=4 \
+    mmaction2/tools/train.py \
+    configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams.py \
+    --launcher pytorch --resume auto
\ No newline at end of file
diff --git a/job_scripts/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss.sh b/job_scripts/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss.sh
new file mode 100644
index 0000000..0603a02
--- /dev/null
+++ b/job_scripts/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#SBATCH -A NAISS2023-22-1160 -p alvis
+#SBATCH -N 1 --gpus-per-node=A40:4
+#SBATCH --time=24:00:00
+# 4-process distributed training inside the job container; cwd is exported as PYTHONPATH (quoted). NOTE(review): torch.distributed.launch is deprecated upstream in favour of torchrun.
+apptainer exec \
+    --env "PYTHONPATH=$(pwd)" \
+    containers/c3se_job_container.sif \
+    python -m torch.distributed.launch --nproc_per_node=4 \
+    mmaction2/tools/train.py \
+    configs/experiments/vit-b_gaussian-sampling_priority-labeling_paper-hyperparams_weighted-ce-loss.py \
+    --launcher pytorch
\ No newline at end of file
diff --git a/notebooks/custom_keys_optimizer.ipynb b/notebooks/custom_keys_optimizer.ipynb
new file mode 100644
index 0000000..31557cb
--- /dev/null
+++ b/notebooks/custom_keys_optimizer.ipynb
@@ -0,0 +1,938 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Custom Keys Optimizer\n",
+ "\n",
+ "Here we create the custom keys dictionary for the runner config.\n",
+    "It is needed to apply layer-wise learning-rate decay through per-parameter `lr_mult` values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "LAYER_DECAY = 0.75  # base of the per-layer scale LAYER_DECAY ** (MODEL_DEPTH + 1 - i)\n",
+    "MODEL_DEPTH = 12  # same value as backbone depth=12 in the loaded model config\n",
+    "BASE_WEIGHT_DECAY = 0.05  # passed as weight_decay to get_parameter_groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "12/04 22:32:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n",
+ "------------------------------------------------------------\n",
+ "System environment:\n",
+ " sys.platform: darwin\n",
+ " Python: 3.10.13 | packaged by conda-forge | (main, Oct 26 2023, 18:09:17) [Clang 16.0.6 ]\n",
+ " CUDA available: False\n",
+ " numpy_random_seed: 104644062\n",
+ " GCC: Apple clang version 15.0.0 (clang-1500.0.40.1)\n",
+ " PyTorch: 2.1.1\n",
+ " PyTorch compiling details: PyTorch built with:\n",
+ " - GCC 4.2\n",
+ " - C++ Version: 201703\n",
+ " - clang 13.1.6\n",
+ " - LAPACK is enabled (usually provided by MKL)\n",
+ " - NNPACK is enabled\n",
+ " - CPU capability usage: NO AVX\n",
+ " - Build settings: BLAS_INFO=accelerate, BUILD_TYPE=Release, CXX_COMPILER=/Applications/Xcode_13.3.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_PYTORCH_METAL_EXPORT -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DUSE_COREML_DELEGATE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wvla-extension -Wnewline-eof -Winconsistent-missing-override -Winconsistent-missing-destructor-override -Wno-range-loop-analysis -Wno-pass-failed -Wsuggest-override -Wno-error=pedantic -Wno-error=old-style-cast -Wno-error=inconsistent-missing-override -Wno-error=inconsistent-missing-destructor-override -Wconstant-conversion -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-missing-braces -Wunused-lambda-capture -Qunused-arguments -fcolor-diagnostics -faligned-new -Wno-unused-but-set-variable -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -DUSE_MPS -Wno-unused-private-field -Wno-missing-braces, LAPACK_INFO=accelerate, TORCH_DISABLE_GPU_ASSERTS=OFF, TORCH_VERSION=2.1.1, USE_CUDA=0, USE_CUDNN=OFF, USE_EIGEN_FOR_BLAS=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=OFF, USE_MKLDNN=OFF, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=ON, USE_OPENMP=OFF, USE_ROCM=OFF, \n",
+ "\n",
+ " TorchVision: 0.16.1\n",
+ " OpenCV: 4.8.1\n",
+ " MMEngine: 0.10.1\n",
+ "\n",
+ "Runtime environment:\n",
+ " cudnn_benchmark: False\n",
+ " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n",
+ " dist_cfg: {'backend': 'nccl'}\n",
+ " seed: 104644062\n",
+ " Distributed launcher: none\n",
+ " Distributed training: False\n",
+ " GPU number: 1\n",
+ "------------------------------------------------------------\n",
+ "\n",
+ "12/04 22:32:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n",
+ "ann_file_test = 'data/Fall_Simulation_Data/annotations_test.csv'\n",
+ "ann_file_train = 'data/Fall_Simulation_Data/annotations_train.csv'\n",
+ "ann_file_val = 'data/Fall_Simulation_Data/annotations_val.csv'\n",
+ "custom_hooks = [\n",
+ " dict(enable=True, type='CustomVisualizationHook'),\n",
+ "]\n",
+ "custom_imports = dict(\n",
+ " allow_failed_imports=False,\n",
+ " imports=[\n",
+ " 'datasets',\n",
+ " 'evaluation',\n",
+ " 'visualization',\n",
+ " ])\n",
+ "dataset_type = 'HighQualityFallDataset'\n",
+ "default_hooks = dict(\n",
+ " checkpoint=dict(\n",
+ " by_epoch=True,\n",
+ " interval=3,\n",
+ " max_keep_ckpts=3,\n",
+ " save_best='auto',\n",
+ " type='CheckpointHook'),\n",
+ " logger=dict(type='LoggerHook'),\n",
+ " param_scheduler=dict(type='ParamSchedulerHook'),\n",
+ " runtime_info=dict(type='RuntimeInfoHook'),\n",
+ " sampler_seed=dict(type='DistSamplerSeedHook'),\n",
+ " sync_buffers=dict(type='SyncBuffersHook'),\n",
+ " timer=dict(type='IterTimerHook'))\n",
+ "default_scope = 'mmaction'\n",
+ "env_cfg = dict(\n",
+ " cudnn_benchmark=False,\n",
+ " dist_cfg=dict(backend='nccl'),\n",
+ " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))\n",
+ "label_strategy = dict(\n",
+ " label_description=dict(\n",
+ " end_timestamp_names=[\n",
+ " 'fall_end',\n",
+ " 'lying_end',\n",
+ " ],\n",
+ " names=[\n",
+ " 'fall',\n",
+ " 'lying',\n",
+ " 'other',\n",
+ " ],\n",
+ " other_class=2,\n",
+ " start_timestamp_names=[\n",
+ " 'fall_start',\n",
+ " 'lying_start',\n",
+ " ],\n",
+ " visible_names=[\n",
+ " 'fall_visible',\n",
+ " 'lying_visible',\n",
+ " ]),\n",
+ " type='PriorityLabel')\n",
+ "launcher = 'none'\n",
+ "load_from = 'weights/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth'\n",
+ "log_level = 'INFO'\n",
+ "log_processor = dict(by_epoch=True, type='LogProcessor', window_size=10)\n",
+ "model = dict(\n",
+ " backbone=dict(\n",
+ " depth=12,\n",
+ " drop_path_rate=0.3,\n",
+ " embed_dims=384,\n",
+ " img_size=224,\n",
+ " mlp_ratio=4,\n",
+ " norm_cfg=dict(eps=1e-06, type='LN'),\n",
+ " num_frames=16,\n",
+ " num_heads=6,\n",
+ " patch_size=16,\n",
+ " qkv_bias=True,\n",
+ " type='VisionTransformer'),\n",
+ " cls_head=dict(\n",
+ " average_clips='prob',\n",
+ " in_channels=384,\n",
+ " num_classes=3,\n",
+ " topk=(1, ),\n",
+ " type='TimeSformerHead'),\n",
+ " data_preprocessor=dict(\n",
+ " format_shape='NCTHW',\n",
+ " mean=[\n",
+ " 102.17311096191406,\n",
+ " 98.78225708007812,\n",
+ " 92.68714141845703,\n",
+ " ],\n",
+ " std=[\n",
+ " 58.04566192626953,\n",
+ " 57.004024505615234,\n",
+ " 57.3704948425293,\n",
+ " ],\n",
+ " type='ActionDataPreprocessor'),\n",
+ " type='Recognizer3D')\n",
+ "optim_wrapper = dict(\n",
+ " clip_grad=dict(max_norm=5, norm_type=2),\n",
+ " optimizer=dict(\n",
+ " betas=(\n",
+ " 0.9,\n",
+ " 0.999,\n",
+ " ), lr=0.001, type='AdamW', weight_decay=0.1),\n",
+ " type='AmpOptimWrapper')\n",
+ "param_scheduler = [\n",
+ " dict(\n",
+ " begin=0,\n",
+ " by_epoch=True,\n",
+ " convert_to_iter_based=True,\n",
+ " end=5,\n",
+ " end_factor=1,\n",
+ " start_factor=0.001,\n",
+ " type='LinearLR'),\n",
+ " dict(\n",
+ " begin=5,\n",
+ " by_epoch=True,\n",
+ " convert_to_iter_based=True,\n",
+ " end=35,\n",
+ " eta_min=1e-06,\n",
+ " type='CosineAnnealingLR'),\n",
+ "]\n",
+ "resume = False\n",
+ "sampling_strategy = dict(clip_len=10, type='UniformSampling')\n",
+ "test_cfg = dict(type='TestLoop')\n",
+ "test_dataloader = dict(\n",
+ " batch_size=3,\n",
+ " dataset=dict(\n",
+ " ann_file='data/Fall_Simulation_Data/annotations_test.csv',\n",
+ " label_strategy=dict(\n",
+ " label_description=dict(\n",
+ " end_timestamp_names=[\n",
+ " 'fall_end',\n",
+ " 'lying_end',\n",
+ " ],\n",
+ " names=[\n",
+ " 'fall',\n",
+ " 'lying',\n",
+ " 'other',\n",
+ " ],\n",
+ " other_class=2,\n",
+ " start_timestamp_names=[\n",
+ " 'fall_start',\n",
+ " 'lying_start',\n",
+ " ],\n",
+ " visible_names=[\n",
+ " 'fall_visible',\n",
+ " 'lying_visible',\n",
+ " ]),\n",
+ " type='PriorityLabel'),\n",
+ " num_classes=3,\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit'),\n",
+ " dict(\n",
+ " clip_len=16,\n",
+ " frame_interval=4,\n",
+ " num_clips=5,\n",
+ " test_mode=True,\n",
+ " type='SampleFrames'),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(scale=(\n",
+ " -1,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(crop_size=224, type='ThreeCrop'),\n",
+ " dict(input_format='NCTHW', type='FormatShape'),\n",
+ " dict(type='PackActionInputs'),\n",
+ " ],\n",
+ " sampling_strategy=dict(clip_len=10, type='UniformSampling'),\n",
+ " type='HighQualityFallDataset'),\n",
+ " num_workers=8,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(shuffle=False, type='DefaultSampler'))\n",
+ "test_evaluator = dict(\n",
+ " metric_list=(\n",
+ " 'unweighted_average_f1',\n",
+ " 'per_class_f1',\n",
+ " 'per_class_precision',\n",
+ " 'per_class_recall',\n",
+ " ),\n",
+ " type='AddAccMetric')\n",
+ "test_pipeline = [\n",
+ " dict(type='DecordInit'),\n",
+ " dict(\n",
+ " clip_len=16,\n",
+ " frame_interval=4,\n",
+ " num_clips=5,\n",
+ " test_mode=True,\n",
+ " type='SampleFrames'),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(scale=(\n",
+ " -1,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(crop_size=224, type='ThreeCrop'),\n",
+ " dict(input_format='NCTHW', type='FormatShape'),\n",
+ " dict(type='PackActionInputs'),\n",
+ "]\n",
+ "train_cfg = dict(max_epochs=35, type='EpochBasedTrainLoop', val_interval=1)\n",
+ "train_dataloader = dict(\n",
+ " batch_size=3,\n",
+ " dataset=dict(\n",
+ " ann_file='data/Fall_Simulation_Data/annotations_train.csv',\n",
+ " label_strategy=dict(\n",
+ " label_description=dict(\n",
+ " end_timestamp_names=[\n",
+ " 'fall_end',\n",
+ " 'lying_end',\n",
+ " ],\n",
+ " names=[\n",
+ " 'fall',\n",
+ " 'lying',\n",
+ " 'other',\n",
+ " ],\n",
+ " other_class=2,\n",
+ " start_timestamp_names=[\n",
+ " 'fall_start',\n",
+ " 'lying_start',\n",
+ " ],\n",
+ " visible_names=[\n",
+ " 'fall_visible',\n",
+ " 'lying_visible',\n",
+ " ]),\n",
+ " type='PriorityLabel'),\n",
+ " num_classes=3,\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit'),\n",
+ " dict(type='ClipVideo'),\n",
+ " dict(\n",
+ " clip_len=16,\n",
+ " frame_interval=4,\n",
+ " num_clips=1,\n",
+ " type='SampleFrames'),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(scale=(\n",
+ " -1,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(size=224, type='RandomCrop'),\n",
+ " dict(keep_ratio=False, scale=(\n",
+ " 224,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(flip_ratio=0.5, type='Flip'),\n",
+ " dict(input_format='NCTHW', type='FormatShape'),\n",
+ " dict(type='PackActionInputs'),\n",
+ " ],\n",
+ " sampling_strategy=dict(clip_len=10, type='UniformSampling'),\n",
+ " type='HighQualityFallDataset'),\n",
+ " num_workers=8,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(shuffle=True, type='DefaultSampler'))\n",
+ "train_pipeline = [\n",
+ " dict(type='DecordInit'),\n",
+ " dict(type='ClipVideo'),\n",
+ " dict(clip_len=16, frame_interval=4, num_clips=1, type='SampleFrames'),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(scale=(\n",
+ " -1,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(size=224, type='RandomCrop'),\n",
+ " dict(keep_ratio=False, scale=(\n",
+ " 224,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(flip_ratio=0.5, type='Flip'),\n",
+ " dict(input_format='NCTHW', type='FormatShape'),\n",
+ " dict(type='PackActionInputs'),\n",
+ "]\n",
+ "val_cfg = dict(type='ValLoop')\n",
+ "val_dataloader = dict(\n",
+ " batch_size=3,\n",
+ " dataset=dict(\n",
+ " ann_file='data/Fall_Simulation_Data/annotations_val.csv',\n",
+ " label_strategy=dict(\n",
+ " label_description=dict(\n",
+ " end_timestamp_names=[\n",
+ " 'fall_end',\n",
+ " 'lying_end',\n",
+ " ],\n",
+ " names=[\n",
+ " 'fall',\n",
+ " 'lying',\n",
+ " 'other',\n",
+ " ],\n",
+ " other_class=2,\n",
+ " start_timestamp_names=[\n",
+ " 'fall_start',\n",
+ " 'lying_start',\n",
+ " ],\n",
+ " visible_names=[\n",
+ " 'fall_visible',\n",
+ " 'lying_visible',\n",
+ " ]),\n",
+ " type='PriorityLabel'),\n",
+ " num_classes=3,\n",
+ " pipeline=[\n",
+ " dict(type='DecordInit'),\n",
+ " dict(type='ClipVideo'),\n",
+ " dict(\n",
+ " clip_len=16,\n",
+ " frame_interval=4,\n",
+ " num_clips=1,\n",
+ " test_mode=True,\n",
+ " type='SampleFrames'),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(scale=(\n",
+ " -1,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(crop_size=224, type='CenterCrop'),\n",
+ " dict(input_format='NCTHW', type='FormatShape'),\n",
+ " dict(type='PackActionInputs'),\n",
+ " ],\n",
+ " sampling_strategy=dict(clip_len=10, type='UniformSampling'),\n",
+ " type='HighQualityFallDataset'),\n",
+ " num_workers=8,\n",
+ " persistent_workers=True,\n",
+ " sampler=dict(shuffle=False, type='DefaultSampler'))\n",
+ "val_evaluator = dict(\n",
+ " metric_list=(\n",
+ " 'unweighted_average_f1',\n",
+ " 'per_class_f1',\n",
+ " 'per_class_precision',\n",
+ " 'per_class_recall',\n",
+ " ),\n",
+ " type='AddAccMetric')\n",
+ "val_pipeline = [\n",
+ " dict(type='DecordInit'),\n",
+ " dict(type='ClipVideo'),\n",
+ " dict(\n",
+ " clip_len=16,\n",
+ " frame_interval=4,\n",
+ " num_clips=1,\n",
+ " test_mode=True,\n",
+ " type='SampleFrames'),\n",
+ " dict(type='DecordDecode'),\n",
+ " dict(scale=(\n",
+ " -1,\n",
+ " 224,\n",
+ " ), type='Resize'),\n",
+ " dict(crop_size=224, type='CenterCrop'),\n",
+ " dict(input_format='NCTHW', type='FormatShape'),\n",
+ " dict(type='PackActionInputs'),\n",
+ "]\n",
+ "vis_backends = dict(\n",
+ " save_dir='experiments/tensorboard', type='TensorboardVisBackend')\n",
+ "visualizer = dict(\n",
+ " type='ActionVisualizer',\n",
+ " vis_backends=dict(\n",
+ " save_dir='experiments/tensorboard', type='TensorboardVisBackend'))\n",
+ "work_dir = 'experiments'\n",
+ "\n",
+ "12/04 22:32:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n",
+ "12/04 22:32:34 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n",
+ "before_run:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "before_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_train_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) DistSamplerSeedHook \n",
+ " -------------------- \n",
+ "before_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_train_iter:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_train_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) SyncBuffersHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_val:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ " -------------------- \n",
+ "before_val_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) SyncBuffersHook \n",
+ " -------------------- \n",
+ "before_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_val_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) CustomVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_val_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ "(LOW ) ParamSchedulerHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "after_val:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ " -------------------- \n",
+ "after_train:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(VERY_LOW ) CheckpointHook \n",
+ " -------------------- \n",
+ "before_test:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ " -------------------- \n",
+ "before_test_epoch:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "before_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ " -------------------- \n",
+ "after_test_iter:\n",
+ "(NORMAL ) IterTimerHook \n",
+ "(NORMAL ) CustomVisualizationHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test_epoch:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ "(NORMAL ) IterTimerHook \n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n",
+ "after_test:\n",
+ "(VERY_HIGH ) RuntimeInfoHook \n",
+ " -------------------- \n",
+ "after_run:\n",
+ "(BELOW_NORMAL) LoggerHook \n",
+ " -------------------- \n"
+ ]
+ }
+ ],
+ "source": [
+    "from mmengine.runner import Runner\n",
+    "from mmengine.config import Config\n",
+    "\n",
+    "runner_cfg = Config.fromfile(\n",
+    "    \"configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py\"\n",
+    ")\n",
+    "runner = Runner.from_cfg(runner_cfg)  # emits the environment/config log captured in this cell's output\n",
+    "model = runner.model  # the model is what get_parameter_groups later walks via named_parameters()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Functions from the VideoMAE repo\n",
+ "\n",
+ "\n",
+    "def get_num_layer_for_vit(var_name, num_max_layer):\n",
+    "    \"\"\"Return the layer index used to pick the decay scale for var_name.\"\"\"\n",
+    "    embedding_names = (\n",
+    "        \"backbone.cls_token\",\n",
+    "        \"backbone.mask_token\",\n",
+    "        \"backbone.pos_embed\",\n",
+    "    )\n",
+    "    if var_name in embedding_names or var_name.startswith(\"backbone.patch_embed\"):\n",
+    "        return 0\n",
+    "    if var_name.startswith(\"backbone.blocks\"):\n",
+    "        return int(var_name.split(\".\")[2]) + 1  # block k maps to index k + 1\n",
+    "    return num_max_layer - 1  # rel_pos_bias and every other name\n",
+ "\n",
+ "\n",
+    "class LayerDecayValueAssigner(object):  # wraps a sequence of scale values indexed by layer id\n",
+    "    def __init__(self, values):\n",
+    "        self.values = values\n",
+    "\n",
+    "    def get_scale(self, layer_id):\n",
+    "        return self.values[layer_id]  # plain index lookup into values\n",
+    "\n",
+    "    def get_layer_id(self, var_name):\n",
+    "        return get_num_layer_for_vit(var_name, len(self.values))  # len(values) is num_max_layer\n",
+ "\n",
+ "\n",
+    "def get_parameter_groups(\n",
+    "    model, weight_decay=1e-5, skip_list=(), get_num_layer=None, get_layer_scale=None\n",
+    "):\n",
+    "    \"\"\"Group trainable parameter names by weight-decay policy and layer.\n",
+    "\n",
+    "    Parameters that are 1-D, end in '.bias'/'.scale', or appear in skip_list\n",
+    "    get weight_decay 0.0 ('no_decay'); all others get weight_decay ('decay').\n",
+    "    get_num_layer splits groups per layer; get_layer_scale sets 'lr_scale'.\n",
+    "\n",
+    "    Returns a dict: group name -> {'weight_decay', 'params' (names), 'lr_scale'}.\n",
+    "    \"\"\"\n",
+    "    parameter_group_names = {}\n",
+    "\n",
+    "    for name, param in model.named_parameters():\n",
+    "        if not param.requires_grad:\n",
+    "            continue  # frozen weights\n",
+    "        if (\n",
+    "            len(param.shape) == 1\n",
+    "            or name.endswith((\".bias\", \".scale\"))\n",
+    "            or name in skip_list\n",
+    "        ):\n",
+    "            group_name = \"no_decay\"\n",
+    "            this_weight_decay = 0.0\n",
+    "        else:\n",
+    "            group_name = \"decay\"\n",
+    "            this_weight_decay = weight_decay\n",
+    "        if get_num_layer is not None:\n",
+    "            layer_id = get_num_layer(name)\n",
+    "            group_name = \"layer_%d_%s\" % (layer_id, group_name)\n",
+    "        else:\n",
+    "            layer_id = None\n",
+    "\n",
+    "        if group_name not in parameter_group_names:\n",
+    "            if get_layer_scale is not None:\n",
+    "                scale = get_layer_scale(layer_id)\n",
+    "            else:\n",
+    "                scale = 1.0\n",
+    "\n",
+    "            parameter_group_names[group_name] = {\n",
+    "                \"weight_decay\": this_weight_decay,\n",
+    "                \"params\": [],\n",
+    "                \"lr_scale\": scale,\n",
+    "            }\n",
+    "\n",
+    "        parameter_group_names[group_name][\"params\"].append(name)\n",
+    "\n",
+    "    return parameter_group_names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Derive the VideoMAE-style parameter groups (scales run from LAYER_DECAY ** (MODEL_DEPTH + 1) up to 1.0)\n",
+    "\n",
+    "assigner = LayerDecayValueAssigner(\n",
+    "    [LAYER_DECAY ** (MODEL_DEPTH + 1 - i) for i in range(MODEL_DEPTH + 2)]\n",
+    ")\n",
+    "\n",
+    "groups = get_parameter_groups(\n",
+    "    model,\n",
+    "    weight_decay=BASE_WEIGHT_DECAY,\n",
+    "    get_num_layer=assigner.get_layer_id,\n",
+    "    get_layer_scale=assigner.get_scale,\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'backbone.patch_embed.projection.weight': {'lr_mult': 0.023757264018058777,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.patch_embed.projection.bias': {'lr_mult': 0.023757264018058777,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.norm1.weight': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.norm1.bias': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.attn.q_bias': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.attn.v_bias': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.attn.proj.bias': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.norm2.weight': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.norm2.bias': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.mlp.layers.0.0.bias': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.mlp.layers.1.bias': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.0.attn.qkv.weight': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.0.attn.proj.weight': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.0.mlp.layers.0.0.weight': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.0.mlp.layers.1.weight': {'lr_mult': 0.03167635202407837,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.1.norm1.weight': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.norm1.bias': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.attn.q_bias': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.attn.v_bias': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.attn.proj.bias': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.norm2.weight': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.norm2.bias': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.mlp.layers.0.0.bias': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.mlp.layers.1.bias': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.1.attn.qkv.weight': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.1.attn.proj.weight': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.1.mlp.layers.0.0.weight': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.1.mlp.layers.1.weight': {'lr_mult': 0.04223513603210449,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.2.norm1.weight': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.norm1.bias': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.attn.q_bias': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.attn.v_bias': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.attn.proj.bias': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.norm2.weight': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.norm2.bias': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.mlp.layers.0.0.bias': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.mlp.layers.1.bias': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.2.attn.qkv.weight': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.2.attn.proj.weight': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.2.mlp.layers.0.0.weight': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.2.mlp.layers.1.weight': {'lr_mult': 0.056313514709472656,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.3.norm1.weight': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.norm1.bias': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.attn.q_bias': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.attn.v_bias': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.attn.proj.bias': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.norm2.weight': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.norm2.bias': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.mlp.layers.0.0.bias': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.mlp.layers.1.bias': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.3.attn.qkv.weight': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.3.attn.proj.weight': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.3.mlp.layers.0.0.weight': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.3.mlp.layers.1.weight': {'lr_mult': 0.07508468627929688,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.4.norm1.weight': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.norm1.bias': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.attn.q_bias': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.attn.v_bias': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.attn.proj.bias': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.norm2.weight': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.norm2.bias': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.mlp.layers.0.0.bias': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.mlp.layers.1.bias': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.4.attn.qkv.weight': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.4.attn.proj.weight': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.4.mlp.layers.0.0.weight': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.4.mlp.layers.1.weight': {'lr_mult': 0.1001129150390625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.5.norm1.weight': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.norm1.bias': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.attn.q_bias': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.attn.v_bias': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.attn.proj.bias': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.norm2.weight': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.norm2.bias': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.mlp.layers.0.0.bias': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.mlp.layers.1.bias': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.5.attn.qkv.weight': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.5.attn.proj.weight': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.5.mlp.layers.0.0.weight': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.5.mlp.layers.1.weight': {'lr_mult': 0.13348388671875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.6.norm1.weight': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.norm1.bias': {'lr_mult': 0.177978515625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.attn.q_bias': {'lr_mult': 0.177978515625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.attn.v_bias': {'lr_mult': 0.177978515625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.attn.proj.bias': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.norm2.weight': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.norm2.bias': {'lr_mult': 0.177978515625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.mlp.layers.0.0.bias': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.mlp.layers.1.bias': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.6.attn.qkv.weight': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.6.attn.proj.weight': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.6.mlp.layers.0.0.weight': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.6.mlp.layers.1.weight': {'lr_mult': 0.177978515625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.7.norm1.weight': {'lr_mult': 0.2373046875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.norm1.bias': {'lr_mult': 0.2373046875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.attn.q_bias': {'lr_mult': 0.2373046875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.attn.v_bias': {'lr_mult': 0.2373046875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.attn.proj.bias': {'lr_mult': 0.2373046875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.norm2.weight': {'lr_mult': 0.2373046875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.norm2.bias': {'lr_mult': 0.2373046875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.mlp.layers.0.0.bias': {'lr_mult': 0.2373046875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.mlp.layers.1.bias': {'lr_mult': 0.2373046875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.7.attn.qkv.weight': {'lr_mult': 0.2373046875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.7.attn.proj.weight': {'lr_mult': 0.2373046875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.7.mlp.layers.0.0.weight': {'lr_mult': 0.2373046875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.7.mlp.layers.1.weight': {'lr_mult': 0.2373046875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.8.norm1.weight': {'lr_mult': 0.31640625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.norm1.bias': {'lr_mult': 0.31640625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.attn.q_bias': {'lr_mult': 0.31640625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.attn.v_bias': {'lr_mult': 0.31640625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.attn.proj.bias': {'lr_mult': 0.31640625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.norm2.weight': {'lr_mult': 0.31640625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.norm2.bias': {'lr_mult': 0.31640625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.mlp.layers.0.0.bias': {'lr_mult': 0.31640625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.mlp.layers.1.bias': {'lr_mult': 0.31640625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.8.attn.qkv.weight': {'lr_mult': 0.31640625, 'decay_mult': 1},\n",
+ " 'backbone.blocks.8.attn.proj.weight': {'lr_mult': 0.31640625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.8.mlp.layers.0.0.weight': {'lr_mult': 0.31640625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.8.mlp.layers.1.weight': {'lr_mult': 0.31640625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.9.norm1.weight': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.norm1.bias': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.attn.q_bias': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.attn.v_bias': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.attn.proj.bias': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.norm2.weight': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.norm2.bias': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.mlp.layers.0.0.bias': {'lr_mult': 0.421875,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.mlp.layers.1.bias': {'lr_mult': 0.421875, 'decay_mult': 0},\n",
+ " 'backbone.blocks.9.attn.qkv.weight': {'lr_mult': 0.421875, 'decay_mult': 1},\n",
+ " 'backbone.blocks.9.attn.proj.weight': {'lr_mult': 0.421875, 'decay_mult': 1},\n",
+ " 'backbone.blocks.9.mlp.layers.0.0.weight': {'lr_mult': 0.421875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.9.mlp.layers.1.weight': {'lr_mult': 0.421875,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.10.norm1.weight': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.norm1.bias': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.attn.q_bias': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.attn.v_bias': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.attn.proj.bias': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.norm2.weight': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.norm2.bias': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.mlp.layers.0.0.bias': {'lr_mult': 0.5625,\n",
+ " 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.mlp.layers.1.bias': {'lr_mult': 0.5625, 'decay_mult': 0},\n",
+ " 'backbone.blocks.10.attn.qkv.weight': {'lr_mult': 0.5625, 'decay_mult': 1},\n",
+ " 'backbone.blocks.10.attn.proj.weight': {'lr_mult': 0.5625, 'decay_mult': 1},\n",
+ " 'backbone.blocks.10.mlp.layers.0.0.weight': {'lr_mult': 0.5625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.10.mlp.layers.1.weight': {'lr_mult': 0.5625,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.11.norm1.weight': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.norm1.bias': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.attn.q_bias': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.attn.v_bias': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.attn.proj.bias': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.norm2.weight': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.norm2.bias': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.mlp.layers.0.0.bias': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.mlp.layers.1.bias': {'lr_mult': 0.75, 'decay_mult': 0},\n",
+ " 'backbone.blocks.11.attn.qkv.weight': {'lr_mult': 0.75, 'decay_mult': 1},\n",
+ " 'backbone.blocks.11.attn.proj.weight': {'lr_mult': 0.75, 'decay_mult': 1},\n",
+ " 'backbone.blocks.11.mlp.layers.0.0.weight': {'lr_mult': 0.75,\n",
+ " 'decay_mult': 1},\n",
+ " 'backbone.blocks.11.mlp.layers.1.weight': {'lr_mult': 0.75, 'decay_mult': 1},\n",
+ " 'backbone.fc_norm.weight': {'lr_mult': 1.0, 'decay_mult': 0},\n",
+ " 'backbone.fc_norm.bias': {'lr_mult': 1.0, 'decay_mult': 0},\n",
+ " 'cls_head.fc_cls.bias': {'lr_mult': 1.0, 'decay_mult': 0},\n",
+ " 'cls_head.fc_cls.weight': {'lr_mult': 1.0, 'decay_mult': 1}}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Convert the parameter groups to the format used by mmaction\n",
+ "\n",
+ "custom_keys = {}\n",
+ "for _, group in groups.items():\n",
+ " decay_mult = 0 if group[\"weight_decay\"] == 0 else 1\n",
+ " params = group[\"params\"]\n",
+ " lr_mult = group[\"lr_scale\"]\n",
+ " for param in params:\n",
+ " custom_keys[param] = {\"lr_mult\": lr_mult, \"decay_mult\": decay_mult}\n",
+ "\n",
+ "custom_keys"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "human-fall-detection",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/dataset_label_analysis.ipynb b/notebooks/dataset_label_analysis.ipynb
index dbc73a3..d4d6287 100644
--- a/notebooks/dataset_label_analysis.ipynb
+++ b/notebooks/dataset_label_analysis.ipynb
@@ -1,353 +1,837 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Label Analysis\n",
- "\n",
- "In this notebook we analyse the datsets and label distributions we get for different settings for sampling and labeling strategy."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "%reload_ext autoreload\n",
- "%autoreload 2\n",
- "import re\n",
- "\n",
- "import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "\n",
- "from datasets import HighQualityFallDataset\n",
- "from datasets.transforms.label_strategy import HQFD_LABEL_DESCRIPTION, PriorityLabel\n",
- "from datasets.transforms.sampling_strategy import GaussianSampling, UniformSampling"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " filename | \n",
- " label | \n",
- " interval | \n",
- " sample_idx | \n",
- " modality | \n",
- " start_index | \n",
- " label_name | \n",
- " video_category | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " data/Fall_Simulation_Data/videos/ADL17_Cam1.avi | \n",
- " 2 | \n",
- " (30.0, 40.0) | \n",
- " 0 | \n",
- " RGB | \n",
- " 0 | \n",
- " Other | \n",
- " ADL | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " data/Fall_Simulation_Data/videos/ADL17_Cam1.avi | \n",
- " 2 | \n",
- " (50.0, 60.0) | \n",
- " 1 | \n",
- " RGB | \n",
- " 0 | \n",
- " Other | \n",
- " ADL | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " data/Fall_Simulation_Data/videos/ADL17_Cam1.avi | \n",
- " 2 | \n",
- " (60.0, 70.0) | \n",
- " 2 | \n",
- " RGB | \n",
- " 0 | \n",
- " Other | \n",
- " ADL | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " data/Fall_Simulation_Data/videos/ADL17_Cam1.avi | \n",
- " 2 | \n",
- " (90.0, 100.0) | \n",
- " 3 | \n",
- " RGB | \n",
- " 0 | \n",
- " Other | \n",
- " ADL | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " data/Fall_Simulation_Data/videos/ADL17_Cam1.avi | \n",
- " 2 | \n",
- " (160.0, 170.0) | \n",
- " 4 | \n",
- " RGB | \n",
- " 0 | \n",
- " Other | \n",
- " ADL | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " filename label interval \\\n",
- "0 data/Fall_Simulation_Data/videos/ADL17_Cam1.avi 2 (30.0, 40.0) \n",
- "1 data/Fall_Simulation_Data/videos/ADL17_Cam1.avi 2 (50.0, 60.0) \n",
- "2 data/Fall_Simulation_Data/videos/ADL17_Cam1.avi 2 (60.0, 70.0) \n",
- "3 data/Fall_Simulation_Data/videos/ADL17_Cam1.avi 2 (90.0, 100.0) \n",
- "4 data/Fall_Simulation_Data/videos/ADL17_Cam1.avi 2 (160.0, 170.0) \n",
- "\n",
- " sample_idx modality start_index label_name video_category \n",
- "0 0 RGB 0 Other ADL \n",
- "1 1 RGB 0 Other ADL \n",
- "2 2 RGB 0 Other ADL \n",
- "3 3 RGB 0 Other ADL \n",
- "4 4 RGB 0 Other ADL "
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "ANN_FILE = \"data/Fall_Simulation_Data/annotations.csv\"\n",
- "\n",
- "uniform_sampling = UniformSampling(clip_len=10, stride=0, overlap=False)\n",
- "gaussian_sampling = GaussianSampling(\n",
- " clip_len=10, n_samples_per_sec=None, fallback_sampler=None, std=None\n",
- ")\n",
- "label_strategy = PriorityLabel(\n",
- " label_description=HQFD_LABEL_DESCRIPTION,\n",
- " threshold=0.0,\n",
- " absolute_threshold=False,\n",
- " priority=[0, 1, 2],\n",
- ")\n",
- "\n",
- "hqfd = HighQualityFallDataset(\n",
- " ann_file=ANN_FILE,\n",
- " sampling_strategy=gaussian_sampling,\n",
- " label_strategy=label_strategy,\n",
- " pipeline=[],\n",
- " num_classes=3,\n",
- " test_mode=False,\n",
- " drop_ratios=[0.0, 0.0, 0.75],\n",
- ")\n",
- "\n",
- "df_hqfd = pd.DataFrame(list(hqfd))\n",
- "class_names = [\"Fall\", \"Lying\", \"Other\"]\n",
- "df_hqfd[\"label_name\"] = df_hqfd[\"label\"].apply(lambda x: class_names[x])\n",
- "\n",
- "\n",
- "def extract_category(filename):\n",
- " match = re.search(r\"(ADL|Fall)\", filename.split(\"/\")[-1])\n",
- " return match.group(1) if match else None\n",
- "\n",
- "\n",
- "df_hqfd[\"video_category\"] = df_hqfd[\"filename\"].apply(extract_category)\n",
- "\n",
- "df_hqfd.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "def plot_label_dist(df_hqfd):\n",
- " display(\"---- Label Distribution ----\")\n",
- " display(df_hqfd[\"label_name\"].value_counts().sort_index() / len(df_hqfd))\n",
- "\n",
- " display(\"---- Label Counts ----\")\n",
- " display(df_hqfd[\"label_name\"].value_counts().sort_index())\n",
- " df_hqfd[\"label_name\"].value_counts().sort_index().plot(kind=\"bar\")\n",
- " plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'---- Label Distribution ----'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "label_name\n",
- "Fall 0.288679\n",
- "Lying 0.204657\n",
- "Other 0.506664\n",
- "Name: count, dtype: float64"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "'---- Label Counts ----'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "label_name\n",
- "Fall 1711\n",
- "Lying 1213\n",
- "Other 3003\n",
- "Name: count, dtype: int64"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "image/png": "",
- "text/plain": [
- "