
Merge pull request #32 from Grutschus/27-set-up-a-pretrained-backbone-vit-and-a-training-pipeline

27 set up a pretrained backbone vit and a training pipeline
Grutschus committed Nov 28, 2023
2 parents dbc4b74 + 2782e92 commit 1bb8ba3
Showing 17 changed files with 1,019 additions and 49 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -160,4 +160,5 @@ cython_debug/
 #.idea/

 # Container files
-**.sif
+**.sif
+/weights
16 changes: 16 additions & 0 deletions .vscode/launch.json
@@ -11,6 +11,22 @@
             ],
             "console": "integratedTerminal",
             "justMyCode": false
+        },
+        {
+            "name": "Debug Training",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/mmaction2/tools/train.py",
+            "args": [
+                "configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py"
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "cwd": "${workspaceFolder}",
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}",
+                "CUDA_VISIBLE_DEVICES": "2"
+            }
         }
     ]
 }
3 changes: 2 additions & 1 deletion .vscode/settings.json
@@ -11,5 +11,6 @@
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
     "jupyter.debugJustMyCode": false,
-    "jupyter.notebookFileRoot": "${workspaceFolder}"
+    "jupyter.notebookFileRoot": "${workspaceFolder}",
+    "remote.SSH.remoteServerListenOnSocket": true
 }
2 changes: 1 addition & 1 deletion configs/datasets/ds_uniformsample_existencelabel.py
@@ -13,6 +13,6 @@
 )
 ann_file = "tests/test_data/test_annotation.csv"
 pipeline = []  # type: ignore
-multiclass = True
+multi_class = True
 num_classes = 3
 test_mode = True
113 changes: 113 additions & 0 deletions configs/datasets/high-quality-fall_runner-base.py
@@ -0,0 +1,113 @@
"""Base `Runner` config for high-quality-fall dataset."""

dataset_type = "HighQualityFallDataset"

label_strategy = dict(
type="PriorityLabel",
label_description=dict(
names=["fall", "lying", "other"],
start_timestamp_names=["fall_start", "lying_start"],
end_timestamp_names=["fall_end", "lying_end"],
visible_names=["fall_visible", "lying_visible"],
other_class=2,
),
)

sampling_strategy = dict(type="UniformSampling", clip_len=10)


# TRAIN
ann_file_train = "data/Fall_Simulation_Data/annotations_train.csv"

# TODO: Add shape comments
# TODO: Think about augmentation steps
train_pipeline = [
dict(type="DecordInit"),
dict(type="ClipVideo"),
dict(type="SampleFrames", clip_len=16, frame_interval=4, num_clips=1),
dict(type="DecordDecode"),
dict(type="Resize", scale=(-1, 224)),
dict(type="RandomResizedCrop"),
dict(type="Resize", scale=(224, 224), keep_ratio=False),
dict(type="Flip", flip_ratio=0.5),
dict(type="FormatShape", input_format="NCTHW"),
dict(type="PackActionInputs"),
]

train_dataloader = dict(
batch_size=3, # From VideoMAEv2 repo
num_workers=8,
persistent_workers=True,
sampler=dict(type="DefaultSampler", shuffle=True),
dataset=dict(
type=dataset_type,
sampling_strategy=sampling_strategy,
label_strategy=label_strategy,
ann_file=ann_file_train,
pipeline=train_pipeline,
num_classes=3,
indices=100,
),
)

# VALIDATION
ann_file_val = "data/Fall_Simulation_Data/annotations_val.csv"

val_pipeline = [
dict(type="DecordInit"),
dict(type="ClipVideo"),
dict(
type="SampleFrames", clip_len=16, frame_interval=4, num_clips=1, test_mode=True
),
dict(type="DecordDecode"),
dict(type="Resize", scale=(-1, 224)),
dict(type="CenterCrop", crop_size=224), # From VideoMAEv2 repo
dict(type="FormatShape", input_format="NCTHW"),
dict(type="PackActionInputs"),
]

val_dataloader = train_dataloader
# val_dataloader = dict(
# batch_size=3, # From VideoMAEv2 repo
# num_workers=8,
# persistent_workers=True,
# sampler=dict(type="DefaultSampler", shuffle=False),
# dataset=dict(
# type=dataset_type,
# sampling_strategy=sampling_strategy,
# label_strategy=label_strategy,
# ann_file=ann_file_val,
# pipeline=val_pipeline,
# num_classes=3,
# ),
# )

# TEST
ann_file_test = "data/Fall_Simulation_Data/annotations_test.csv"

test_pipeline = [
dict(type="DecordInit"),
dict(
type="SampleFrames", clip_len=16, frame_interval=4, num_clips=5, test_mode=True
), # From VideoMAEv2 repo
dict(type="DecordDecode"),
dict(type="Resize", scale=(-1, 224)),
dict(type="ThreeCrop", crop_size=224), # From VideoMAEv2 repo
dict(type="FormatShape", input_format="NCTHW"),
dict(type="PackActionInputs"),
]

test_dataloader = dict(
batch_size=3, # From VideoMAEv2 repo
num_workers=8,
persistent_workers=True,
sampler=dict(type="DefaultSampler", shuffle=False),
dataset=dict(
type=dataset_type,
sampling_strategy=sampling_strategy,
label_strategy=label_strategy,
ann_file=ann_file_test,
pipeline=test_pipeline,
num_classes=3,
),
)
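A note on how the sampling numbers above fit together: UniformSampling(clip_len=10) selects a clip from each video, ClipVideo restricts decoding to that clip, and SampleFrames(clip_len=16, frame_interval=4) then draws 16 frames spanning 64 source frames. A minimal sketch of that arithmetic, assuming clip_len here is in seconds and a 30 fps source (both are assumptions, not stated in this diff):

# Illustrative arithmetic only; the fps value and the seconds interpretation are assumptions.
fps = 30                              # hypothetical source frame rate
clip_seconds = 10                     # UniformSampling(clip_len=10)
frames_in_clip = clip_seconds * fps   # 300 frames available after ClipVideo

sample_len, frame_interval = 16, 4    # SampleFrames settings above
span = sample_len * frame_interval    # one sample covers 64 source frames
print(frames_in_clip, span)           # 300 64 -> the sample fits well inside a clip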
48 changes: 48 additions & 0 deletions configs/default_runtime.py
@@ -0,0 +1,48 @@
"""Default runtime for our experiments."""

# Trying to skip this part, since we have custom registries not in this scope
default_scope = "mmaction"
work_dir = "experiments"
custom_imports = dict(imports=["datasets"], allow_failed_imports=False)
launcher = "none"

default_hooks = dict(
runtime_info=dict(type="RuntimeInfoHook"),
timer=dict(type="IterTimerHook"),
logger=dict(type="LoggerHook"),
param_scheduler=dict(type="ParamSchedulerHook"),
checkpoint=dict(
type="CheckpointHook",
interval=1,
by_epoch=True,
max_keep_ckpts=3,
save_best="auto", # For CE, this is top-1-acc
),
sampler_seed=dict(type="DistSamplerSeedHook"),
sync_buffers=dict(type="SyncBuffersHook"),
)

# Hook disabled since it cannot handle NCTHW tensors
# TODO fix this
# custom_hooks = [dict(type="VisualizationHook", enable=True)]

env_cfg = dict(
cudnn_benchmark=False,
mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
dist_cfg=dict(backend="nccl"),
)

log_processor = dict(
type="LogProcessor",
window_size=10,
by_epoch=True,
)

vis_backends = [dict(type="TensorboardVisBackend", save_dir="experiments/tensorboard")]
visualizer = dict(type="ActionVisualizer", vis_backends=vis_backends)

log_level = "INFO"

# Overwrite this to continue training
load_from = None
resume = False
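As the trailing comment notes, load_from and resume are the knobs for continuing a run. A hedged example of overriding them in a config that inherits this runtime (the checkpoint path is hypothetical):

# Hypothetical override in a derived config; the checkpoint name is made up.
_base_ = ["../default_runtime.py"]

load_from = "experiments/epoch_12.pth"  # a checkpoint written by CheckpointHook
resume = True                           # also restore optimizer/scheduler/epoch state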
5 changes: 2 additions & 3 deletions configs/models/videomaev2.py
@@ -27,11 +27,10 @@
         average_clips="prob",
         multi_class=True,
     ),
-    # TODO: update this to fit our dataset
     data_preprocessor=dict(
         type="ActionDataPreprocessor",
-        mean=[123.675, 116.28, 103.53],
-        std=[58.395, 57.12, 57.375],
+        mean=[102.17311096191406, 98.78225708007812, 92.68714141845703],
+        std=[58.04566192626953, 57.004024505615234, 57.3704948425293],
         format_shape="NCTHW",
     ),
 )
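The removed mean/std ([123.675, 116.28, 103.53] / [58.395, 57.12, 57.375]) are the standard ImageNet RGB statistics; the new values look dataset-specific. The script that produced them is not part of this diff, so the following is only a sketch of one way such per-channel statistics can be computed (all names are illustrative):

# Sketch only: per-channel mean/std in the 0-255 range over sampled RGB frames.
import numpy as np

def channel_stats(frames):
    """frames: iterable of HxWx3 uint8 RGB arrays sampled from the dataset."""
    pixels = np.concatenate([f.reshape(-1, 3).astype(np.float64) for f in frames])
    return pixels.mean(axis=0), pixels.std(axis=0)

# mean, std = channel_stats(sampled_frames)  # yields values like those configured above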
72 changes: 72 additions & 0 deletions configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py
@@ -0,0 +1,72 @@
_base_ = ["../default_runtime.py", "../datasets/high-quality-fall_runner-base.py"]

# Finetuning parameters are from VideoMAEv2 repo
# https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/FINETUNE.md


# ViT-S-P16
model = dict(
    type="Recognizer3D",
    backbone=dict(
        type="VisionTransformer",
        img_size=224,
        patch_size=16,
        embed_dims=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        qkv_bias=True,
        num_frames=16,
        norm_cfg=dict(type="LN", eps=1e-6),
        drop_path_rate=0.3,  # From VideoMAEv2 repo
    ),
    cls_head=dict(
        type="TimeSformerHead",
        num_classes=3,
        in_channels=384,
        average_clips="prob",
    ),
    data_preprocessor=dict(
        type="ActionDataPreprocessor",
        mean=[102.17311096191406, 98.78225708007812, 92.68714141845703],
        std=[58.04566192626953, 57.004024505615234, 57.3704948425293],
        format_shape="NCTHW",
    ),
)

# Loading weights
load_from = "weights/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth"

# TRAINING CONFIG
train_cfg = dict(type="EpochBasedTrainLoop", max_epochs=35, val_interval=1)

# TODO: Think about fine-tuning param scheduler
param_scheduler = [
    dict(
        type="LinearLR", start_factor=0.001, by_epoch=True, begin=0, end=5
    ),  # From VideoMAEv2 repo
]

optim_wrapper = dict(
    type="OptimWrapper",
    optimizer=dict(
        type="AdamW",  # From VideoMAEv2 repo
        lr=1e-3,  # From VideoMAEv2 repo
        weight_decay=0.1,  # From VideoMAEv2 repo
        betas=(0.9, 0.999),  # From VideoMAEv2 repo
    ),
    clip_grad=dict(max_norm=5, norm_type=2),  # From VideoMAEv2 repo
)

# VALIDATION CONFIG
val_evaluator = dict(
    type="AccMetric", metric_options=dict(top_k_accuracy=dict(topk=(1,)))
)
val_cfg = dict(type="ValLoop")


# TEST CONFIG
test_evaluator = dict(
    type="AccMetric", metric_options=dict(top_k_accuracy=dict(topk=(1,)))
)
test_cfg = dict(type="TestLoop")
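A quick way to check that the _base_ files resolve and the key fields come out as intended is to load the merged config with mmengine. This is an illustrative sketch, not part of this commit; the config path is the one referenced in the launch configuration above, and it should be run from the repository root so the custom_imports package resolves:

# Illustrative sanity check using mmengine's Config.
from mmengine.config import Config

cfg = Config.fromfile(
    "configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py"
)
assert cfg.model["cls_head"]["num_classes"] == 3
print(cfg.train_cfg, cfg.optim_wrapper["optimizer"]["lr"], cfg.load_from)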
4 changes: 3 additions & 1 deletion containers/requirements.txt
@@ -2,4 +2,6 @@
 openpyxl>=3.0
 openmim>=0.3
 ffmpeg-python>=0.2
-dvc[s3]
+dvc[s3]
+dvclive>=3.3
+tensorboard>=2.15
17 changes: 11 additions & 6 deletions datasets/high_quality_fall_dataset.py
@@ -76,16 +76,20 @@ def __init__(
         test_mode: bool = False,
         **kwargs,
     ) -> None:
+        # Bug in MMENGINE: kwarg `custom_imports` is not removed from kwargs
+        # this causes an error when building the dataset
+        # TODO: Create an issue on MMENGINE, can be fixed here:
+        # https://github.com/open-mmlab/mmengine/blob/85c0976bc2434157f786d44cdd8f0fb2955414f0/mmengine/config/config.py#L462C34-L462C34
+        kwargs.pop("custom_imports", None)
+
         if isinstance(sampling_strategy, dict):
-            built_sampling_strategy = SAMPLING_STRATEGIES.build(sampling_strategy)  # type: SamplingStrategy
+            self.sampling_strategy = SAMPLING_STRATEGIES.build(sampling_strategy)  # type: SamplingStrategy
         else:
-            built_sampling_strategy = sampling_strategy
-        self.sampling_strategy = built_sampling_strategy
+            self.sampling_strategy = sampling_strategy
         if isinstance(label_strategy, dict):
-            built_label_strategy = LABEL_STRATEGIES.build(label_strategy)  # type: LabelStrategy
+            self.label_strategy = LABEL_STRATEGIES.build(label_strategy)  # type: LabelStrategy
         else:
-            built_label_strategy = label_strategy
-        self.label_strategy = built_label_strategy
+            self.label_strategy = label_strategy
         super().__init__(
             ann_file,
             pipeline=pipeline,
@@ -95,6 +99,7 @@ def __init__(
             start_index=start_index,
             modality=modality,
             test_mode=test_mode,
+            **kwargs,
         )

     def load_data_list(self) -> List[dict]:
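The refactor above drops the temporary built_* variables; the underlying pattern is simply "accept either a config dict or an already-built strategy object". A generic restatement of that pattern (not code from the repo):

# Generic helper illustrating the dict-or-instance pattern used in __init__ above.
def resolve(obj_or_cfg, registry):
    """Build from a config dict via the registry, or pass a built object through."""
    return registry.build(obj_or_cfg) if isinstance(obj_or_cfg, dict) else obj_or_cfg

# e.g. self.sampling_strategy = resolve(sampling_strategy, SAMPLING_STRATEGIES)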
3 changes: 2 additions & 1 deletion datasets/transforms/clip_video.py
@@ -33,10 +33,11 @@ def transform(self, results: Dict) -> Dict:
             dict: The result dict.
         """
         interval = results["interval"]
+        total_frames = results["total_frames"]
         fps = results["avg_fps"]
         offset = results["start_index"] if "start_index" in results else 0
         start_frame = int(interval[0] * fps) + offset
-        end_frame = int(interval[1] * fps) + offset
+        end_frame = min(int(interval[1] * fps) + offset, total_frames)
         results["start_index"] = start_frame
         results["total_frames"] = end_frame - start_frame
         return results
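The added min() clamp keeps end_frame from running past the end of the video when an annotated interval extends beyond the last frame. A small worked example with made-up numbers:

# Made-up numbers: the annotated interval ends at 20 s, but the video only has
# 450 frames (18 s at 25 fps), so end_frame is clamped to 450 instead of 500.
interval = (2.0, 20.0)  # hypothetical annotation, in seconds
fps = 25.0
total_frames = 450
offset = 0

start_frame = int(interval[0] * fps) + offset                   # 50
end_frame = min(int(interval[1] * fps) + offset, total_frames)  # 450
print(start_frame, end_frame, end_frame - start_frame)          # 50 450 400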
