diff --git a/.gitignore b/.gitignore
index e0deec3..1cc9c27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,4 +160,5 @@ cython_debug/
 #.idea/
 
 # Container files
-**.sif
\ No newline at end of file
+**.sif
+/weights
diff --git a/.vscode/launch.json b/.vscode/launch.json
index ae03984..ba42c4c 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -11,6 +11,22 @@
             ],
             "console": "integratedTerminal",
             "justMyCode": false
+        },
+        {
+            "name": "Debug Training",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/mmaction2/tools/train.py",
+            "args": [
+                "configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py"
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "cwd": "${workspaceFolder}",
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}",
+                "CUDA_VISIBLE_DEVICES": "2"
+            }
         }
     ]
 }
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a618ca9..533be17 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,5 +11,6 @@
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
     "jupyter.debugJustMyCode": false,
-    "jupyter.notebookFileRoot": "${workspaceFolder}"
+    "jupyter.notebookFileRoot": "${workspaceFolder}",
+    "remote.SSH.remoteServerListenOnSocket": true
 }
\ No newline at end of file
diff --git a/configs/datasets/ds_uniformsample_existencelabel.py b/configs/datasets/ds_uniformsample_existencelabel.py
index 1cf0100..340f5f6 100644
--- a/configs/datasets/ds_uniformsample_existencelabel.py
+++ b/configs/datasets/ds_uniformsample_existencelabel.py
@@ -13,6 +13,6 @@
 )
 ann_file = "tests/test_data/test_annotation.csv"
 pipeline = []  # type: ignore
-multiclass = True
+multi_class = True
 num_classes = 3
 test_mode = True
diff --git a/configs/datasets/high-quality-fall_runner-base.py b/configs/datasets/high-quality-fall_runner-base.py
new file mode 100644
index 0000000..794db64
--- /dev/null
+++ b/configs/datasets/high-quality-fall_runner-base.py
@@ -0,0 +1,113 @@
+"""Base `Runner` config for high-quality-fall dataset."""
+
+dataset_type = "HighQualityFallDataset"
+
+label_strategy = dict(
+    type="PriorityLabel",
+    label_description=dict(
+        names=["fall", "lying", "other"],
+        start_timestamp_names=["fall_start", "lying_start"],
+        end_timestamp_names=["fall_end", "lying_end"],
+        visible_names=["fall_visible", "lying_visible"],
+        other_class=2,
+    ),
+)
+
+sampling_strategy = dict(type="UniformSampling", clip_len=10)
+
+
+# TRAIN
+ann_file_train = "data/Fall_Simulation_Data/annotations_train.csv"
+
+# TODO: Add shape comments
+# TODO: Think about augmentation steps
+train_pipeline = [
+    dict(type="DecordInit"),
+    dict(type="ClipVideo"),
+    dict(type="SampleFrames", clip_len=16, frame_interval=4, num_clips=1),
+    dict(type="DecordDecode"),
+    dict(type="Resize", scale=(-1, 224)),
+    dict(type="RandomResizedCrop"),
+    dict(type="Resize", scale=(224, 224), keep_ratio=False),
+    dict(type="Flip", flip_ratio=0.5),
+    dict(type="FormatShape", input_format="NCTHW"),
+    dict(type="PackActionInputs"),
+]
+
+train_dataloader = dict(
+    batch_size=3,  # From VideoMAEv2 repo
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type="DefaultSampler", shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        sampling_strategy=sampling_strategy,
+        label_strategy=label_strategy,
+        ann_file=ann_file_train,
+        pipeline=train_pipeline,
+        num_classes=3,
+        indices=100,
+    ),
+)
+
+# VALIDATION
+ann_file_val = "data/Fall_Simulation_Data/annotations_val.csv"
+
+val_pipeline = [
+    dict(type="DecordInit"),
dict(type="ClipVideo"), + dict( + type="SampleFrames", clip_len=16, frame_interval=4, num_clips=1, test_mode=True + ), + dict(type="DecordDecode"), + dict(type="Resize", scale=(-1, 224)), + dict(type="CenterCrop", crop_size=224), # From VideoMAEv2 repo + dict(type="FormatShape", input_format="NCTHW"), + dict(type="PackActionInputs"), +] + +val_dataloader = train_dataloader +# val_dataloader = dict( +# batch_size=3, # From VideoMAEv2 repo +# num_workers=8, +# persistent_workers=True, +# sampler=dict(type="DefaultSampler", shuffle=False), +# dataset=dict( +# type=dataset_type, +# sampling_strategy=sampling_strategy, +# label_strategy=label_strategy, +# ann_file=ann_file_val, +# pipeline=val_pipeline, +# num_classes=3, +# ), +# ) + +# TEST +ann_file_test = "data/Fall_Simulation_Data/annotations_test.csv" + +test_pipeline = [ + dict(type="DecordInit"), + dict( + type="SampleFrames", clip_len=16, frame_interval=4, num_clips=5, test_mode=True + ), # From VideoMAEv2 repo + dict(type="DecordDecode"), + dict(type="Resize", scale=(-1, 224)), + dict(type="ThreeCrop", crop_size=224), # From VideoMAEv2 repo + dict(type="FormatShape", input_format="NCTHW"), + dict(type="PackActionInputs"), +] + +test_dataloader = dict( + batch_size=3, # From VideoMAEv2 repo + num_workers=8, + persistent_workers=True, + sampler=dict(type="DefaultSampler", shuffle=False), + dataset=dict( + type=dataset_type, + sampling_strategy=sampling_strategy, + label_strategy=label_strategy, + ann_file=ann_file_test, + pipeline=test_pipeline, + num_classes=3, + ), +) diff --git a/configs/default_runtime.py b/configs/default_runtime.py new file mode 100644 index 0000000..0f43ef4 --- /dev/null +++ b/configs/default_runtime.py @@ -0,0 +1,48 @@ +"""Default runtime for our experiments.""" + +# Trying to skip this part, since we have custom registries not in this scope +default_scope = "mmaction" +work_dir = "experiments" +custom_imports = dict(imports=["datasets"], allow_failed_imports=False) +launcher = "none" + +default_hooks = dict( + runtime_info=dict(type="RuntimeInfoHook"), + timer=dict(type="IterTimerHook"), + logger=dict(type="LoggerHook"), + param_scheduler=dict(type="ParamSchedulerHook"), + checkpoint=dict( + type="CheckpointHook", + interval=1, + by_epoch=True, + max_keep_ckpts=3, + save_best="auto", # For CE, this is top-1-acc + ), + sampler_seed=dict(type="DistSamplerSeedHook"), + sync_buffers=dict(type="SyncBuffersHook"), +) + +# Hook disabled since it cannot handle NCTHW tensors +# TODO fix this +# custom_hooks = [dict(type="VisualizationHook", enable=True)] + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0), + dist_cfg=dict(backend="nccl"), +) + +log_processor = dict( + type="LogProcessor", + window_size=10, + by_epoch=True, +) + +vis_backends = [dict(type="TensorboardVisBackend", save_dir="experiments/tensorboard")] +visualizer = dict(type="ActionVisualizer", vis_backends=vis_backends) + +log_level = "INFO" + +# Overwrite this to continue training +load_from = None +resume = False diff --git a/configs/models/videomaev2.py b/configs/models/videomaev2.py index df3b356..1ef5862 100644 --- a/configs/models/videomaev2.py +++ b/configs/models/videomaev2.py @@ -27,11 +27,10 @@ average_clips="prob", multi_class=True, ), - # TODO: update this to fit our dataset data_preprocessor=dict( type="ActionDataPreprocessor", - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], + mean=[102.17311096191406, 98.78225708007812, 92.68714141845703], + std=[58.04566192626953, 
         format_shape="NCTHW",
     ),
 )
diff --git a/configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py b/configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py
new file mode 100644
index 0000000..0f18939
--- /dev/null
+++ b/configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py
@@ -0,0 +1,72 @@
+_base_ = ["../default_runtime.py", "../datasets/high-quality-fall_runner-base.py"]
+
+# Fine-tuning parameters are from the VideoMAEv2 repo
+# https://github.com/OpenGVLab/VideoMAEv2/blob/master/docs/FINETUNE.md
+
+
+# ViT-S-P16
+model = dict(
+    type="Recognizer3D",
+    backbone=dict(
+        type="VisionTransformer",
+        img_size=224,
+        patch_size=16,
+        embed_dims=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        qkv_bias=True,
+        num_frames=16,
+        norm_cfg=dict(type="LN", eps=1e-6),
+        drop_path_rate=0.3,  # From VideoMAEv2 repo
+    ),
+    cls_head=dict(
+        type="TimeSformerHead",
+        num_classes=3,
+        in_channels=384,
+        average_clips="prob",
+    ),
+    data_preprocessor=dict(
+        type="ActionDataPreprocessor",
+        mean=[102.17311096191406, 98.78225708007812, 92.68714141845703],
+        std=[58.04566192626953, 57.004024505615234, 57.3704948425293],
+        format_shape="NCTHW",
+    ),
+)
+
+# Loading weights
+load_from = "weights/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth"
+
+# TRAINING CONFIG
+train_cfg = dict(type="EpochBasedTrainLoop", max_epochs=35, val_interval=1)
+
+# TODO: Think about fine-tuning param scheduler
+param_scheduler = [
+    dict(
+        type="LinearLR", start_factor=0.001, by_epoch=True, begin=0, end=5
+    ),  # From VideoMAEv2 repo
+]
+
+optim_wrapper = dict(
+    type="OptimWrapper",
+    optimizer=dict(
+        type="AdamW",  # From VideoMAEv2 repo
+        lr=1e-3,  # From VideoMAEv2 repo
+        weight_decay=0.1,  # From VideoMAEv2 repo
+        betas=(0.9, 0.999),  # From VideoMAEv2 repo
+    ),
+    clip_grad=dict(max_norm=5, norm_type=2),  # From VideoMAEv2 repo
+)
+
+# VALIDATION CONFIG
+val_evaluator = dict(
+    type="AccMetric", metric_options=dict(top_k_accuracy=dict(topk=(1,)))
+)
+val_cfg = dict(type="ValLoop")
+
+
+# TEST CONFIG
+test_evaluator = dict(
+    type="AccMetric", metric_options=dict(top_k_accuracy=dict(topk=(1,)))
+)
+test_cfg = dict(type="TestLoop")
diff --git a/containers/requirements.txt b/containers/requirements.txt
index bb4664e..6bf423b 100644
--- a/containers/requirements.txt
+++ b/containers/requirements.txt
@@ -2,4 +2,6 @@
 openpyxl>=3.0
 openmim>=0.3
 ffmpeg-python>=0.2
-dvc[s3]
\ No newline at end of file
+dvc[s3]
+dvclive>=3.3
+tensorboard>=2.15
diff --git a/datasets/high_quality_fall_dataset.py b/datasets/high_quality_fall_dataset.py
index f497bd6..1542ce2 100644
--- a/datasets/high_quality_fall_dataset.py
+++ b/datasets/high_quality_fall_dataset.py
@@ -76,16 +76,20 @@ def __init__(
         test_mode: bool = False,
         **kwargs,
     ) -> None:
+        # Bug in MMEngine: the kwarg `custom_imports` is not removed from kwargs,
+        # which causes an error when building the dataset.
+        # TODO: Create an issue on MMEngine; can be fixed here:
+        # https://github.com/open-mmlab/mmengine/blob/85c0976bc2434157f786d44cdd8f0fb2955414f0/mmengine/config/config.py#L462C34-L462C34
+        kwargs.pop("custom_imports", None)
+
         if isinstance(sampling_strategy, dict):
-            built_sampling_strategy = SAMPLING_STRATEGIES.build(sampling_strategy)  # type: SamplingStrategy
+            self.sampling_strategy = SAMPLING_STRATEGIES.build(sampling_strategy)  # type: SamplingStrategy
         else:
-            built_sampling_strategy = sampling_strategy
-        self.sampling_strategy = built_sampling_strategy
+            self.sampling_strategy = sampling_strategy
         if isinstance(label_strategy, dict):
-            built_label_strategy = LABEL_STRATEGIES.build(label_strategy)  # type: LabelStrategy
+            self.label_strategy = LABEL_STRATEGIES.build(label_strategy)  # type: LabelStrategy
         else:
-            built_label_strategy = label_strategy
-        self.label_strategy = built_label_strategy
+            self.label_strategy = label_strategy
         super().__init__(
             ann_file,
             pipeline=pipeline,
@@ -95,6 +99,7 @@ def __init__(
             start_index=start_index,
             modality=modality,
             test_mode=test_mode,
+            **kwargs,
         )
 
     def load_data_list(self) -> List[dict]:
diff --git a/datasets/transforms/clip_video.py b/datasets/transforms/clip_video.py
index cf519d0..6865157 100644
--- a/datasets/transforms/clip_video.py
+++ b/datasets/transforms/clip_video.py
@@ -33,10 +33,11 @@ def transform(self, results: Dict) -> Dict:
             dict: The result dict.
         """
         interval = results["interval"]
+        total_frames = results["total_frames"]
         fps = results["avg_fps"]
         offset = results["start_index"] if "start_index" in results else 0
         start_frame = int(interval[0] * fps) + offset
-        end_frame = int(interval[1] * fps) + offset
+        end_frame = min(int(interval[1] * fps) + offset, total_frames)
         results["start_index"] = start_frame
         results["total_frames"] = end_frame - start_frame
         return results
diff --git a/datasets/transforms/label_strategy.py b/datasets/transforms/label_strategy.py
index 8483de0..00e57b9 100644
--- a/datasets/transforms/label_strategy.py
+++ b/datasets/transforms/label_strategy.py
@@ -51,6 +51,26 @@ def __post_init__(self):
     as we have built it."""
+
+
+def _calculate_interval_overlap(
+    interval1: IntervalInSeconds, interval2: IntervalInSeconds
+) -> float:
+    if (
+        interval1[0] <= interval2[1] and interval1[1] >= interval2[0]
+    ):  # If the intervals overlap
+        return max(0, min(interval1[1], interval2[1]) - max(interval1[0], interval2[0]))
+    else:  # If the intervals do not overlap
+        return 0.0
+
+
+def _overlap_over_threshold(
+    overlap: float, threshold: float, clip_length: float, absolute: bool
+) -> bool:
+    if absolute:
+        return overlap > threshold
+    else:
+        return overlap / clip_length > threshold
 
 
 class LabelStrategy(abc.ABC):
     """Generic labeling strategy. Used to extract labels from an annotation
     and a given clip.
@@ -65,7 +85,7 @@ def __init__(self, label_description: LabelDescription | dict) -> None:
         self.label_description = label_description
 
     @abc.abstractmethod
-    def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> list[int]:
+    def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> list[int] | int:
         """Extracts the label from the annotation and the clip.
 
         Args:
@@ -75,7 +95,7 @@ def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> list[int]:
             clip (IntervalInSeconds): Interval of the clip in seconds.
 
         Returns:
-            List[int]: Labels of the clip."""
+            List[int] | int: Labels of the clip. A list if the strategy is multi-class, otherwise a single int."""
         ...
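The two helpers hoisted to module level above are pure functions over `(start, end)` intervals in seconds, so their behaviour is easy to check in isolation. A minimal sketch with made-up numbers (a self-contained copy of the same logic, not an import from the module):

```python
# Illustration of the overlap helpers above (hypothetical values).

def calculate_interval_overlap(a: tuple[float, float], b: tuple[float, float]) -> float:
    if a[0] <= b[1] and a[1] >= b[0]:  # intervals touch or overlap
        return max(0.0, min(a[1], b[1]) - max(a[0], b[0]))
    return 0.0

clip = (5.0, 15.0)   # a 10 s clip
fall = (12.0, 20.0)  # an annotated fall interval
overlap = calculate_interval_overlap(clip, fall)
assert overlap == 3.0
# With a relative threshold of 0.25: 3.0 / 10.0 = 0.3 > 0.25, so the fall counts.
assert (overlap / (clip[1] - clip[0])) > 0.25
```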
@@ -101,7 +121,7 @@ def __init__(
         self.threshold = threshold
         self.absolute_threshold = absolute_threshold
 
-    def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> list[int]:
+    def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> list[int] | int:
         labels = []
         labeled_time = 0.0
         for idx, (start_name, end_name, visible_name) in enumerate(
@@ -119,9 +139,9 @@ def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> list[int]:
             start = annotation[start_name]
             end = annotation[end_name]
 
-            overlap = self._calculate_interval_overlap(clip, (start, end))
+            overlap = _calculate_interval_overlap(clip, (start, end))
 
-            if self._overlap_over_threshold(
+            if _overlap_over_threshold(
                 overlap, self.threshold, clip[1] - clip[0], self.absolute_threshold
             ):
                 labeled_time += overlap
@@ -133,22 +153,38 @@ def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> list[int]:
             labels.append(self.label_description.other_class)
         return labels
 
-    def _calculate_interval_overlap(
-        self, interval1: IntervalInSeconds, interval2: IntervalInSeconds
-    ) -> float:
-        if (
-            interval1[0] <= interval2[1] and interval1[1] >= interval2[0]
-        ):  # If the intervals overlap
-            return max(
-                0, min(interval1[1], interval2[1]) - max(interval1[0], interval2[0])
-            )
-        else:  # If the intervals do not overlap
-            return 0.0
-
-    def _overlap_over_threshold(
-        self, overlap: float, threshold: float, clip_length: float, absolute: bool
-    ) -> bool:
-        if absolute:
-            return overlap > threshold
-        else:
-            return overlap / clip_length > threshold
+
+@LABEL_STRATEGIES.register_module()
+class PriorityLabel(ExistenceLabel):
+    """Assigns the label of the action with the highest priority.
+
+    Args:
+        threshold (float): Threshold for the existence of an action. Defaults to 0.
+        absolute_threshold (bool): Whether to use the threshold as an absolute value
+            in seconds (True) or as a fraction of the clip length (False).
+            Defaults to True.
+        priority (list[int]): Label indices in descending order of priority.
+            Defaults to [0, 1, 2].
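+
+    Example (illustrative sketch; assumes ``label_description`` matches the
+    schema above and ``annotation`` is one row of the annotation CSV, with a
+    fall overlapping the queried clip):
+        >>> strategy = PriorityLabel(label_description, priority=[0, 1, 2])
+        >>> strategy.label(annotation, clip=(12.0, 22.0))
+        0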
+ """ + + def __init__( + self, + label_description: LabelDescription | dict, + threshold: float = 0.0, + absolute_threshold: bool = True, + priority: list[int] | None = None, + ): + super().__init__(label_description, threshold, absolute_threshold) + self.priority = priority if priority is not None else [0, 1, 2] + + def label(self, annotation: pd.Series, clip: IntervalInSeconds) -> int: + labels = super().label(annotation, clip) + + # Typing check + if isinstance(labels, int): + return labels + + for priority in self.priority: + if priority in labels: + return priority + return self.label_description.other_class diff --git a/environment.yml b/environment.yml index d59cb54..2c2d406 100644 --- a/environment.yml +++ b/environment.yml @@ -29,3 +29,6 @@ dependencies: - ipympl>=0.9 - pandas-stubs>=2.1 - ffmpeg-python>=0.2 + - dvclive>=3.3 + - tensorboard>=2.15 + - torch-tb-profiler>=0.4 diff --git a/job_scripts/toy_training.sh b/job_scripts/toy_training.sh index 3fd0bff..016507b 100755 --- a/job_scripts/toy_training.sh +++ b/job_scripts/toy_training.sh @@ -13,4 +13,4 @@ apptainer exec \ --env CUDA_VISIBLE_DEVICES=2,3 \ containers/c3se_job_container.sif \ python mmaction2/tools/train.py \ - configs/models/videomaev2.py \ No newline at end of file + configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py \ No newline at end of file diff --git a/notebooks/dataset_normalization_analysis.ipynb b/notebooks/dataset_normalization_analysis.ipynb new file mode 100644 index 0000000..c6bfe46 --- /dev/null +++ b/notebooks/dataset_normalization_analysis.ipynb @@ -0,0 +1,665 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset normalization analysis\n", + "\n", + "In this notebook we load the train dataset and calculate the channel means and standard deviations. This is necessary for normalizing the data before passing it into the backbone." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11/27 19:43:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - \n", + "------------------------------------------------------------\n", + "System environment:\n", + " sys.platform: darwin\n", + " Python: 3.10.13 | packaged by conda-forge | (main, Oct 26 2023, 18:09:17) [Clang 16.0.6 ]\n", + " CUDA available: False\n", + " numpy_random_seed: 221150840\n", + " GCC: Apple clang version 15.0.0 (clang-1500.0.40.1)\n", + " PyTorch: 2.1.1\n", + " PyTorch compiling details: PyTorch built with:\n", + " - GCC 4.2\n", + " - C++ Version: 201703\n", + " - clang 13.1.6\n", + " - LAPACK is enabled (usually provided by MKL)\n", + " - NNPACK is enabled\n", + " - CPU capability usage: NO AVX\n", + " - Build settings: BLAS_INFO=accelerate, BUILD_TYPE=Release, CXX_COMPILER=/Applications/Xcode_13.3.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_PYTORCH_METAL_EXPORT -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DUSE_COREML_DELEGATE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wvla-extension -Wnewline-eof -Winconsistent-missing-override -Winconsistent-missing-destructor-override -Wno-range-loop-analysis -Wno-pass-failed -Wsuggest-override -Wno-error=pedantic -Wno-error=old-style-cast -Wno-error=inconsistent-missing-override -Wno-error=inconsistent-missing-destructor-override -Wconstant-conversion -Wno-invalid-partial-specialization -Wno-unused-private-field -Wno-missing-braces -Wunused-lambda-capture -Qunused-arguments -fcolor-diagnostics -faligned-new -Wno-unused-but-set-variable -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -DUSE_MPS -Wno-unused-private-field -Wno-missing-braces, LAPACK_INFO=accelerate, TORCH_DISABLE_GPU_ASSERTS=OFF, TORCH_VERSION=2.1.1, USE_CUDA=0, USE_CUDNN=OFF, USE_EIGEN_FOR_BLAS=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=OFF, USE_MKLDNN=OFF, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=ON, USE_OPENMP=OFF, USE_ROCM=OFF, \n", + "\n", + " TorchVision: 0.16.1\n", + " OpenCV: 4.8.1\n", + " MMEngine: 0.9.1\n", + "\n", + "Runtime environment:\n", + " cudnn_benchmark: False\n", + " mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}\n", + " dist_cfg: {'backend': 'nccl'}\n", + " seed: 221150840\n", + " Distributed launcher: none\n", + " Distributed training: False\n", + " GPU number: 1\n", + "------------------------------------------------------------\n", + "\n", + "11/27 19:43:17 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Config:\n", + "ann_file_test = 'data/Fall_Simulation_Data/annotations_test.csv'\n", + "ann_file_train = 'data/Fall_Simulation_Data/annotations_train.csv'\n", + "ann_file_val = 'data/Fall_Simulation_Data/annotations_val.csv'\n", + "custom_hooks = [\n", + " dict(enable=True, type='VisualizationHook'),\n", + "]\n", + "custom_imports = dict(\n", + " allow_failed_imports=False, imports=[\n", + " 'datasets',\n", + " ])\n", + "dataset_type = 'HighQualityFallDataset'\n", + "default_hooks = 
dict(\n", + " checkpoint=dict(\n", + " by_epoch=True,\n", + " interval=1,\n", + " max_keep_ckpts=5,\n", + " save_best='auto',\n", + " type='CheckpointHook'),\n", + " logger=dict(type='LoggerHook'),\n", + " param_scheduler=dict(type='ParamSchedulerHook'),\n", + " runtime_info=dict(type='RuntimeInfoHook'),\n", + " sampler_seed=dict(type='DistSamplerSeedHook'),\n", + " sync_buffers=dict(type='SyncBuffersHook'),\n", + " timer=dict(type='IterTimerHook'))\n", + "default_scope = 'mmaction'\n", + "env_cfg = dict(\n", + " cudnn_benchmark=False,\n", + " dist_cfg=dict(backend='nccl'),\n", + " mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))\n", + "label_strategy = dict(\n", + " label_description=dict(\n", + " end_timestamp_names=[\n", + " 'fall_end',\n", + " 'lying_end',\n", + " ],\n", + " names=[\n", + " 'fall',\n", + " 'lying',\n", + " 'other',\n", + " ],\n", + " other_class=2,\n", + " start_timestamp_names=[\n", + " 'fall_start',\n", + " 'lying_start',\n", + " ],\n", + " visible_names=[\n", + " 'fall_visible',\n", + " 'lying_visible',\n", + " ]),\n", + " type='ExistenceLabel')\n", + "launcher = 'none'\n", + "load_from = 'weights/vit-small-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-25c748fd.pth'\n", + "log_level = 'INFO'\n", + "log_processor = dict(by_epoch=True, type='LogProcessor', window_size=10)\n", + "model = dict(\n", + " backbone=dict(\n", + " depth=12,\n", + " embed_dims=384,\n", + " img_size=224,\n", + " mlp_ratio=4,\n", + " norm_cfg=dict(eps=1e-06, type='LN'),\n", + " num_frames=16,\n", + " num_heads=6,\n", + " patch_size=16,\n", + " qkv_bias=True,\n", + " type='VisionTransformer'),\n", + " cls_head=dict(\n", + " average_clips='prob',\n", + " in_channels=384,\n", + " multi_class=True,\n", + " num_classes=3,\n", + " type='TimeSformerHead'),\n", + " data_preprocessor=dict(\n", + " format_shape='NCTHW',\n", + " mean=[\n", + " 102.17311096191406,\n", + " 98.78225708007812,\n", + " 92.68714141845703,\n", + " ],\n", + " std=[\n", + " 58.04566192626953,\n", + " 57.004024505615234,\n", + " 57.3704948425293,\n", + " ],\n", + " type='ActionDataPreprocessor'),\n", + " type='Recognizer3D')\n", + "optim_wrapper = dict(\n", + " clip_grad=dict(max_norm=40, norm_type=2),\n", + " optimizer=dict(lr=0.01, momentum=0.9, type='SGD', weight_decay=0.0001),\n", + " type='OptimWrapper')\n", + "param_scheduler = dict(\n", + " begin=0,\n", + " by_epoch=True,\n", + " end=100,\n", + " gamma=0.1,\n", + " milestones=[\n", + " 40,\n", + " 80,\n", + " ],\n", + " type='MultiStepLR')\n", + "resume = False\n", + "sampling_strategy = dict(clip_len=10, type='UniformSampling')\n", + "test_cfg = dict(type='TestLoop')\n", + "test_dataloader = dict(\n", + " batch_size=1,\n", + " dataset=dict(\n", + " ann_file='data/Fall_Simulation_Data/annotations_test.csv',\n", + " label_strategy=dict(\n", + " label_description=dict(\n", + " end_timestamp_names=[\n", + " 'fall_end',\n", + " 'lying_end',\n", + " ],\n", + " names=[\n", + " 'fall',\n", + " 'lying',\n", + " 'other',\n", + " ],\n", + " other_class=2,\n", + " start_timestamp_names=[\n", + " 'fall_start',\n", + " 'lying_start',\n", + " ],\n", + " visible_names=[\n", + " 'fall_visible',\n", + " 'lying_visible',\n", + " ]),\n", + " type='ExistenceLabel'),\n", + " multi_class=True,\n", + " num_classes=3,\n", + " pipeline=[\n", + " dict(type='DecordInit'),\n", + " dict(\n", + " clip_len=16,\n", + " frame_interval=4,\n", + " num_clips=5,\n", + " test_mode=True,\n", + " type='SampleFrames'),\n", + " dict(type='DecordDecode'),\n", + " dict(scale=(\n", + " 
-1,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(crop_size=224, type='ThreeCrop'),\n", + " dict(input_format='NCTHW', type='FormatShape'),\n", + " dict(type='PackActionInputs'),\n", + " ],\n", + " sampling_strategy=dict(clip_len=10, type='UniformSampling'),\n", + " type='HighQualityFallDataset'),\n", + " num_workers=8,\n", + " persistent_workers=True,\n", + " sampler=dict(shuffle=False, type='DefaultSampler'))\n", + "test_evaluator = dict(type='AccMetric')\n", + "test_pipeline = [\n", + " dict(type='DecordInit'),\n", + " dict(\n", + " clip_len=16,\n", + " frame_interval=4,\n", + " num_clips=5,\n", + " test_mode=True,\n", + " type='SampleFrames'),\n", + " dict(type='DecordDecode'),\n", + " dict(scale=(\n", + " -1,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(crop_size=224, type='ThreeCrop'),\n", + " dict(input_format='NCTHW', type='FormatShape'),\n", + " dict(type='PackActionInputs'),\n", + "]\n", + "train_cfg = dict(max_epochs=100, type='EpochBasedTrainLoop', val_interval=3)\n", + "train_dataloader = dict(\n", + " batch_size=1,\n", + " dataset=dict(\n", + " ann_file='data/Fall_Simulation_Data/annotations_train.csv',\n", + " label_strategy=dict(\n", + " label_description=dict(\n", + " end_timestamp_names=[\n", + " 'fall_end',\n", + " 'lying_end',\n", + " ],\n", + " names=[\n", + " 'fall',\n", + " 'lying',\n", + " 'other',\n", + " ],\n", + " other_class=2,\n", + " start_timestamp_names=[\n", + " 'fall_start',\n", + " 'lying_start',\n", + " ],\n", + " visible_names=[\n", + " 'fall_visible',\n", + " 'lying_visible',\n", + " ]),\n", + " type='ExistenceLabel'),\n", + " multi_class=True,\n", + " num_classes=3,\n", + " pipeline=[\n", + " dict(type='DecordInit'),\n", + " dict(type='ClipVideo'),\n", + " dict(\n", + " clip_len=16,\n", + " frame_interval=4,\n", + " num_clips=1,\n", + " type='SampleFrames'),\n", + " dict(type='DecordDecode'),\n", + " dict(scale=(\n", + " -1,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(type='RandomResizedCrop'),\n", + " dict(keep_ratio=False, scale=(\n", + " 224,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(flip_ratio=0.5, type='Flip'),\n", + " dict(input_format='NCTHW', type='FormatShape'),\n", + " dict(type='PackActionInputs'),\n", + " ],\n", + " sampling_strategy=dict(clip_len=10, type='UniformSampling'),\n", + " type='HighQualityFallDataset'),\n", + " num_workers=8,\n", + " persistent_workers=True,\n", + " sampler=dict(shuffle=True, type='DefaultSampler'))\n", + "train_pipeline = [\n", + " dict(type='DecordInit'),\n", + " dict(type='ClipVideo'),\n", + " dict(clip_len=16, frame_interval=4, num_clips=1, type='SampleFrames'),\n", + " dict(type='DecordDecode'),\n", + " dict(scale=(\n", + " -1,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(type='RandomResizedCrop'),\n", + " dict(keep_ratio=False, scale=(\n", + " 224,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(flip_ratio=0.5, type='Flip'),\n", + " dict(input_format='NCTHW', type='FormatShape'),\n", + " dict(type='PackActionInputs'),\n", + "]\n", + "val_cfg = dict(type='ValLoop')\n", + "val_dataloader = dict(\n", + " batch_size=1,\n", + " dataset=dict(\n", + " ann_file='data/Fall_Simulation_Data/annotations_val.csv',\n", + " label_strategy=dict(\n", + " label_description=dict(\n", + " end_timestamp_names=[\n", + " 'fall_end',\n", + " 'lying_end',\n", + " ],\n", + " names=[\n", + " 'fall',\n", + " 'lying',\n", + " 'other',\n", + " ],\n", + " other_class=2,\n", + " start_timestamp_names=[\n", + " 'fall_start',\n", + " 'lying_start',\n", + " ],\n", + " visible_names=[\n", + " 
'fall_visible',\n", + " 'lying_visible',\n", + " ]),\n", + " type='ExistenceLabel'),\n", + " multi_class=True,\n", + " num_classes=3,\n", + " pipeline=[\n", + " dict(type='DecordInit'),\n", + " dict(type='ClipVideo'),\n", + " dict(\n", + " clip_len=16,\n", + " frame_interval=4,\n", + " num_clips=1,\n", + " test_mode=True,\n", + " type='SampleFrames'),\n", + " dict(type='DecordDecode'),\n", + " dict(scale=(\n", + " -1,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(crop_size=224, type='CenterCrop'),\n", + " dict(input_format='NCTHW', type='FormatShape'),\n", + " dict(type='PackActionInputs'),\n", + " ],\n", + " sampling_strategy=dict(clip_len=10, type='UniformSampling'),\n", + " type='HighQualityFallDataset'),\n", + " num_workers=8,\n", + " persistent_workers=True,\n", + " sampler=dict(shuffle=False, type='DefaultSampler'))\n", + "val_evaluator = dict(type='AccMetric')\n", + "val_pipeline = [\n", + " dict(type='DecordInit'),\n", + " dict(type='ClipVideo'),\n", + " dict(\n", + " clip_len=16,\n", + " frame_interval=4,\n", + " num_clips=1,\n", + " test_mode=True,\n", + " type='SampleFrames'),\n", + " dict(type='DecordDecode'),\n", + " dict(scale=(\n", + " -1,\n", + " 224,\n", + " ), type='Resize'),\n", + " dict(crop_size=224, type='CenterCrop'),\n", + " dict(input_format='NCTHW', type='FormatShape'),\n", + " dict(type='PackActionInputs'),\n", + "]\n", + "vis_backends = [\n", + " dict(type='DVCLiveVisBackend'),\n", + "]\n", + "visualizer = dict(\n", + " type='ActionVisualizer', vis_backends=[\n", + " dict(type='DVCLiveVisBackend'),\n", + " ])\n", + "work_dir = 'experiments'\n", + "\n", + "11/27 19:43:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.\n", + "11/27 19:43:19 - mmengine - \u001b[4m\u001b[97mINFO\u001b[0m - Hooks will be executed in the following order:\n", + "before_run:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "before_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_train_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) DistSamplerSeedHook \n", + " -------------------- \n", + "before_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_train_iter:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_train_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + "(LOW ) ParamSchedulerHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_val:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + " -------------------- \n", + "before_val_epoch:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) SyncBuffersHook \n", + " -------------------- \n", + "before_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_val_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) VisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_val_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + "(LOW ) ParamSchedulerHook \n", + 
"(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "after_val:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + " -------------------- \n", + "after_train:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(VERY_LOW ) CheckpointHook \n", + " -------------------- \n", + "before_test:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + " -------------------- \n", + "before_test_epoch:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "before_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + " -------------------- \n", + "after_test_iter:\n", + "(NORMAL ) IterTimerHook \n", + "(NORMAL ) VisualizationHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test_epoch:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + "(NORMAL ) IterTimerHook \n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "after_test:\n", + "(VERY_HIGH ) RuntimeInfoHook \n", + " -------------------- \n", + "after_run:\n", + "(BELOW_NORMAL) LoggerHook \n", + " -------------------- \n", + "11/27 19:43:19 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - Failed to search registry with scope \"mmaction\" in the \"sampling strategy\" registry tree. As a workaround, the current \"sampling strategy\" registry in \".\" is used to build instance. This may cause unexpected failure when running the built modules. Please check whether \"mmaction\" is a correct scope, or whether the registry is initialized.\n", + "11/27 19:43:19 - mmengine - \u001b[5m\u001b[4m\u001b[33mWARNING\u001b[0m - Failed to search registry with scope \"mmaction\" in the \"label strategy\" registry tree. As a workaround, the current \"label strategy\" registry in \".\" is used to build instance. This may cause unexpected failure when running the built modules. Please check whether \"mmaction\" is a correct scope, or whether the registry is initialized.\n" + ] + } + ], + "source": [ + "from mmengine.config import Config\n", + "from mmengine.runner import Runner\n", + "import torch\n", + "from tqdm import tqdm\n", + "\n", + "runner_cfg = Config.fromfile(\n", + " \"configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py\"\n", + ")\n", + "\n", + "runner = Runner.from_cfg(runner_cfg)\n", + "\n", + "train_dataloader_cfg = runner.cfg.train_dataloader\n", + "\n", + "# My formulas don't work in parallel :( (I think)\n", + "train_dataloader_cfg[\"num_workers\"] = 0\n", + "train_dataloader_cfg[\"persistent_workers\"] = False\n", + "\n", + "train_dataloader = runner.build_dataloader(train_dataloader_cfg, seed=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I use incremental mean and std calculation to avoid numerical instability.\n", + "\n", + "[Batch statistics ](https://notmatthancock.github.io/2017/03/23/simple-batch-stat-updates.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "class StatsRecorder:\n", + " def __init__(self, epsilon=1e-3) -> None:\n", + " self.nobservations = 0\n", + " self.epsilon = epsilon\n", + " self.threshold_counter = 0\n", + "\n", + " def update(self, x):\n", + " if self.nobservations == 0:\n", + " self.mean = x.mean(dim=1)\n", + " self.std = x.std(dim=1)\n", + " self.nobservations = x.shape[1]\n", + " else:\n", + " newmean = x.mean(dim=1)\n", + " newstd = x.std(dim=1)\n", + "\n", + " if torch.all(torch.abs(newmean - self.mean) < self.epsilon) and torch.all(\n", + " torch.abs(newstd - self.std) < self.epsilon\n", + " ):\n", + " self.threshold_counter += 1\n", + " 
if self.threshold_counter >= 10:\n", + " print(\"std and mean are not changing anymore\")\n", + " raise KeyboardInterrupt\n", + " else:\n", + " self.threshold_counter = 0\n", + "\n", + " m = self.nobservations * 1.0\n", + " n = x.shape[1]\n", + "\n", + " tmp = self.mean\n", + "\n", + " self.mean = m / (m + n) * tmp + n / (m + n) * newmean\n", + " self.std = (\n", + " m / (m + n) * self.std**2\n", + " + n / (m + n) * newstd**2\n", + " + m * n / (m + n) ** 2 * (tmp - newmean) ** 2\n", + " )\n", + " self.std = torch.sqrt(self.std)\n", + "\n", + " self.nobservations += n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/10764 [00:00
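For reference, `StatsRecorder.update` combines per-batch statistics with the standard pooled-moments recursion: for m previous observations with mean mu_m and std sigma_m, and a new batch of n observations with mean mu_n and std sigma_n, the update is mean' = m/(m+n) * mu_m + n/(m+n) * mu_n and var' = m/(m+n) * sigma_m^2 + n/(m+n) * sigma_n^2 + m*n/(m+n)^2 * (mu_m - mu_n)^2. A minimal self-contained check of that recursion (random tensors stand in for the real frame batches; this is not repository code):

```python
import torch

torch.manual_seed(0)
x = torch.rand(3, 10_000)  # 3 channels, 10k pixel observations per channel

mean, var, n = torch.zeros(3), torch.zeros(3), 0
for chunk in x.split(2_500, dim=1):  # feed the data in four batches
    m, s, k = chunk.mean(dim=1), chunk.std(dim=1), chunk.shape[1]
    if n == 0:
        mean, var, n = m, s**2, k
    else:
        new_mean = n / (n + k) * mean + k / (n + k) * m
        var = (
            n / (n + k) * var
            + k / (n + k) * s**2
            + n * k / (n + k) ** 2 * (mean - m) ** 2
        )
        mean, n = new_mean, n + k

# The incremental estimates agree with the direct computation
# (up to the small bias from pooling Bessel-corrected batch stds).
assert torch.allclose(mean, x.mean(dim=1), atol=1e-4)
assert torch.allclose(var.sqrt(), x.std(dim=1), atol=1e-3)
```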