diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f139efe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,54 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Project Specific
+models/*.weights
+models/*.pth
+data/*
+data/*.csv
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e189569
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Torben Teepe
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 43adf3c..4ba1a1f 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,27 @@
 This repository contains the PyTorch code for:
 GaitGraph: Graph Convolutional Network for Skeleton-Based Gait Recognition
 [Torben Teepe](https://github.com/tteepe), Ali Khan, Johannes Gilg, [Fabian Herzog](https://github.com/fubel)
+![Pipeline](images/pipeline.png)
+
+## Quick Start
+Quick Start & models coming soon!
+
+## Main Results
+Top-1 accuracy per probe angle, excluding identical-view cases, for the provided models on the
+[CASIA-B](http://www.cbsr.ia.ac.cn/english/Gait%20Databases.asp) dataset.
+
+|        |    0 |   18 |   36 |   54 |   72 |   90 |   108 |   126 |   144 |   162 |   180 |   mean |
+|:-------|-----:|-----:|-----:|-----:|-----:|-----:|------:|------:|------:|------:|------:|-------:|
+| NM#5-6 | 85.3 | 88.5 | 91.0 | 92.5 | 87.2 | 86.5 |  88.4 |  89.2 |  87.9 |  85.9 |  81.9 |   87.7 |
+| BG#1-2 | 75.8 | 76.7 | 75.9 | 76.1 | 71.4 | 73.9 |  78.0 |  74.7 |  75.4 |  75.4 |  69.2 |   74.8 |
+| CL#1-2 | 69.6 | 66.1 | 68.8 | 67.2 | 64.5 | 62.0 |  69.5 |  65.6 |  65.7 |  66.1 |  64.3 |   66.3 |
+
+## Licence & Acknowledgement
+GaitGraph itself is released under the MIT License (see LICENSE).
+
+The following parts of the code are borrowed from other projects. Thanks for their wonderful work!
+- Object Detector: [eriklindernoren/PyTorch-YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3)
+- Pose Estimator: [HRNet/HRNet-Human-Pose-Estimation](https://github.com/HRNet/HRNet-Human-Pose-Estimation)
+- ST-GCN Model: [yysijie/st-gcn](https://github.com/yysijie/st-gcn)
+- ResGCNv1 Model: [yfsong0709/ResGCNv1](https://github.com/yfsong0709/ResGCNv1)
+- SupCon Loss: [HobbitLong/SupContrast](https://github.com/HobbitLong/SupContrast)
diff --git a/images/pipeline.png b/images/pipeline.png
new file mode 100644
index 0000000..a49cb72
Binary files /dev/null and b/images/pipeline.png differ
diff --git a/models/download_weights.sh b/models/download_weights.sh
new file mode 100644
index 0000000..39aec9a
--- /dev/null
+++ b/models/download_weights.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Download weights for vanilla YOLOv3
+wget -c https://pjreddie.com/media/files/yolov3.weights
+# Download weights for tiny YOLOv3
+wget -c https://pjreddie.com/media/files/yolov3-tiny.weights
+## Download weights for backbone network
+#wget -c https://pjreddie.com/media/files/darknet53.conv.74
+
+echo "#############################################################"
+echo "######## Weights for HRNet Pose Estimation need to ##########"
+echo "######## be downloaded manually from here: ##########"
+echo "######## https://drive.google.com/drive/folders/1nzM_OBV9LbAEA7HClC0chEyf_7ECDXYA"
+echo "######## Files: pose_hrnet_*.pth ##########"
+echo "#############################################################"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..71200c6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+yacs==0.1.8
+numpy==1.19.5
+torch==1.7.1
+torchvision==0.8.2
+matplotlib==3.3.3
+tabulate==0.8.7
+tensorflow==2.4.0
+tensorboard==2.4.0
+pillow==8.1.0
+tqdm==4.56.0
+opencv-python~=4.5
+jupyter==1.0.0
+pandas==1.1.0
diff --git a/save/.gitkeep b/save/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/src/common.py b/src/common.py
new file mode 100644
index 0000000..6b7ba22
--- /dev/null
+++ b/src/common.py
@@ -0,0 +1,171 @@
+import os
+import argparse
+import torch
+from models.st_gcn.st_gcn import STGCNEmbedding
+import models.ResGCNv1
+
+
+def parse_option():
+    parser = argparse.ArgumentParser(description="Training model on gait sequence")
+    parser.add_argument("dataset", choices=["casia-b", "outdoor-gait", "tum-gaid"])
+    parser.add_argument("train_data_path", help="Path to train data CSV")
+    parser.add_argument("--valid_data_path", help="Path to validation data CSV")
+    parser.add_argument("--valid_split", type=float, default=0.2)
+
+    parser.add_argument("--checkpoint_path", help="Path to checkpoint to resume")
+    parser.add_argument("--weight_path", help="Path to weights for model")
+
+    # Optionals
+    parser.add_argument("--num_workers", type=int, default=8)
+    parser.add_argument(
+        "--gpus", default="0", help="-1 for CPU, use comma for multiple gpus"
+    )
+    parser.add_argument("--batch_size", type=int, default=64)
+    parser.add_argument("--batch_size_validation", type=int, default=64)
+    parser.add_argument("--epochs", type=int, default=500)
+    parser.add_argument("--start_epoch", type=int, default=1)
+    parser.add_argument("--log_interval", type=int, default=10)
+    parser.add_argument("--save_interval", type=int, default=50, help="save frequency")
+    parser.add_argument(
+        "--save_best_start", type=float, default=0.3, help="accuracy above which the best model is saved"
+    )
+    parser.add_argument("--use_amp", action="store_true")
+    parser.add_argument("--tune", action="store_true")
+
parser.add_argument("--shuffle", action="store_true") + parser.add_argument("--exp_name", help="Name of the experiment") + + parser.add_argument("--network_name", default="resgcn-n39-r4") + parser.add_argument("--sequence_length", type=int, default=60) + parser.add_argument("--embedding_layer_size", type=int, default=256) + parser.add_argument("--temporal_kernel_size", type=int, default=9) + parser.add_argument("--dropout", type=float, default=0.4) + parser.add_argument("--learning_rate", type=float, default=1e-3) + parser.add_argument( + "--lr_decay_rate", type=float, default=0.1, help="decay rate for learning rate" + ) + parser.add_argument("--point_noise_std", type=float, default=0.05) + parser.add_argument("--joint_noise_std", type=float, default=0.1) + parser.add_argument("--flip_probability", type=float, default=0.5) + parser.add_argument("--mirror_probability", type=float, default=0.5) + parser.add_argument("--weight_decay", type=float, default=1e-5) + parser.add_argument("--use_multi_branch", action="store_true") + parser.add_argument( + "--temp", type=float, default=0.07, help="temperature for loss function" + ) + opt = parser.parse_args() + + # Sanitize opts + opt.gpus_str = opt.gpus + opt.gpus = [int(gpu) for gpu in opt.gpus.split(",")] + + return opt + + +def log_hyperparameter(writer, opt, accuracy, loss): + writer.add_hparams( + { + "batch_size": opt.batch_size, + "sequence_length": opt.sequence_length, + "embedding_layer_size": opt.embedding_layer_size, + "dropout": opt.dropout, + "learning_rate": opt.learning_rate, + "lr_decay_rate": opt.lr_decay_rate, + "point_noise_std": opt.point_noise_std, + "weight_decay": opt.weight_decay, + "temp": opt.temp, + }, + { + "hparam/accuracy": accuracy, + "hparam/loss": loss, + }, + ) + + +def setup_environment(opt): + # HACK: Fix tensorboard + import tensorflow as tf + import tensorboard as tb + + tf.io.gfile = tb.compat.tensorflow_stub.io.gfile + + os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpus_str + opt.cuda = opt.gpus[0] >= 0 + torch.device("cuda" if opt.cuda else "cpu") + + return opt + + +def get_model_stgcn(opt): + # Model + input_channels = 3 + edge_importance_weighting = True + graph_args = {"strategy": "spatial"} + + embedding_net = STGCNEmbedding( + input_channels, + graph_args, + edge_importance_weighting=edge_importance_weighting, + embedding_layer_size=opt.embedding_layer_size, + temporal_kernel_size=opt.temporal_kernel_size, + dropout=opt.dropout, + ) + + return embedding_net + + +def get_model_resgcn(graph, opt): + model_args = { + "A": torch.tensor(graph.A, dtype=torch.float32, requires_grad=False), + "num_class": opt.embedding_layer_size, + "num_input": 1 if not opt.use_multi_branch else 3, + "num_channel": 3 if not opt.use_multi_branch else 6, + "parts": graph.parts, + } + return models.ResGCNv1.create(opt.network_name, **model_args) + + +def get_trainer(model, opt, steps_per_epoch): + optimizer = torch.optim.Adam( + model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay + ) + scheduler = torch.optim.lr_scheduler.OneCycleLR( + optimizer, opt.learning_rate, epochs=opt.epochs, steps_per_epoch=steps_per_epoch + ) + scaler = torch.cuda.amp.GradScaler(enabled=opt.use_amp) + + return optimizer, scheduler, scaler + + +def load_checkpoint(model, optimizer, scheduler, scaler, opt): + if opt.checkpoint_path is not None: + checkpoint = torch.load(opt.checkpoint_path) + model.load_state_dict(checkpoint["model"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + scheduler.load_state_dict(checkpoint["scheduler"]) 
+ scaler.load_state_dict(checkpoint["scaler"]) + opt.start_epoch = checkpoint["epoch"] + + if opt.weight_path is not None: + checkpoint = torch.load(opt.weight_path) + model.load_state_dict(checkpoint["model"], strict=False) + + +def save_model(model, optimizer, scheduler, scaler, opt, epoch, save_file): + print("==> Saving...") + state = { + "opt": opt, + "model": model.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "scaler": scaler.state_dict(), + "epoch": epoch, + } + torch.save(state, save_file) + del state + + +def count_parameters(model): + """ + Useful function to compute number of parameters in a model. + """ + return sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py new file mode 100644 index 0000000..149424a --- /dev/null +++ b/src/datasets/__init__.py @@ -0,0 +1,11 @@ +from .preparation import DatasetSimple, DatasetDetections +from .gait import ( + CasiaBPose, +) + + +def dataset_factory(name): + if name == "casia-b": + return CasiaBPose + + raise ValueError() diff --git a/src/datasets/augmentation.py b/src/datasets/augmentation.py new file mode 100644 index 0000000..be2d495 --- /dev/null +++ b/src/datasets/augmentation.py @@ -0,0 +1,282 @@ +import numpy as np +import cv2 +import torch + +from pose_estimator.utils import get_affine_transform + + +class ToTensor(object): + def __call__(self, data): + return torch.tensor(data, dtype=torch.float) + + +class MultiInput(object): + def __init__(self, connect_joint, enabled=False): + self.connect_joint = connect_joint + self.enabled = enabled + + def __call__(self, data): + # (C, T, V) -> (I, C * 2, T, V) + data = np.transpose(data, (2, 0, 1)) + + if not self.enabled: + return data[np.newaxis, ...] 
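+        # Multi-branch input (as in ResGCN): build joint, velocity and bone branches,
+        # each with 2 * C channels, giving an array of shape (3, 2 * C, T, V).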
+
+        C, T, V = data.shape
+        data_new = np.zeros((3, C * 2, T, V))
+        # Joints
+        data_new[0, :C, :, :] = data
+        for i in range(V):
+            data_new[0, C:, :, i] = data[:, :, i] - data[:, :, 1]
+        # Velocity
+        for i in range(T - 2):
+            data_new[1, :C, i, :] = data[:, i + 1, :] - data[:, i, :]
+            data_new[1, C:, i, :] = data[:, i + 2, :] - data[:, i, :]
+        # Bones
+        for i in range(len(self.connect_joint)):
+            data_new[2, :C, :, i] = data[:, :, i] - data[:, :, self.connect_joint[i]]
+        bone_length = 0
+        for i in range(C - 1):
+            bone_length += np.power(data_new[2, i, :, :], 2)
+        bone_length = np.sqrt(bone_length) + 0.0001
+        for i in range(C - 1):
+            data_new[2, C + i, :, :] = np.arccos(data_new[2, i, :, :] / bone_length)
+
+        return data_new
+
+
+class FlipSequence(object):
+    def __init__(self, probability=0.5):
+        self.probability = probability
+
+    def __call__(self, data):
+        if np.random.random() <= self.probability:
+            return np.flip(data, axis=0).copy()
+        return data
+
+
+class MirrorPoses(object):
+    def __init__(self, probability=0.5):
+        self.probability = probability
+
+    def __call__(self, data):
+        if np.random.random() <= self.probability:
+            center = np.mean(data[:, :, 0], axis=1, keepdims=True)
+            data[:, :, 0] = center - data[:, :, 0] + center
+
+        return data
+
+
+class RandomSelectSequence(object):
+    def __init__(self, sequence_length=10):
+        self.sequence_length = sequence_length
+
+    def __call__(self, data):
+        try:
+            start = np.random.randint(0, data.shape[0] - self.sequence_length)
+        except ValueError:
+            print(data.shape[0])
+            raise ValueError
+        end = start + self.sequence_length
+        return data[start:end]
+
+
+class SelectSequenceCenter(object):
+    def __init__(self, sequence_length=10):
+        self.sequence_length = sequence_length
+
+    def __call__(self, data):
+        try:
+            start = int((data.shape[0] / 2) - (self.sequence_length / 2))
+        except ValueError:
+            print(data.shape[0])
+            raise ValueError
+        end = start + self.sequence_length
+        return data[start:end]
+
+
+class ShuffleSequence(object):
+    def __init__(self, enabled=False):
+        self.enabled = enabled
+
+    def __call__(self, data):
+        if self.enabled:
+            np.random.shuffle(data)
+        return data
+
+
+class TwoNoiseTransform(object):
+    """Create two augmented views of the same sequence"""
+    def __init__(self, transform):
+        self.transform = transform
+
+    def __call__(self, x):
+        return [self.transform(x), self.transform(x)]
+
+
+class PointNoise(object):
+    """
+    Add Gaussian noise to pose points
+    std: standard deviation
+    """
+
+    def __init__(self, std=0.15):
+        self.std = std
+
+    def __call__(self, data):
+        noise = np.random.normal(0, self.std, data.shape).astype(np.float32)
+        return data + noise
+
+
+class JointNoise(object):
+    """
+    Add Gaussian noise to joint
+    std: standard deviation
+    """
+
+    def __init__(self, std=0.5):
+        self.std = std
+
+    def __call__(self, data):
+        # T, V, C
+        noise = np.hstack((
+            np.random.normal(0, 0.25, (data.shape[1], 2)),
+            np.zeros((data.shape[1], 1))
+        )).astype(np.float32)
+
+        return data + np.repeat(noise[np.newaxis, ...], data.shape[0], axis=0)
+
+
+class DropOutFrames(object):
+    """
+    Type of data augmentation. Randomly drop frames from a sequence.
+    Properties:
+     probability: Probability used when deciding whether to drop a frame.
+     sequence_length: Minimum number of frames to keep in the sequence.
+ """ + + def __init__(self, probability=0.1, sequence_length=60): + self.probability = probability + self.sequence_length = sequence_length + + def __call__(self, data): + T, V, C = data.shape + + new_data = [] + dropped = 0 + for i in range(T): + if np.random.random() <= self.probability: + new_data.append(data[i]) + else: + dropped += 1 + if T - dropped <= self.sequence_length: + break + + for j in range(i, T): + new_data.append(data[j]) + + return np.array(new_data) + + +class DropOutJoints(object): + """ + Type of data augmentation. Zero joints randomly from a pose. + Properties: + dropout_rate_range: + prob: Probability that this technique is applied on a sample. + """ + + def __init__( + self, prob=1, dropout_rate_range=0.1, + ): + self.dropout_rate_range = dropout_rate_range + self.prob = prob + + def __call__(self, data): + if np.random.binomial(1, self.prob, 1) != 1: + return data + + T, V, C = data.shape + data = data.reshape(T * V, C) + # Choose the dropout_rate randomly for every sample from 0 - dropout range + dropout_rate = np.random.uniform(0, self.dropout_rate_range, 1) + zero_indices = 1 - np.random.binomial(1, dropout_rate, T * V) + for i in range(3): + data[:, i] = zero_indices * data[:, i] + data = data.reshape(T, V, C) + return data + + +class InterpolateFrames(object): + """ + Type of data augmentation. Create more frames between adjacent frames by interpolation + """ + + def __init__(self, probability=0.1): + """ + :param probability: The probability with which this augmentation technique will be applied + """ + self.probability = probability + + def __call__(self, data): + # data shape is T,V,C = Frames, Joints, Channels (X,Y,conf) + T, V, C = data.shape + + # interpolated_data = np.zeros((T + T - 1, V, C), dtype=np.float32) + interpolated_data = [] + for i in range(T): + # Add original frame + interpolated_data.append(data[i]) + + # Skip last + if i == T - 1: + break + + if np.random.random() <= self.probability: + continue + + # Calculate difference between x and y points of each joint of current frame and current frame plus 1 + x_difference = data[i + 1, :, 0] - data[i, :, 0] + y_difference = data[i + 1, :, 1] - data[i, :, 1] + + new_frame_x = ( + data[i, :, 0] + (x_difference * np.random.normal(0.5, 1)) + ) + new_frame_y = ( + data[i, :, 1] + (y_difference * np.random.normal(0.5, 1)) + ) + # Take average of conf of current and next frame to find the conf of the interpolated frame + new_frame_conf = (data[i + 1, :, 2] + data[i, :, 2]) / 2 + interpolated_frame = np.array( + [new_frame_x, new_frame_y, new_frame_conf] + ).transpose() + + interpolated_data.append(interpolated_frame) + + return np.array(interpolated_data) + + +class CropToBox(object): + """Crop image to detection box + """ + + def __init__(self, config): + self.config = config + + def __call__(self, img, center, scale): + rotation = 0 + # pose estimation transformation + trans = get_affine_transform( + center, scale, rotation, self.config.MODEL.IMAGE_SIZE + ) + model_input = cv2.warpAffine( + np.array(img), + trans, + ( + int(self.config.MODEL.IMAGE_SIZE[0]), + int(self.config.MODEL.IMAGE_SIZE[1]), + ), + flags=cv2.INTER_LINEAR, + ) + + return model_input diff --git a/src/datasets/gait.py b/src/datasets/gait.py new file mode 100644 index 0000000..7753d51 --- /dev/null +++ b/src/datasets/gait.py @@ -0,0 +1,129 @@ +import numpy as np +from torch.utils.data import Dataset + + +class PoseDataset(Dataset): + """ + Args: + data_list_path (string): Path to pose data. 
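+            Each row is expected to hold a filename followed by 51 pose values
+            (17 joints x (x, y, confidence)); malformed rows are skipped.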
sequence_length: Number of pose frames returned for each data point.
+        train: Whether this is the training split (default: True).
+        transform: Transformation applied to the pose data.
+        target_transform: Transformation applied to the target.
+    """
+
+    def __init__(
+        self,
+        data_list_path,
+        sequence_length=1,
+        train=True,
+        transform=None,
+        target_transform=None,
+    ):
+        super(PoseDataset, self).__init__()
+        self.data_list = np.loadtxt(data_list_path, skiprows=1, dtype=str)
+        self.sequence_length = sequence_length
+        self.train = train
+
+        self.transform = transform
+        self.target_transform = target_transform
+
+        self.data_dict = {}
+
+        for row in self.data_list:
+            row = row.split(",")
+
+            target, frame_num = self._filename_to_target(row[0])
+
+            if target not in self.data_dict:
+                self.data_dict[target] = {}
+
+            if len(row[1:]) != 51:
+                print("Invalid pose data for: ", target, ", frame: ", frame_num)
+                continue
+            # Skip the frame if the joint values cannot be parsed.
+            try:
+                self.data_dict[target][frame_num] = np.array(
+                    row[1:], dtype=np.float32
+                ).reshape((-1, 3))
+            except ValueError:
+                print("Invalid pose data for: ", target, ", frame: ", frame_num)
+                continue
+
+        # Remove samples that have fewer than sequence_length + 1 frames.
+        for target, sequence in self.data_dict.copy().items():
+            if len(sequence) < self.sequence_length + 1:
+                del self.data_dict[target]
+
+        self.targets = list(self.data_dict.keys())
+
+        self.data = list(self.data_dict.values())
+
+    def _filename_to_target(self, filename):
+        raise NotImplementedError()
+
+    def __len__(self):
+        return len(self.targets)
+
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+
+        Returns:
+            tuple: (pose, target) where target identifies the sample (e.g. subject id and sequence metadata).
+        """
+        target = self.targets[index]
+        data = np.stack(list(self.data[index].values()))
+
+        if self.transform is not None:
+            data = self.transform(data)
+
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+
+        return data, target
+
+    def get_num_classes(self):
+        """
+        Returns the number of unique ids present in the dataset. Useful for classification networks.
+        """
+        if type(self.targets[0]) == int:
+            classes = set(self.targets)
+        else:
+            classes = set([target[0] for target in self.targets])
+        num_classes = len(classes)
+        return num_classes
+
+
+class CasiaBPose(PoseDataset):
+    """
+    CASIA-B Dataset
+    The format of the video filename in Dataset B is 'xxx-mm-nn-ttt.avi', where
+    xxx: subject id, from 001 to 124.
+    mm: walking status, can be 'nm' (normal), 'cl' (in a coat) or 'bg' (with a bag).
+    nn: sequence number.
+    ttt: view angle, can be '000', '018', ..., '180'.
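+    For example (illustrative path), a pose CSV row whose filename is
+    'frames/001-nm-01-090/000050.jpg' is parsed by _filename_to_target into the
+    target (subject 1, walking status 'nm' -> 0, sequence 1, view 90) and frame number 50.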
+ """ + + mapping_walking_status = { + 'nm': 0, + 'bg': 1, + 'cl': 2, + } + + def _filename_to_target(self, filename): + _, sequence_id, frame = filename.split("/") + subject_id, walking_status, sequence_num, view_angle = sequence_id.split("-") + walking_status = self.mapping_walking_status[walking_status] + return ( + (int(subject_id), int(walking_status), int(sequence_num), int(view_angle)), + int(frame[:-4]), + ) + + +class KinectGait(PoseDataset): + def _filename_to_target(self, filename): + subject_id, sequence_num, frame = filename.split("-") + return (int(subject_id), int(sequence_num)), int(frame) diff --git a/src/datasets/graph.py b/src/datasets/graph.py new file mode 100644 index 0000000..253cf57 --- /dev/null +++ b/src/datasets/graph.py @@ -0,0 +1,187 @@ +import logging, numpy as np + + +# Thanks to YAN Sijie for the released code on Github (https://github.com/yysijie/st-gcn) +class Graph(): + def __init__(self, dataset, max_hop=3, dilation=1): + self.dataset = dataset.split('-')[0] + self.max_hop = max_hop + self.dilation = dilation + + # get edges + self.num_node, self.edge, self.connect_joint, self.parts = self._get_edge() + + # get adjacency matrix + self.A = self._get_adjacency() + + def __str__(self): + return self.A + + def _get_edge(self): + if self.dataset == 'kinetics': + num_node = 18 + neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), + (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1), + (0, 1), (15, 0), (14, 0), (17, 15), (16, 14), (8, 11)] + connect_joint = np.array([1,1,1,2,3,1,5,6,2,8,9,5,11,12,0,0,14,15]) + parts = [ + np.array([5, 6, 7]), # left_arm + np.array([2, 3, 4]), # right_arm + np.array([11, 12, 13]), # left_leg + np.array([8, 9, 10]), # right_leg + np.array([0, 1, 14, 15, 16, 17]) # torso + ] + elif self.dataset == 'ntu': + num_node = 25 + neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), + (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), + (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), + (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), + (22, 23), (23, 8), (24, 25), (25, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([2,2,21,3,21,5,6,7,21,9,10,11,1,13,14,15,1,17,18,19,2,23,8,25,12]) - 1 + parts = [ + np.array([5, 6, 7, 8, 22, 23]) - 1, # left_arm + np.array([9, 10, 11, 12, 24, 25]) - 1, # right_arm + np.array([13, 14, 15, 16]) - 1, # left_leg + np.array([17, 18, 19, 20]) - 1, # right_leg + np.array([1, 2, 3, 4, 21]) - 1 # torso + ] + elif self.dataset == 'sysu': + num_node = 20 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (3, 5), (5, 6), + (6, 7), (7, 8), (3, 9), (9, 10), (10, 11), + (11, 12), (1, 13), (13, 14), (14, 15), (15, 16), + (1, 17), (17, 18), (18, 19), (19, 20)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([2,2,2,3,3,5,6,7,3,9,10,11,1,13,14,15,1,17,18,19]) - 1 + parts = [ + np.array([5, 6, 7, 8]) - 1, # left_arm + np.array([9, 10, 11, 12]) - 1, # right_arm + np.array([13, 14, 15, 16]) - 1, # left_leg + np.array([17, 18, 19, 20]) - 1, # right_leg + np.array([1, 2, 3, 4]) - 1 # torso + ] + elif self.dataset == 'ucla': + num_node = 20 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (3, 5), (5, 6), + (6, 7), (7, 8), (3, 9), (9, 10), (10, 11), + (11, 12), (1, 13), (13, 14), (14, 15), (15, 16), + (1, 17), (17, 18), (18, 19), (19, 20)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([2,2,2,3,3,5,6,7,3,9,10,11,1,13,14,15,1,17,18,19]) - 1 + parts = [ + np.array([5, 6, 7, 8]) - 1, # left_arm + 
np.array([9, 10, 11, 12]) - 1, # right_arm + np.array([13, 14, 15, 16]) - 1, # left_leg + np.array([17, 18, 19, 20]) - 1, # right_leg + np.array([1, 2, 3, 4]) - 1 # torso + ] + elif self.dataset == 'cmu': + num_node = 26 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (5, 6), (6, 7), + (7, 8), (1, 9), (5, 9), (9, 10), (10, 11), + (11, 12), (12, 13), (13, 14), (12, 15), (15, 16), + (16, 17), (17, 18), (18, 19), (17, 20), (12, 21), + (21, 22), (22, 23), (23, 24), (24, 25), (23, 26)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([9,1,2,3,9,5,6,7,10,10,10,11,12,13,12,15,16,17,18,17,12,21,22,23,24,23]) - 1 + parts = [ + np.array([15, 16, 17, 18, 19, 20]) - 1, # left_arm + np.array([21, 22, 23, 24, 25, 26]) - 1, # right_arm + np.array([1, 2, 3, 4]) - 1, # left_leg + np.array([5, 6, 7, 8]) - 1, # right_leg + np.array([9, 10, 11, 12, 13, 14]) - 1 # torso + ] + elif self.dataset == 'h36m': + num_node = 20 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (5, 6), (6, 7), + (7, 8), (1, 9), (5, 9), (9, 10), (10, 11), + (11, 12), (10, 13), (13, 14), (14, 15), (15, 16), + (10, 17), (17, 18), (18, 19), (19, 20)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([9,1,2,3,9,5,6,7,9,9,10,11,10,13,14,15,10,17,18,19]) - 1 + parts = [ + np.array([13, 14, 15, 16]) - 1, # left_arm + np.array([17, 18, 19, 20]) - 1, # right_arm + np.array([1, 2, 3, 4]) - 1, # left_leg + np.array([5, 6, 7, 8]) - 1, # right_leg + np.array([9, 10, 11, 12]) - 1 # torso + ] + elif self.dataset == 'coco': + # keypoints = { + # 0: "nose", + # 1: "left_eye", + # 2: "right_eye", + # 3: "left_ear", + # 4: "right_ear", + # 5: "left_shoulder", + # 6: "right_shoulder", + # 7: "left_elbow", + # 8: "right_elbow", + # 9: "left_wrist", + # 10: "right_wrist", + # 11: "left_hip", + # 12: "right_hip", + # 13: "left_knee", + # 14: "right_knee", + # 15: "left_ankle", + # 16: "right_ankle" + # } + num_node = 17 + self_link = [(i, i) for i in range(num_node)] + neighbor_link = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 6), + (5, 7), (7, 9), (6, 8), (8, 10), (5, 11), (6, 12), (11, 12), + (11, 13), (13, 15), (12, 14), (14, 16)] + self.edge = self_link + neighbor_link + self.center = 0 + connect_joint = np.array([5,0,0,1,2,0,0,5,6,7,8,5,6,11,12,13,14]) + parts = [ + np.array([5, 7, 9]), # left_arm + np.array([6, 8, 10]), # right_arm + np.array([11, 13, 15]), # left_leg + np.array([12, 14, 16]), # right_leg + np.array([5, 6, 11, 12, 0, 1, 2, 3, 4]), # torso + head + ] + else: + num_node, neighbor_link, connect_joint, parts = 0, [], [], [] + logging.info('') + logging.error('Error: Do NOT exist this dataset: {}!'.format(self.dataset)) + raise ValueError() + self_link = [(i, i) for i in range(num_node)] + edge = self_link + neighbor_link + return num_node, edge, connect_joint, parts + + def _get_hop_distance(self): + A = np.zeros((self.num_node, self.num_node)) + for i, j in self.edge: + A[j, i] = 1 + A[i, j] = 1 + hop_dis = np.zeros((self.num_node, self.num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(self.max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(self.max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + def _get_adjacency(self): + hop_dis = self._get_hop_distance() + valid_hop = range(0, self.max_hop + 1, self.dilation) + adjacency = np.zeros((self.num_node, self.num_node)) + for hop in valid_hop: + adjacency[hop_dis == hop] = 1 + normalize_adjacency = self._normalize_digraph(adjacency) + A = 
np.zeros((len(valid_hop), self.num_node, self.num_node)) + for i, hop in enumerate(valid_hop): + A[i][hop_dis == hop] = normalize_adjacency[hop_dis == hop] + return A + + def _normalize_digraph(self, A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD diff --git a/src/datasets/preparation.py b/src/datasets/preparation.py new file mode 100644 index 0000000..4c4e703 --- /dev/null +++ b/src/datasets/preparation.py @@ -0,0 +1,99 @@ +import os + +import numpy as np +from PIL import Image +from torch.utils.data import Dataset + + +class DatasetSimple(Dataset): + """ + Args: + root (string): Root directory path. + frame_list_path (string): Frame list path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. + E.g, ``transforms.RandomCrop`` for images. + sample_transform (callable, optional): A function/transform that takes + in the target and transforms it. + """ + + def __init__(self, root, frame_list_path, transform=None, sample_transform=None): + self.root = root + self.frame_list = np.loadtxt(frame_list_path, skiprows=1, dtype=str) + self.transform = transform + self.sample_transform = sample_transform + + def __len__(self): + return len(self.frame_list) + + def __getitem__(self, index): + image_name = self.frame_list[index] + image_path = os.path.join(self.root, image_name) + + with open(image_path, "rb") as f: + img = Image.open(f) + img.convert("RGB") + + if self.transform: + img = self.transform(img) + + return img, image_name + + +def box_to_center_scale(box, model_image_width, model_image_height): + """convert a box to center,scale information required for pose transformation + Parameters + ---------- + box : list | ndarray + model_image_width : int + model_image_height : int + + Returns + ------- + (numpy array, numpy array) + Two numpy arrays, coordinates for the center of the box and the scale of the box + """ + center = np.zeros(2, dtype=np.float32) + + top_left_corner = box[0:2] + box_width = box[2] + box_height = box[3] + center[0] = top_left_corner[0] + box_width * 0.5 + center[1] = top_left_corner[1] + box_height * 0.5 + + aspect_ratio = model_image_width * 1.0 / model_image_height + pixel_std = 200 + + if box_width > aspect_ratio * box_height: + box_height = box_width * 1.0 / aspect_ratio + elif box_width < aspect_ratio * box_height: + box_width = box_height * aspect_ratio + scale = np.array( + [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std], + dtype=np.float32) + if center[0] != -1: + scale = scale * 1.25 + + return center, scale + + +class DatasetDetections(DatasetSimple): + def __getitem__(self, index): + frame_info = self.frame_list[index].split(",") + image_name = frame_info[0] + image_path = os.path.join(self.root, image_name) + + box = np.array(frame_info[1:], dtype=np.float32) + center, scale = box_to_center_scale(box, 288, 384) + + with open(image_path, "rb") as f: + img = Image.open(f) + img.convert("RGB") + + if self.sample_transform: + img = self.sample_transform(img, center, scale) + + if self.transform: + img = self.transform(img) + + return img, image_name, (center, scale) diff --git a/src/detector/README.md b/src/detector/README.md new file mode 100644 index 0000000..71d2a4a --- /dev/null +++ b/src/detector/README.md @@ -0,0 +1,2 @@ +## Detector PyTorch-YOLOv3 +This part is borrowed from 
[eriklindernoren/PyTorch-YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) diff --git a/src/detector/config/yolov3-tiny.cfg b/src/detector/config/yolov3-tiny.cfg new file mode 100644 index 0000000..ade4969 --- /dev/null +++ b/src/detector/config/yolov3-tiny.cfg @@ -0,0 +1,206 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=2 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +# 0 +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +# 1 +[maxpool] +size=2 +stride=2 + +# 2 +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# 3 +[maxpool] +size=2 +stride=2 + +# 4 +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +# 5 +[maxpool] +size=2 +stride=2 + +# 6 +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +# 7 +[maxpool] +size=2 +stride=2 + +# 8 +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +# 9 +[maxpool] +size=2 +stride=2 + +# 10 +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +# 11 +[maxpool] +size=2 +stride=1 + +# 12 +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +# 13 +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +# 14 +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +# 15 +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +# 16 +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +# 17 +[route] +layers = -4 + +# 18 +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +# 19 +[upsample] +stride=2 + +# 20 +[route] +layers = -1, 8 + +# 21 +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +# 22 +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +# 23 +[yolo] +mask = 1,2,3 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/src/detector/config/yolov3.cfg b/src/detector/config/yolov3.cfg new file mode 100644 index 0000000..946e015 --- /dev/null +++ b/src/detector/config/yolov3.cfg @@ -0,0 +1,788 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=16 +subdivisions=1 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 
3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/src/detector/detector_yolov3.py b/src/detector/detector_yolov3.py new file mode 100644 index 0000000..f1af80b --- /dev/null +++ b/src/detector/detector_yolov3.py @@ -0,0 +1,98 @@ +import argparse +from PIL import Image + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.ticker import NullLocator + +from detector.models import * +from utils import * + + +Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor + + +class DetectorYOLOv3: + def __init__(self, + model_def='config/yolov3.cfg', + weights_path='../weights/yolov3.weights', + conf_thres=0.8, + nms_thres=0.4, + img_size=416): + self.model_def = model_def + self.weights_path = weights_path + self.img_size = img_size + self.conf_thres = conf_thres + self.nms_thres = nms_thres + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = Darknet(self.model_def, img_size=self.img_size).to(device) + + if self.weights_path.endswith(".weights"): + # Load darknet weights + self.model.load_darknet_weights(self.weights_path) + else: + # Load checkpoint weights + self.model.load_state_dict(torch.load(self.weights_path)) + + self.model.eval() # Set in evaluation mode + + def detect_from_image(self, img): + input_img = preprocess_image(img) + + # Configure input + input_img = Variable(input_img.type(Tensor)) + + # Get detections + with torch.no_grad(): + detections = self.model(input_img) + detections = non_max_suppression(detections, self.conf_thres, self.nms_thres)[0] + if detections is None: + return [] + else: + detections = detections.data.cpu().numpy() + + # Draw bounding boxes and labels of detections + human_candidates = [] + if detections is not None: + # Rescale boxes to original img + detections = rescale_boxes(detections, self.img_size, img.shape[:2]) + + for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections: + box_w = x2 - x1 + box_h = y2 - y1 + + if int(cls_pred) == 0: + human_candidate = [x1, y1, box_w, box_h] + human_candidates.append(human_candidate) + return human_candidates + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file") + parser.add_argument("--weights_path", type=str, default="../../models/yolov3.weights", + help="path to weights file") + 
parser.add_argument("--conf_thres", type=float, default=0.8, help="object confidence threshold") + parser.add_argument("--nms_thres", type=float, default=0.4, help="iou threshold for non-maximum suppression") + opt = parser.parse_args() + + detector = DetectorYOLOv3(**vars(opt)) + + img = np.array(Image.open('../data/samples/messi.jpg')) + human_candidates = detector.detect_from_image(img) + + # Create plot + plt.figure() + fig, ax = plt.subplots(1) + ax.imshow(img) + + for x1, y1, box_w, box_h in human_candidates: + bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=(1, 0, 0), facecolor="none") + # Add the bbox to the plot + ax.add_patch(bbox) + + plt.axis("off") + plt.gca().xaxis.set_major_locator(NullLocator()) + plt.gca().yaxis.set_major_locator(NullLocator()) + plt.show() diff --git a/src/detector/models.py b/src/detector/models.py new file mode 100644 index 0000000..5da98fc --- /dev/null +++ b/src/detector/models.py @@ -0,0 +1,340 @@ +from __future__ import division + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from utils import build_targets, to_cpu, parse_model_config + + +def create_modules(module_defs): + """ + Constructs module list of layer blocks from module configuration in module_defs + """ + hyperparams = module_defs.pop(0) + output_filters = [int(hyperparams["channels"])] + module_list = nn.ModuleList() + for module_i, module_def in enumerate(module_defs): + modules = nn.Sequential() + + if module_def["type"] == "convolutional": + bn = int(module_def["batch_normalize"]) + filters = int(module_def["filters"]) + kernel_size = int(module_def["size"]) + pad = (kernel_size - 1) // 2 + modules.add_module( + f"conv_{module_i}", + nn.Conv2d( + in_channels=output_filters[-1], + out_channels=filters, + kernel_size=kernel_size, + stride=int(module_def["stride"]), + padding=pad, + bias=not bn, + ), + ) + if bn: + modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5)) + if module_def["activation"] == "leaky": + modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1)) + + elif module_def["type"] == "maxpool": + kernel_size = int(module_def["size"]) + stride = int(module_def["stride"]) + if kernel_size == 2 and stride == 1: + modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1))) + maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) + modules.add_module(f"maxpool_{module_i}", maxpool) + + elif module_def["type"] == "upsample": + upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest") + modules.add_module(f"upsample_{module_i}", upsample) + + elif module_def["type"] == "route": + layers = [int(x) for x in module_def["layers"].split(",")] + filters = sum([output_filters[1:][i] for i in layers]) + modules.add_module(f"route_{module_i}", EmptyLayer()) + + elif module_def["type"] == "shortcut": + filters = output_filters[1:][int(module_def["from"])] + modules.add_module(f"shortcut_{module_i}", EmptyLayer()) + + elif module_def["type"] == "yolo": + anchor_idxs = [int(x) for x in module_def["mask"].split(",")] + # Extract anchors + anchors = [int(x) for x in module_def["anchors"].split(",")] + anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors = [anchors[i] for i in anchor_idxs] + num_classes = int(module_def["classes"]) + img_size = int(hyperparams["height"]) + # Define detection layer + yolo_layer = YOLOLayer(anchors, num_classes, img_size) + 
modules.add_module(f"yolo_{module_i}", yolo_layer) + # Register module list and number of output filters + module_list.append(modules) + output_filters.append(filters) + + return hyperparams, module_list + + +class Upsample(nn.Module): + """ nn.Upsample is deprecated """ + + def __init__(self, scale_factor, mode="nearest"): + super(Upsample, self).__init__() + self.scale_factor = scale_factor + self.mode = mode + + def forward(self, x): + x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) + return x + + +class EmptyLayer(nn.Module): + """Placeholder for 'route' and 'shortcut' layers""" + + def __init__(self): + super(EmptyLayer, self).__init__() + + +class YOLOLayer(nn.Module): + """Detection layer""" + + def __init__(self, anchors, num_classes, img_dim=416): + super(YOLOLayer, self).__init__() + self.anchors = anchors + self.num_anchors = len(anchors) + self.num_classes = num_classes + self.ignore_thres = 0.5 + self.mse_loss = nn.MSELoss() + self.bce_loss = nn.BCELoss() + self.obj_scale = 1 + self.noobj_scale = 100 + self.metrics = {} + self.img_dim = img_dim + self.grid_size = 0 # grid size + + def compute_grid_offsets(self, grid_size, cuda=True): + self.grid_size = grid_size + g = self.grid_size + FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor + self.stride = self.img_dim / self.grid_size + # Calculate offsets for each grid + self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor) + self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor) + self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors]) + self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1)) + self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1)) + + def forward(self, x, targets=None, img_dim=None): + + # Tensors for cuda support + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor + + self.img_dim = img_dim + num_samples = x.size(0) + grid_size = x.size(2) + + prediction = ( + x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) + + # Get outputs + x = torch.sigmoid(prediction[..., 0]) # Center x + y = torch.sigmoid(prediction[..., 1]) # Center y + w = prediction[..., 2] # Width + h = prediction[..., 3] # Height + pred_conf = torch.sigmoid(prediction[..., 4]) # Conf + pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. 
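+        # Per anchor and grid cell, the prediction is laid out as
+        # (tx, ty, tw, th, objectness, class scores).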
+ + # If grid size does not match current we compute new offsets + if grid_size != self.grid_size: + self.compute_grid_offsets(grid_size, cuda=x.is_cuda) + + # Add offset and scale with anchors + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x.data + self.grid_x + pred_boxes[..., 1] = y.data + self.grid_y + pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w + pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h + + output = torch.cat( + ( + pred_boxes.view(num_samples, -1, 4) * self.stride, + pred_conf.view(num_samples, -1, 1), + pred_cls.view(num_samples, -1, self.num_classes), + ), + -1, + ) + + if targets is None: + return output, 0 + else: + iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets( + pred_boxes=pred_boxes, + pred_cls=pred_cls, + target=targets, + anchors=self.scaled_anchors, + ignore_thres=self.ignore_thres, + ) + + # Loss : Mask outputs to ignore non-existing objects (except with conf. loss) + loss_x = self.mse_loss(x[obj_mask], tx[obj_mask]) + loss_y = self.mse_loss(y[obj_mask], ty[obj_mask]) + loss_w = self.mse_loss(w[obj_mask], tw[obj_mask]) + loss_h = self.mse_loss(h[obj_mask], th[obj_mask]) + loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask]) + loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask]) + loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj + loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask]) + total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + + # Metrics + cls_acc = 100 * class_mask[obj_mask].mean() + conf_obj = pred_conf[obj_mask].mean() + conf_noobj = pred_conf[noobj_mask].mean() + conf50 = (pred_conf > 0.5).float() + iou50 = (iou_scores > 0.5).float() + iou75 = (iou_scores > 0.75).float() + detected_mask = conf50 * class_mask * tconf + precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16) + recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16) + recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16) + + self.metrics = { + "loss": to_cpu(total_loss).item(), + "x": to_cpu(loss_x).item(), + "y": to_cpu(loss_y).item(), + "w": to_cpu(loss_w).item(), + "h": to_cpu(loss_h).item(), + "conf": to_cpu(loss_conf).item(), + "cls": to_cpu(loss_cls).item(), + "cls_acc": to_cpu(cls_acc).item(), + "recall50": to_cpu(recall50).item(), + "recall75": to_cpu(recall75).item(), + "precision": to_cpu(precision).item(), + "conf_obj": to_cpu(conf_obj).item(), + "conf_noobj": to_cpu(conf_noobj).item(), + "grid_size": grid_size, + } + + return output, total_loss + + +class Darknet(nn.Module): + """YOLOv3 object detection model""" + + def __init__(self, config_path, img_size=416): + super(Darknet, self).__init__() + self.module_defs = parse_model_config(config_path) + self.hyperparams, self.module_list = create_modules(self.module_defs) + self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")] + self.img_size = img_size + self.seen = 0 + self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32) + + def forward(self, x, targets=None): + img_dim = x.shape[2] + loss = 0 + layer_outputs, yolo_outputs = [], [] + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if module_def["type"] in ["convolutional", "upsample", "maxpool"]: + x = module(x) + elif module_def["type"] == "route": + x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1) + elif 
module_def["type"] == "shortcut": + layer_i = int(module_def["from"]) + x = layer_outputs[-1] + layer_outputs[layer_i] + elif module_def["type"] == "yolo": + x, layer_loss = module[0](x, targets, img_dim) + loss += layer_loss + yolo_outputs.append(x) + layer_outputs.append(x) + yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1)) + return yolo_outputs if targets is None else (loss, yolo_outputs) + + def load_darknet_weights(self, weights_path): + """Parses and loads the weights stored in 'weights_path'""" + + # Open the weights file + with open(weights_path, "rb") as f: + header = np.fromfile(f, dtype=np.int32, count=5) # First five are header values + self.header_info = header # Needed to write header when saving weights + self.seen = header[3] # number of images seen during training + weights = np.fromfile(f, dtype=np.float32) # The rest are weights + + # Establish cutoff for loading backbone weights + cutoff = None + if "darknet53.conv.74" in weights_path: + cutoff = 75 + + ptr = 0 + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if i == cutoff: + break + if module_def["type"] == "convolutional": + conv_layer = module[0] + if module_def["batch_normalize"]: + # Load BN bias, weights, running mean and running variance + bn_layer = module[1] + num_b = bn_layer.bias.numel() # Number of biases + # Bias + bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) + bn_layer.bias.data.copy_(bn_b) + ptr += num_b + # Weight + bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) + bn_layer.weight.data.copy_(bn_w) + ptr += num_b + # Running Mean + bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) + bn_layer.running_mean.data.copy_(bn_rm) + ptr += num_b + # Running Var + bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) + bn_layer.running_var.data.copy_(bn_rv) + ptr += num_b + else: + # Load conv. bias + num_b = conv_layer.bias.numel() + conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) + conv_layer.bias.data.copy_(conv_b) + ptr += num_b + # Load conv. 
weights + num_w = conv_layer.weight.numel() + conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) + conv_layer.weight.data.copy_(conv_w) + ptr += num_w + + def save_darknet_weights(self, path, cutoff=-1): + """ + @:param path - path of the new weights file + @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) + """ + fp = open(path, "wb") + self.header_info[3] = self.seen + self.header_info.tofile(fp) + + # Iterate through layers + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def["type"] == "convolutional": + conv_layer = module[0] + # If batch norm, load bn first + if module_def["batch_normalize"]: + bn_layer = module[1] + bn_layer.bias.data.cpu().numpy().tofile(fp) + bn_layer.weight.data.cpu().numpy().tofile(fp) + bn_layer.running_mean.data.cpu().numpy().tofile(fp) + bn_layer.running_var.data.cpu().numpy().tofile(fp) + # Load conv bias + else: + conv_layer.bias.data.cpu().numpy().tofile(fp) + # Load conv weights + conv_layer.weight.data.cpu().numpy().tofile(fp) + + fp.close() diff --git a/src/detector/utils.py b/src/detector/utils.py new file mode 100644 index 0000000..7a01c4d --- /dev/null +++ b/src/detector/utils.py @@ -0,0 +1,239 @@ +import torch +import numpy as np +import cv2 + + +def pad_to_square(img, pad_value): + h, w, _ = img.shape + dim_diff = np.abs(h - w) + # Upper (left) and lower (right) padding + pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2 + # Determine padding + pad = ((pad1, pad2), (0, 0), (0, 0)) if h <= w else ((0, 0), (pad1, pad2), (0, 0)) + # Add padding + img = np.pad(img, pad, "constant", constant_values=pad_value) + return img, pad + + +def preprocess_image(img, img_size=416): + input_img, _ = pad_to_square(np.array(img), 127.5) + # Resize + input_img = cv2.resize( + input_img, (img_size, img_size), interpolation=cv2.INTER_AREA + ) + # Channels-first + input_img = np.transpose(input_img, (2, 0, 1)) + + # extend one dimension + input_img = np.expand_dims(input_img, axis=0) + + # As pytorch tensor + input_img = torch.from_numpy(input_img).float() / 255.0 + return input_img + + +def parse_model_config(path): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + file = open(path, 'r') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + module_defs = [] + for line in lines: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + module_defs[-1][key.rstrip()] = value.strip() + + return module_defs + + +def parse_data_config(path): + """Parses the data configuration file""" + options = dict() + options['gpus'] = '0,1,2,3' + options['num_workers'] = '10' + with open(path, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + key, value = line.split('=') + options[key.strip()] = value.strip() + return options + + +def xywh2xyxy(x): + y = x.new(x.shape) + y[..., 0] = x[..., 0] - x[..., 2] / 2 + y[..., 1] = x[..., 1] - x[..., 3] / 2 + y[..., 2] = x[..., 0] + x[..., 2] / 2 + y[..., 3] = x[..., 1] + x[..., 3] / 2 + return y + + +def bbox_wh_iou(wh1, wh2): + wh2 = wh2.t() + w1, h1 = 
wh1[0], wh1[1] + w2, h2 = wh2[0], wh2[1] + inter_area = torch.min(w1, w2) * torch.min(h1, h2) + union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area + return inter_area / union_area + + +def bbox_iou(box1, box2, x1y1x2y2=True): + """ + Returns the IoU of two bounding boxes + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # get the corrdinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1, b2_x1) + inter_rect_y1 = torch.max(b1_y1, b2_y1) + inter_rect_x2 = torch.min(b1_x2, b2_x2) + inter_rect_y2 = torch.min(b1_y2, b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp( + inter_rect_y2 - inter_rect_y1 + 1, min=0 + ) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + +def to_cpu(tensor): + return tensor.detach().cpu() + + +def rescale_boxes(boxes, current_dim, original_shape): + """ Rescales bounding boxes to the original shape """ + orig_h, orig_w = original_shape + # The amount of padding that was added + pad_x = max(orig_h - orig_w, 0) * (current_dim / max(original_shape)) + pad_y = max(orig_w - orig_h, 0) * (current_dim / max(original_shape)) + # Image height and width after padding is removed + unpad_h = current_dim - pad_y + unpad_w = current_dim - pad_x + # Rescale bounding boxes to dimension of original img + boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h + boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h + return boxes + + +def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4): + """ + Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
+ Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + """ + + # From (center x, center y, width, height) to (x1, y1, x2, y2) + prediction[..., :4] = xywh2xyxy(prediction[..., :4]) + output = [None for _ in range(len(prediction))] + for image_i, image_pred in enumerate(prediction): + # Filter out confidence scores below threshold + image_pred = image_pred[image_pred[:, 4] >= conf_thres] + # If none are remaining => process next img + if not image_pred.size(0): + continue + # Object confidence times class confidence + score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0] + # Sort by it + image_pred = image_pred[(-score).argsort()] + class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True) + detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1) + # Perform non-maximum suppression + keep_boxes = [] + while detections.size(0): + large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres + label_match = detections[0, -1] == detections[:, -1] + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + weights = detections[invalid, 4:5] + # Merge overlapping bboxes by order of confidence + detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum() + keep_boxes += [detections[0]] + detections = detections[~invalid] + if keep_boxes: + output[image_i] = torch.stack(keep_boxes) + + return output + + +def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres): + + ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor + FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor + + nB = pred_boxes.size(0) + nA = pred_boxes.size(1) + nC = pred_cls.size(-1) + nG = pred_boxes.size(2) + + # Output tensors + obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0) + noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1) + class_mask = FloatTensor(nB, nA, nG, nG).fill_(0) + iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0) + tx = FloatTensor(nB, nA, nG, nG).fill_(0) + ty = FloatTensor(nB, nA, nG, nG).fill_(0) + tw = FloatTensor(nB, nA, nG, nG).fill_(0) + th = FloatTensor(nB, nA, nG, nG).fill_(0) + tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0) + + # Convert to position relative to box + target_boxes = target[:, 2:6] * nG + gxy = target_boxes[:, :2] + gwh = target_boxes[:, 2:] + # Get anchors with best iou + ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors]) + best_ious, best_n = ious.max(0) + # Separate target values + b, target_labels = target[:, :2].long().t() + gx, gy = gxy.t() + gw, gh = gwh.t() + gi, gj = gxy.long().t() + # Set masks + obj_mask[b, best_n, gj, gi] = 1 + noobj_mask[b, best_n, gj, gi] = 0 + + # Set noobj mask to zero where iou exceeds ignore threshold + for i, anchor_ious in enumerate(ious.t()): + noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0 + + # Coordinates + tx[b, best_n, gj, gi] = gx - gx.floor() + ty[b, best_n, gj, gi] = gy - gy.floor() + # Width and height + tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16) + th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16) + # One-hot encoding of label + tcls[b, best_n, gj, gi, target_labels] = 1 + # Compute label correctness and iou at best anchor + class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float() + iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, 
x1y1x2y2=False) + + tconf = obj_mask.float() + return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf diff --git a/src/evaluate.py b/src/evaluate.py new file mode 100644 index 0000000..5890cd3 --- /dev/null +++ b/src/evaluate.py @@ -0,0 +1,192 @@ +import sys +import time + +import numpy as np +import pandas +import torch +from torchvision import transforms +from torch.utils.data import DataLoader + +from common import get_model_resgcn +from utils import AverageMeter +from datasets import dataset_factory +from datasets.augmentation import ShuffleSequence, SelectSequenceCenter, ToTensor, MultiInput +from datasets.graph import Graph + + +def _evaluate_casia_b(embeddings): + """ + Test dataset consists of sequences of last 50 ids from CASIA B Dataset. + Data is divided in the following way: + Gallery Set: + NM 1, NM 2, NM 3, NM 4 + Probe Set: + Subset 1: + NM 5, NM 6 + Subset 2: + BG 1, BG 2 + Subset 3: + CL 1, CL 2 + """ + + gallery = {k: v for (k, v) in embeddings.items() if k[1] == 0 and k[2] <= 4} + gallery_per_angle = {} + for angle in range(0, 181, 18): + gallery_per_angle[angle] = {k: v for (k, v) in gallery.items() if k[3] == angle} + + probe_nm = {k: v for (k, v) in embeddings.items() if k[1] == 0 and k[2] >= 5} + probe_bg = {k: v for (k, v) in embeddings.items() if k[1] == 1} + probe_cl = {k: v for (k, v) in embeddings.items() if k[1] == 2} + + correct = np.zeros((3, 11, 11)) + total = np.zeros((3, 11, 11)) + for gallery_angle in range(0, 181, 18): + gallery_embeddings = np.array(list(gallery_per_angle[gallery_angle].values())) + gallery_targets = list(gallery_per_angle[gallery_angle].keys()) + gallery_pos = int(gallery_angle / 18) + + probe_num = 0 + for probe in [probe_nm, probe_bg, probe_cl]: + for (target, embedding) in probe.items(): + subject_id, _, _, probe_angle = target + probe_pos = int(probe_angle / 18) + + distance = np.linalg.norm(gallery_embeddings - embedding, ord=2, axis=1) + min_pos = np.argmin(distance) + min_target = gallery_targets[int(min_pos)] + + if min_target[0] == subject_id: + correct[probe_num, gallery_pos, probe_pos] += 1 + total[probe_num, gallery_pos, probe_pos] += 1 + + probe_num += 1 + + accuracy = correct / total + + # Exclude same view + for i in range(3): + accuracy[i] -= np.diag(np.diag(accuracy[i])) + + accuracy_flat = np.sum(accuracy, 1) / 10 + + header = ["NM#5-6", "BG#1-2", "CL#1-2"] + + accuracy_avg = np.mean(accuracy) + sub_accuracies_avg = np.mean(accuracy_flat, 1) + sub_accuracies = dict(zip(header, list(sub_accuracies_avg))) + + dataframe = pandas.DataFrame( + np.concatenate((accuracy_flat, sub_accuracies_avg[..., np.newaxis]), 1), + header, + list(range(0, 181, 18)) + ["mean"], + ) + + return correct, accuracy_avg, sub_accuracies, dataframe + + +def evaluate(data_loader, model, evaluation_fn, log_interval=10, use_flip=False): + model.eval() + batch_time = AverageMeter() + + # Calculate embeddings + with torch.no_grad(): + end = time.time() + embeddings = dict() + for idx, (points, target) in enumerate(data_loader): + if use_flip: + bsz = points.shape[0] + data_flipped = torch.flip(points, dims=[1]) + points = torch.cat([points, data_flipped], dim=0) + + if torch.cuda.is_available(): + points = points.cuda(non_blocking=True) + + output = model(points) + + if use_flip: + f1, f2 = torch.split(output, [bsz, bsz], dim=0) + output = torch.mean(torch.stack([f1, f2]), dim=0) + + for i in range(output.shape[0]): + sequence = tuple( + int(t[i]) if type(t[i]) is torch.Tensor else t[i] for t in target + ) + 
embeddings[sequence] = output[i].cpu().numpy() + + batch_time.update(time.time() - end) + end = time.time() + + if idx % log_interval == 0: + print( + f"Test: [{idx}/{len(data_loader)}]\t" + f"Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + ) + sys.stdout.flush() + + return evaluation_fn(embeddings) + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Evaluate model on dataset") + parser.add_argument("dataset", choices=["casia-b"]) + parser.add_argument("weights_path") + parser.add_argument("data_path") + parser.add_argument("--network_name", default="resgcn-n39-r8") + parser.add_argument("--sequence_length", type=int, default=60) + parser.add_argument("--batch_size", type=int, default=256) + parser.add_argument("--embedding_layer_size", type=int, default=128) + parser.add_argument("--use_multi_branch", action="store_true") + parser.add_argument("--shuffle", action="store_true") + + opt = parser.parse_args() + + # Config for dataset + graph = Graph("coco") + dataset_class = dataset_factory(opt.dataset) + evaluation_fn = None + if opt.dataset == "casia-b": + evaluation_fn = _evaluate_casia_b + + # Load data + dataset = dataset_class( + opt.data_path, + train=False, + sequence_length=opt.sequence_length, + transform=transforms.Compose( + [ + SelectSequenceCenter(opt.sequence_length), + ShuffleSequence(opt.shuffle), + MultiInput(graph.connect_joint, opt.use_multi_branch), + ToTensor() + ] + ), + ) + data_loader = DataLoader(dataset, batch_size=opt.batch_size) + print(f"Data loaded: {len(data_loader)} batches") + + # Init model + model = get_model_resgcn(graph, opt) + + if torch.cuda.is_available(): + model.cuda() + + # Load weights + checkpoint = torch.load(opt.weights_path) + model.load_state_dict(checkpoint["model"]) + + result, accuracy_avg, sub_accuracies, dataframe = evaluate( + data_loader, model, evaluation_fn, use_flip=True + ) + + print("\n") + print((dataframe * 100).round(2)) + print(f"AVG: {accuracy_avg*100} %") + print("=================================") + print((dataframe * 100).round(1).to_latex()) + print((dataframe * 100).round(1).to_markdown()) + + +if __name__ == "__main__": + main() diff --git a/src/losses.py b/src/losses.py new file mode 100644 index 0000000..e643b8d --- /dev/null +++ b/src/losses.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn + +""" +Author: Yonglong Tian (yonglong@mit.edu) +Date: May 07, 2020 +""" + + +class SupConLoss(nn.Module): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + It also supports the unsupervised contrastive loss in SimCLR""" + def __init__(self, temperature=0.07, contrast_mode='all', + base_temperature=0.07): + super(SupConLoss, self).__init__() + self.temperature = temperature + self.contrast_mode = contrast_mode + self.base_temperature = base_temperature + + def forward(self, features, labels=None, mask=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. 
+ """ + device = (torch.device('cuda') + if features.is_cuda + else torch.device('cpu')) + + if len(features.shape) < 3: + raise ValueError('`features` needs to be [bsz, n_views, ...],' + 'at least 3 dimensions are required') + if len(features.shape) > 3: + features = features.view(features.shape[0], features.shape[1], -1) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError('Cannot define both `labels` and `mask`') + elif labels is None and mask is None: + mask = torch.eye(batch_size, dtype=torch.float32).to(device) + elif labels is not None: + labels = labels.contiguous().view(-1, 1) + if labels.shape[0] != batch_size: + raise ValueError('Num of labels does not match num of features') + mask = torch.eq(labels, labels.T).float().to(device) + else: + mask = mask.float().to(device) + + contrast_count = features.shape[1] + contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) + if self.contrast_mode == 'one': + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == 'all': + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) + + # compute logits + anchor_dot_contrast = torch.div( + torch.matmul(anchor_feature, contrast_feature.T), + self.temperature) + # for numerical stability + logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # tile mask + mask = mask.repeat(anchor_count, contrast_count) + # mask-out self-contrast cases + logits_mask = torch.scatter( + torch.ones_like(mask), + 1, + torch.arange(batch_size * anchor_count).view(-1, 1).to(device), + 0 + ) + mask = mask * logits_mask + + # compute log_prob + exp_logits = torch.exp(logits) * logits_mask + log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)) + + # compute mean of log-likelihood over positive + mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) + + # loss + loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.view(anchor_count, batch_size).mean() + + return loss diff --git a/src/models/README.md b/src/models/README.md new file mode 100644 index 0000000..5d6fe0e --- /dev/null +++ b/src/models/README.md @@ -0,0 +1,4 @@ + + +__Spatial Temporal Graph Convolutional Networks (ST-GCN) for Skeleton-Based Action Recognition__ +ST-GCN network is borrowed from [yysijie/st-gcn](https://github.com/yysijie/st-gcn) diff --git a/src/models/ResGCNv1/__init__.py b/src/models/ResGCNv1/__init__.py new file mode 100644 index 0000000..a8411b6 --- /dev/null +++ b/src/models/ResGCNv1/__init__.py @@ -0,0 +1,56 @@ +import logging + +from . 
import blocks +from .nets import ResGCN +from .modules import ResGCN_Module, AttGCN_Module +from .attentions import * + + +__model = { + 'resgcn': ResGCN, +} + +__attention = { + 'pa': Part_Att, + 'ca': Channel_Att, + 'fa': Frame_Att, + 'ja': Joint_Att, + 'pca': Part_Conv_Att, + 'psa': Part_Share_Att, +} + +__structure = { + 'b15': {'structure': [1,2,2,2], 'block': 'Basic'}, + 'b19': {'structure': [1,2,3,3], 'block': 'Basic'}, + 'b23': {'structure': [1,3,4,3], 'block': 'Basic'}, + 'b29': {'structure': [1,3,6,4], 'block': 'Basic'}, + 'n39': {'structure': [1,2,2,2], 'block': 'Bottleneck'}, + 'n51': {'structure': [1,2,3,3], 'block': 'Bottleneck'}, + 'n57': {'structure': [1,3,4,3], 'block': 'Bottleneck'}, + 'n75': {'structure': [1,3,6,4], 'block': 'Bottleneck'}, +} + +__reduction = { + 'r1': {'reduction': 1}, + 'r2': {'reduction': 2}, + 'r4': {'reduction': 4}, + 'r8': {'reduction': 8}, +} + + +def create(model_type, **kwargs): + model_split = model_type.split('-') + if model_split[0] in __attention.keys(): + kwargs.update({'module': AttGCN_Module, 'attention': __attention[model_split[0]]}) + del(model_split[0]) + else: + kwargs.update({'module': ResGCN_Module, 'attention': None}) + try: + [model, structure, reduction] = model_split + except: + [model, structure], reduction = model_split, 'r1' + if not (model in __model.keys() and structure in __structure.keys() and reduction in __reduction.keys()): + logging.info('') + logging.error('Error: Do NOT exist this model_type: {}!'.format(model_type)) + raise ValueError() + return __model[model](**(__structure[structure]), **(__reduction[reduction]), **kwargs) diff --git a/src/models/ResGCNv1/attentions.py b/src/models/ResGCNv1/attentions.py new file mode 100644 index 0000000..4dc903b --- /dev/null +++ b/src/models/ResGCNv1/attentions.py @@ -0,0 +1,187 @@ +import torch +from torch import nn + + +class Part_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Part_Att, self).__init__() + + self.parts = parts + self.joints = get_corr_joints(parts) + + inter_channel = channel // 4 + + self.fcn = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channel, channel*len(self.parts), kernel_size=1), + ) + + self.softmax = nn.Softmax(dim=-1) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + N, C, T, V = x.size() + res = x + + x_att = self.softmax(self.fcn(x).view(N, C, len(self.parts))) + x_att = torch.split(x_att, 1, dim=-1) + x_att = [x_att[self.joints[i]].expand_as(x[:,:,:,i]) for i in range(V)] + x_att = torch.stack(x_att, dim=-1) + return self.relu(self.bn(x * x_att) + res) + + +class Part_Share_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Part_Share_Att, self).__init__() + + self.parts = parts + self.joints = get_corr_joints(parts) + + inter_channel = channel // 4 + + self.part_pool = nn.Sequential( + nn.Conv2d(channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.AdaptiveAvgPool2d(1), + ) + + self.fcn = nn.Sequential( + nn.Conv2d(inter_channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channel, channel*len(self.parts), kernel_size=1), + ) + + self.softmax = nn.Softmax(dim=-1) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + N, C, T, V = x.size() + res = x + + x_split = 
[self.part_pool(x[:,:,:,part]) for part in self.parts] + x_att = self.softmax(self.fcn(sum(x_split)).view(N, C, len(self.parts))) + x_att = torch.split(x_att, 1, dim=-1) + x_att = [x_att[self.joints[i]].expand_as(x[:,:,:,i]) for i in range(V)] + x_att = torch.stack(x_att, dim=-1) + return self.relu(self.bn(x * x_att) + res) + + +class Part_Conv_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Part_Conv_Att, self).__init__() + + self.parts = parts + self.joints = get_corr_joints(parts) + + inter_channel = channel // 4 + + self.part_pool = nn.ModuleList([nn.Sequential( + nn.Conv2d(channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.AdaptiveAvgPool2d(1), + ) for _ in range(len(self.parts))]) + + self.fcn = nn.Sequential( + nn.Conv2d(inter_channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channel, channel*len(self.parts), kernel_size=1), + ) + + self.softmax = nn.Softmax(dim=-1) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + N, C, T, V = x.size() + res = x + + x_split = [pool(x[:,:,:,part]) for part, pool in zip(self.parts, self.part_pool)] + x_att = self.softmax(self.fcn(sum(x_split)).view(N, C, len(self.parts))) + x_att = torch.split(x_att, 1, dim=-1) + x_att = [x_att[self.joints[i]].expand_as(x[:,:,:,i]) for i in range(V)] + x_att = torch.stack(x_att, dim=-1) + return self.relu(self.bn(x * x_att) + res) + + +class Channel_Att(nn.Module): + def __init__(self, channel, **kwargs): + super(Channel_Att, self).__init__() + + self.fcn = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(channel, channel//4, kernel_size=1), + nn.BatchNorm2d(channel//4), + nn.ReLU(inplace=True), + nn.Conv2d(channel//4, channel, kernel_size=1), + nn.Softmax(dim=1) + ) + + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + res = x + x_att = self.fcn(x).squeeze() + return self.relu(self.bn(x * x_att[:, :, None, None]) + res) + + +class Frame_Att(nn.Module): + def __init__(self, channel, **kwargs): + super(Frame_Att, self).__init__() + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + self.conv = nn.Conv2d(2, 1, kernel_size=(9,1), padding=(4,0)) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + res = x + x_avg = torch.transpose(self.avg_pool(torch.transpose(x, 1, 2)), 1, 2) + x_max = torch.transpose(self.max_pool(torch.transpose(x, 1, 2)), 1, 2) + x_att = self.conv(torch.cat([x_avg, x_max], dim=1)).squeeze() + return self.relu(self.bn(x * x_att[:, None, :, None]) + res) + + +class Joint_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Joint_Att, self).__init__() + + num_joint = sum([len(part) for part in parts]) + + self.fcn = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(num_joint, num_joint//2, kernel_size=1), + nn.BatchNorm2d(num_joint//2), + nn.ReLU(inplace=True), + nn.Conv2d(num_joint//2, num_joint, kernel_size=1), + nn.Softmax(dim=1) + ) + + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + res = x + x_att = self.fcn(torch.transpose(x, 1, 3)).squeeze() + return self.relu(self.bn(x * x_att[:, None, None, :]) + res) + + +def get_corr_joints(parts): + num_joints = max([max(part) for part in parts]) + 1 + res = [] + for i in range(num_joints): + for j in range(len(parts)): + if i in parts[j]: + res.append(j) + break + 
return torch.Tensor(res).long() diff --git a/src/models/ResGCNv1/blocks.py b/src/models/ResGCNv1/blocks.py new file mode 100644 index 0000000..14bb411 --- /dev/null +++ b/src/models/ResGCNv1/blocks.py @@ -0,0 +1,175 @@ +import torch +from torch import nn + + +class Spatial_Bottleneck_Block(nn.Module): + def __init__(self, in_channels, out_channels, max_graph_distance, residual=False, reduction=4, **kwargs): + super(Spatial_Bottleneck_Block, self).__init__() + + inter_channels = out_channels // reduction + + if not residual: + self.residual = lambda x: 0 + elif in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels), + ) + + self.conv_down = nn.Conv2d(in_channels, inter_channels, 1) + self.bn_down = nn.BatchNorm2d(inter_channels) + self.conv = SpatialGraphConv(inter_channels, inter_channels, max_graph_distance) + self.bn = nn.BatchNorm2d(inter_channels) + self.conv_up = nn.Conv2d(inter_channels, out_channels, 1) + self.bn_up = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, A): + + res_block = self.residual(x) + + x = self.conv_down(x) + x = self.bn_down(x) + x = self.relu(x) + + x = self.conv(x, A) + x = self.bn(x) + x = self.relu(x) + + x = self.conv_up(x) + x = self.bn_up(x) + x = self.relu(x + res_block) + + return x + + +class Temporal_Bottleneck_Block(nn.Module): + def __init__(self, channels, temporal_window_size, stride=1, residual=False, reduction=4, **kwargs): + super(Temporal_Bottleneck_Block, self).__init__() + + padding = ((temporal_window_size - 1) // 2, 0) + inter_channels = channels // reduction + + if not residual: + self.residual = lambda x: 0 + elif stride == 1: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(channels, channels, 1, (stride,1)), + nn.BatchNorm2d(channels), + ) + + self.conv_down = nn.Conv2d(channels, inter_channels, 1) + self.bn_down = nn.BatchNorm2d(inter_channels) + self.conv = nn.Conv2d(inter_channels, inter_channels, (temporal_window_size,1), (stride,1), padding) + self.bn = nn.BatchNorm2d(inter_channels) + self.conv_up = nn.Conv2d(inter_channels, channels, 1) + self.bn_up = nn.BatchNorm2d(channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, res_module): + + res_block = self.residual(x) + + x = self.conv_down(x) + x = self.bn_down(x) + x = self.relu(x) + + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + + x = self.conv_up(x) + x = self.bn_up(x) + x = self.relu(x + res_block + res_module) + + return x + + +class Spatial_Basic_Block(nn.Module): + def __init__(self, in_channels, out_channels, max_graph_distance, residual=False, **kwargs): + super(Spatial_Basic_Block, self).__init__() + + if not residual: + self.residual = lambda x: 0 + elif in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels), + ) + + self.conv = SpatialGraphConv(in_channels, out_channels, max_graph_distance) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, A): + + res_block = self.residual(x) + + x = self.conv(x, A) + x = self.bn(x) + x = self.relu(x + res_block) + + return x + + +class Temporal_Basic_Block(nn.Module): + def __init__(self, channels, temporal_window_size, stride=1, residual=False, **kwargs): + super(Temporal_Basic_Block, self).__init__() + + padding = ((temporal_window_size - 1) // 2, 
0) + + if not residual: + self.residual = lambda x: 0 + elif stride == 1: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(channels, channels, 1, (stride,1)), + nn.BatchNorm2d(channels), + ) + + self.conv = nn.Conv2d(channels, channels, (temporal_window_size,1), (stride,1), padding) + self.bn = nn.BatchNorm2d(channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, res_module): + + res_block = self.residual(x) + + x = self.conv(x) + x = self.bn(x) + x = self.relu(x + res_block + res_module) + + return x + + +# Thanks to YAN Sijie for the released code on Github (https://github.com/yysijie/st-gcn) +class SpatialGraphConv(nn.Module): + def __init__(self, in_channels, out_channels, max_graph_distance): + super(SpatialGraphConv, self).__init__() + + # spatial class number (distance = 0 for class 0, distance = 1 for class 1, ...) + self.s_kernel_size = max_graph_distance + 1 + + # weights of different spatial classes + self.gcn = nn.Conv2d(in_channels, out_channels*self.s_kernel_size, 1) + + def forward(self, x, A): + + # numbers in same class have same weight + x = self.gcn(x) + + # divide nodes into different classes + n, kc, t, v = x.size() + x = x.view(n, self.s_kernel_size, kc//self.s_kernel_size, t, v) + + # spatial graph convolution + x = torch.einsum('nkctv,kvw->nctw', (x, A[:self.s_kernel_size])).contiguous() + + return x diff --git a/src/models/ResGCNv1/modules.py b/src/models/ResGCNv1/modules.py new file mode 100644 index 0000000..6b0f996 --- /dev/null +++ b/src/models/ResGCNv1/modules.py @@ -0,0 +1,91 @@ +import logging, torch +from torch import nn + + +def import_class(name): + components = name.split('.') + mod = __import__(components[0]) + for comp in components[1:]: + mod = getattr(mod, comp) + return mod + + +class ResGCN_Module(nn.Module): + def __init__(self, in_channels, out_channels, block, A, initial=False, stride=1, kernel_size=[9,2], **kwargs): + super(ResGCN_Module, self).__init__() + + if not len(kernel_size) == 2: + logging.info('') + logging.error('Error: Please check whether len(kernel_size) == 2') + raise ValueError() + if not kernel_size[0] % 2 == 1: + logging.info('') + logging.error('Error: Please check whether kernel_size[0] % 2 == 1') + raise ValueError() + temporal_window_size, max_graph_distance = kernel_size + + if initial: + module_res, block_res = False, False + elif block == 'Basic': + module_res, block_res = True, False + else: + module_res, block_res = False, True + + if not module_res: + self.residual = lambda x: 0 + elif stride == 1 and in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, (stride,1)), + nn.BatchNorm2d(out_channels), + ) + + spatial_block = import_class('models.ResGCNv1.blocks.Spatial_{}_Block'.format(block)) + temporal_block = import_class('models.ResGCNv1.blocks.Temporal_{}_Block'.format(block)) + self.scn = spatial_block(in_channels, out_channels, max_graph_distance, block_res, **kwargs) + self.tcn = temporal_block(out_channels, temporal_window_size, stride, block_res, **kwargs) + self.edge = nn.Parameter(torch.ones_like(A)) + + def forward(self, x, A): + return self.tcn(self.scn(x, A*self.edge), self.residual(x)) + + +class AttGCN_Module(nn.Module): + def __init__(self, in_channels, out_channels, block, A, attention, stride=1, kernel_size=[9,2], **kwargs): + super(AttGCN_Module, self).__init__() + + if not len(kernel_size) == 2: + logging.info('') + logging.error('Error: Please check whether 
len(kernel_size) == 2') + raise ValueError() + if not kernel_size[0] % 2 == 1: + logging.info('') + logging.error('Error: Please check whether kernel_size[0] % 2 == 1') + raise ValueError() + temporal_window_size, max_graph_distance = kernel_size + + if block == 'Basic': + module_res, block_res = True, False + else: + module_res, block_res = False, True + + if not module_res: + self.residual = lambda x: 0 + elif stride == 1 and in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, (stride,1)), + nn.BatchNorm2d(out_channels), + ) + + spatial_block = import_class('models.ResGCNv1.blocks.Spatial_{}_Block'.format(block)) + temporal_block = import_class('models.ResGCNv1.blocks.Temporal_{}_Block'.format(block)) + self.scn = spatial_block(in_channels, out_channels, max_graph_distance, block_res, **kwargs) + self.tcn = temporal_block(out_channels, temporal_window_size, stride, block_res, **kwargs) + self.att = attention(out_channels, **kwargs) + self.edge = nn.Parameter(torch.ones_like(A)) + + def forward(self, x, A): + return self.att(self.tcn(self.scn(x, A*self.edge), self.residual(x))) diff --git a/src/models/ResGCNv1/nets.py b/src/models/ResGCNv1/nets.py new file mode 100644 index 0000000..a83fd30 --- /dev/null +++ b/src/models/ResGCNv1/nets.py @@ -0,0 +1,104 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from .modules import ResGCN_Module + + +class ResGCN_Input_Branch(nn.Module): + def __init__(self, structure, block, num_channel, A, **kwargs): + super(ResGCN_Input_Branch, self).__init__() + + self.register_buffer('A', A) + + module_list = [ResGCN_Module(num_channel, 64, 'Basic', A, initial=True, **kwargs)] + module_list += [ResGCN_Module(64, 64, 'Basic', A, initial=True, **kwargs) for _ in range(structure[0] - 1)] + module_list += [ResGCN_Module(64, 64, block, A, **kwargs) for _ in range(structure[1] - 1)] + module_list += [ResGCN_Module(64, 32, block, A, **kwargs)] + + self.bn = nn.BatchNorm2d(num_channel) + self.layers = nn.ModuleList(module_list) + + def forward(self, x): + + x = self.bn(x) + for layer in self.layers: + x = layer(x, self.A) + + return x + + +class ResGCN(nn.Module): + def __init__(self, module, structure, block, num_input, num_channel, num_class, A, **kwargs): + super(ResGCN, self).__init__() + + self.register_buffer('A', A) + + # input branches + self.input_branches = nn.ModuleList([ + ResGCN_Input_Branch(structure, block, num_channel, A, **kwargs) + for _ in range(num_input) + ]) + + # main stream + module_list = [module(32*num_input, 128, block, A, stride=2, **kwargs)] + module_list += [module(128, 128, block, A, **kwargs) for _ in range(structure[2] - 1)] + module_list += [module(128, 256, block, A, stride=2, **kwargs)] + module_list += [module(256, 256, block, A, **kwargs) for _ in range(structure[3] - 1)] + self.main_stream = nn.ModuleList(module_list) + + # output + self.global_pooling = nn.AdaptiveAvgPool2d(1) + self.fcn = nn.Linear(256, num_class) + + # init parameters + init_param(self.modules()) + zero_init_lastBN(self.modules()) + + def forward(self, x): + + # N, I, C, T, V = x.size() + + # input branches + x_cat = [] + for i, branch in enumerate(self.input_branches): + x_cat.append(branch(x[:,i,:,:,:])) + x = torch.cat(x_cat, dim=1) + + # main stream + for layer in self.main_stream: + x = layer(x, self.A) + + # output + x = self.global_pooling(x) + x = self.fcn(x.squeeze()) + + # L2 normalization + x = F.normalize(x, dim=1, p=2) + + return x + 
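
A minimal usage sketch of the pieces above, assuming `src/` is on `PYTHONPATH` (as the scripts in this diff expect). The sizes passed to the factory (`num_input=3`, `num_channel=3`, `num_class=128`) are illustrative stand-ins, not necessarily what `get_model_resgcn` in `src/common.py` passes, and the adjacency is built with the ST-GCN `Graph` included further down in this diff, since only its `(K, V, V)` shape matters here.

```python
# Minimal sketch, assuming src/ is on PYTHONPATH; sizes are illustrative,
# not the values used by get_model_resgcn in src/common.py.
import torch

from models.ResGCNv1 import create
from models.st_gcn.utils.graph import Graph

# Spatial partitioning yields A with K=3 slices, matching the default
# kernel_size=[9, 2] (max_graph_distance=2) of ResGCN_Module.
graph = Graph(layout="coco", strategy="spatial")
A = torch.tensor(graph.A, dtype=torch.float32)        # (3, 17, 17)

# 'resgcn-n39-r8': Bottleneck blocks, structure [1, 2, 2, 2], reduction 8.
model = create(
    "resgcn-n39-r8",
    num_input=3,      # one input branch per feature stream (assumed)
    num_channel=3,    # channels per branch, e.g. (x, y, confidence) (assumed)
    num_class=128,    # embedding size, cf. --embedding_layer_size
    A=A,
)

x = torch.randn(8, 3, 3, 60, 17)   # (N, I, C, T, V): batch, branches, channels, frames, joints
emb = model(x)                     # L2-normalised embeddings, shape (8, 128)
```
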
+ +def init_param(modules): + for m in modules: + if isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + #m.bias = None + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, std=0.001) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +def zero_init_lastBN(modules): + for m in modules: + if isinstance(m, ResGCN_Module): + if hasattr(m.scn, 'bn_up'): + nn.init.constant_(m.scn.bn_up.weight, 0) + if hasattr(m.tcn, 'bn_up'): + nn.init.constant_(m.tcn.bn_up.weight, 0) diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..da2c068 --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1 @@ +from .ResGCNv1 import create diff --git a/src/models/st_gcn/__init__.py b/src/models/st_gcn/__init__.py new file mode 100644 index 0000000..9f9161b --- /dev/null +++ b/src/models/st_gcn/__init__.py @@ -0,0 +1 @@ +from . import utils \ No newline at end of file diff --git a/src/models/st_gcn/st_gcn.py b/src/models/st_gcn/st_gcn.py new file mode 100644 index 0000000..4c2cd21 --- /dev/null +++ b/src/models/st_gcn/st_gcn.py @@ -0,0 +1,180 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from models.st_gcn.utils.tgcn import ConvTemporalGraphical +from models.st_gcn.utils.graph import Graph + + +class STGCNEmbedding(nn.Module): + r"""Spatial temporal graph convolutional networks. + + Args: + in_channels (int): Number of channels in the input data + graph_args (dict): The arguments for building the graph + edge_importance_weighting (bool): If ``True``, adds a learnable + importance weighting to the edges of the graph + **kwargs (optional): Other parameters for graph convolution units + + Shape: + - Input: :math:`(N, in_channels, T_{in}, V_{in})` + - Output: :math:`(N, num_class)` where + :math:`N` is a batch size, + :math:`T_{in}` is a length of input sequence, + :math:`V_{in}` is the number of graph nodes + """ + + def __init__(self, in_channels, graph_args, edge_importance_weighting=False, temporal_kernel_size=9, + embedding_layer_size=256, **kwargs): + super().__init__() + + # load graph + self.graph = Graph(**graph_args) + A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + + # build networks + spatial_kernel_size = A.size(0) + # temporal_kernel_size = 9 + + kernel_size = (temporal_kernel_size, spatial_kernel_size) + + self.data_bn = nn.BatchNorm1d(in_channels * A.size(1)) + kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'} + + self.st_gcn_networks = nn.ModuleList(( + st_gcn(in_channels, 64, kernel_size, 1, residual=False, **kwargs0), + st_gcn(64, 64, kernel_size, 1, **kwargs), + st_gcn(64, 64, kernel_size, 1, **kwargs), + st_gcn(64, 64, kernel_size, 1, **kwargs), + st_gcn(64, 128, kernel_size, 2, **kwargs), + st_gcn(128, 128, kernel_size, 1, **kwargs), + st_gcn(128, 128, kernel_size, 1, **kwargs), + st_gcn(128, 256, kernel_size, 2, **kwargs), + # st_gcn(256, 256, kernel_size, 1, **kwargs), + st_gcn(256, 256, kernel_size, 1, **kwargs), + )) + + # initialize parameters for edge importance weighting + if edge_importance_weighting: + self.edge_importance = nn.ParameterList([ + nn.Parameter(torch.ones(self.A.size()), requires_grad=True) + for _ in self.st_gcn_networks + ]) + else: + self.edge_importance = [1] * len(self.st_gcn_networks) + 
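+ # When edge_importance_weighting is enabled, each st_gcn block above owns a
+ # learnable (K, V, V) mask that forward() multiplies onto the adjacency as
+ # self.A * importance; otherwise the constant 1 leaves A unchanged.
+ # The 1x1 convolution below projects the 256-channel ST-GCN output to the
+ # requested embedding_layer_size before L2 normalisation.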
+ self.fcn = nn.Conv2d(256, embedding_layer_size, kernel_size=1) + + def forward(self, x, hint=None): + # data normalization + N, C, T, V = x.size() + x = x.permute(0, 3, 1, 2).contiguous() + x = x.view(N, V * C, T) + + x = self.data_bn(x) + x = x.view(N, V, C, T) + + x = x.permute(0, 2, 3, 1).contiguous() + x = x.view(N, C, T, V) + # forward + for gcn, importance in zip(self.st_gcn_networks, self.edge_importance): + x, _ = gcn(x, self.A * importance) + + # Adding average pooling as in the original model + x = F.avg_pool2d(x, x.size()[2:]) + + feature = self.fcn(x) + + # L2 normalization + feature = F.normalize(feature, dim=1, p=2) + + feature = feature.view(N, -1) + + return feature + + # Alias for model.forward() + def get_embedding(self, x): + return self.forward(x) + + +class st_gcn(nn.Module): + r"""Applies a spatial temporal graph convolution over an input graph sequence. + + Args: + in_channels (int): Number of channels in the input sequence data + out_channels (int): Number of channels produced by the convolution + kernel_size (tuple): Size of the temporal convolving kernel and graph convolving kernel + stride (int, optional): Stride of the temporal convolution. Default: 1 + dropout (int, optional): Dropout rate of the final output. Default: 0 + residual (bool, optional): If ``True``, applies a residual mechanism. Default: ``True`` + + Shape: + - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format + - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format + - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format + - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format + + where + :math:`N` is a batch size, + :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`, + :math:`T_{in}/T_{out}` is a length of input/output sequence, + :math:`V` is the number of graph nodes. 
+ + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dropout=0, + residual=True): + super().__init__() + + assert len(kernel_size) == 2 + assert kernel_size[0] % 2 == 1 + padding = ((kernel_size[0] - 1) // 2, 0) + + self.gcn = ConvTemporalGraphical(in_channels, out_channels, + kernel_size[1]) + + self.tcn = nn.Sequential( + nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True), + nn.Conv2d( + out_channels, + out_channels, + (kernel_size[0], 1), + (stride, 1), + padding, + ), + nn.BatchNorm2d(out_channels), + nn.Dropout(dropout, inplace=True), + ) + + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=(stride, 1)), + nn.BatchNorm2d(out_channels), + ) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, A): + + res = self.residual(x) + x, A = self.gcn(x, A) + x = self.tcn(x) + res + + return self.relu(x), A diff --git a/src/models/st_gcn/utils/__init__.py b/src/models/st_gcn/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/st_gcn/utils/graph.py b/src/models/st_gcn/utils/graph.py new file mode 100644 index 0000000..649a9e1 --- /dev/null +++ b/src/models/st_gcn/utils/graph.py @@ -0,0 +1,207 @@ +import numpy as np + + +class Graph: + """ The Graph to model the skeletons extracted by the openpose + + Args: + strategy (string): must be one of the follow candidates + - uniform: Uniform Labeling + - distance: Distance Partitioning + - spatial: Spatial Configuration + For more information, please refer to the section 'Partition Strategies' + in our paper (https://arxiv.org/abs/1801.07455). + + layout (string): must be one of the follow candidates + - openpose: Is consists of 18 joints. For more information, please + refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose#output + - ntu-rgb+d: Is consists of 25 joints. 
For more information, please + refer to https://github.com/shahroudy/NTURGB-D + + max_hop (int): the maximal distance between two connected nodes + dilation (int): controls the spacing between the kernel points + + """ + + def __init__(self, + layout='coco', + strategy='uniform', + max_hop=1, + dilation=1): + self.max_hop = max_hop + self.dilation = dilation + + self.get_edge(layout) + self.hop_dis = get_hop_distance( + self.num_node, self.edge, max_hop=max_hop) + self.get_adjacency(strategy) + + def __str__(self): + return self.A + + def get_edge(self, layout): + if layout == 'openpose': + self.num_node = 18 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), + (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1), + (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)] + self.edge = self_link + neighbor_link + self.center = 1 + elif layout == 'ntu-rgb+d': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), + (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), + (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), + (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), + (22, 23), (23, 8), (24, 25), (25, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_base] + self.edge = self_link + neighbor_link + self.center = 21 - 1 + elif layout == 'ntu_edge': + self.num_node = 24 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_base = [(1, 2), (3, 2), (4, 3), (5, 2), (6, 5), (7, 6), + (8, 7), (9, 2), (10, 9), (11, 10), (12, 11), + (13, 1), (14, 13), (15, 14), (16, 15), (17, 1), + (18, 17), (19, 18), (20, 19), (21, 22), (22, 8), + (23, 24), (24, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_base] + self.edge = self_link + neighbor_link + self.center = 2 + elif layout == 'coco': + # keypoints = { + # 0: "nose", + # 1: "left_eye", + # 2: "right_eye", + # 3: "left_ear", + # 4: "right_ear", + # 5: "left_shoulder", + # 6: "right_shoulder", + # 7: "left_elbow", + # 8: "right_elbow", + # 9: "left_wrist", + # 10: "right_wrist", + # 11: "left_hip", + # 12: "right_hip", + # 13: "left_knee", + # 14: "right_knee", + # 15: "left_ankle", + # 16: "right_ankle" + # } + self.num_node = 17 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_base = [(0,1), (0,2), (1,3), (2,4), (3,5), (4,6), (5,6), + (5,7), (7,9), (6,8), (8,10), (5,11), (6, 12), (11, 12), + (11, 13), (13, 15), (12, 14), (14, 16)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_base] + self.edge = self_link + neighbor_link + self.center = 0 + elif layout == 'nonlocal-coco': + self.num_node = 17 + self_link = [(i, i) for i in range(self.num_node)] + edge=[] + for i in range(0, self.num_node ): + for j in range(0, self.num_node ): + edge.append((i, j)) + self.edge = edge + self.center = 1 + # elif layout=='customer settings' + # pass + else: + raise ValueError("Do Not Exist This Layout.") + + def get_adjacency(self, strategy): + valid_hop = range(0, self.max_hop + 1, self.dilation) + adjacency = np.zeros((self.num_node, self.num_node)) + for hop in valid_hop: + adjacency[self.hop_dis == hop] = 1 + + normalize_adjacency = normalize_digraph(adjacency) + + if strategy == 'uniform': + A = np.zeros((1, self.num_node, self.num_node)) + A[0] = normalize_adjacency + self.A = A + elif strategy == 'distance': + A = np.zeros((len(valid_hop), self.num_node, self.num_node)) + for i, hop in enumerate(valid_hop): + A[i][self.hop_dis == hop] = normalize_adjacency[self.hop_dis == + hop] 
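+ # Distance partitioning: one (V, V) slice of A per hop distance in valid_hop,
+ # each filled from the degree-normalised adjacency computed above.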
+ self.A = A + elif strategy == 'spatial': + A = [] + for hop in valid_hop: + a_root = np.zeros((self.num_node, self.num_node)) + a_close = np.zeros((self.num_node, self.num_node)) + a_further = np.zeros((self.num_node, self.num_node)) + for i in range(self.num_node): + for j in range(self.num_node): + if self.hop_dis[j, i] == hop: + if self.hop_dis[j, self.center] == self.hop_dis[ + i, self.center]: + a_root[j, i] = normalize_adjacency[j, i] + elif self.hop_dis[j, self. + center] > self.hop_dis[i, self. + center]: + a_close[j, i] = normalize_adjacency[j, i] + else: + a_further[j, i] = normalize_adjacency[j, i] + if hop == 0: + A.append(a_root) + else: + A.append(a_root + a_close) + A.append(a_further) + A = np.stack(A) + self.A = A + else: + raise ValueError("Do Not Exist This Strategy") + + +def get_hop_distance(num_node, edge, max_hop=1): + A = np.zeros((num_node, num_node)) + for i, j in edge: + A[j, i] = 1 + A[i, j] = 1 + + # compute hop steps + hop_dis = np.zeros((num_node, num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + +def normalize_digraph(A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + +def normalize_undigraph(A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-0.5) + DAD = np.dot(np.dot(Dn, A), Dn) + return DAD + + +if __name__ == '__main__': + import matplotlib.pyplot as plt + + out = Graph(strategy='spatial', layout='coco').A + for a in out: + plt.imshow(a, cmap='gray') + plt.show() + print(out) + diff --git a/src/models/st_gcn/utils/tgcn.py b/src/models/st_gcn/utils/tgcn.py new file mode 100644 index 0000000..f51ae28 --- /dev/null +++ b/src/models/st_gcn/utils/tgcn.py @@ -0,0 +1,67 @@ +# The based unit of graph convolutional networks. + +import torch +import torch.nn as nn + + +class ConvTemporalGraphical(nn.Module): + + r"""The basic module for applying a graph convolution. + + Args: + in_channels (int): Number of channels in the input sequence data + out_channels (int): Number of channels produced by the convolution + kernel_size (int): Size of the graph convolving kernel + t_kernel_size (int): Size of the temporal convolving kernel + t_stride (int, optional): Stride of the temporal convolution. Default: 1 + t_padding (int, optional): Temporal zero-padding added to both sides of + the input. Default: 0 + t_dilation (int, optional): Spacing between temporal kernel elements. + Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. + Default: ``True`` + + Shape: + - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format + - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format + - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format + - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format + + where + :math:`N` is a batch size, + :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`, + :math:`T_{in}/T_{out}` is a length of input/output sequence, + :math:`V` is the number of graph nodes. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + t_kernel_size=1, + t_stride=1, + t_padding=0, + t_dilation=1, + bias=True): + super().__init__() + + self.kernel_size = kernel_size + self.conv = nn.Conv2d( + in_channels, + out_channels * kernel_size, + kernel_size=(t_kernel_size, 1), + padding=(t_padding, 0), + stride=(t_stride, 1), + dilation=(t_dilation, 1), + bias=bias) + + def forward(self, x, A): + assert A.size(0) == self.kernel_size + + x = self.conv(x) + + n, kc, t, v = x.size() + x = x.view(n, self.kernel_size, kc//self.kernel_size, t, v) + x = torch.einsum('nkctv,kvw->nctw', (x, A)) + + return x.contiguous(), A diff --git a/src/pose_estimator/README.md b/src/pose_estimator/README.md new file mode 100644 index 0000000..b0cb770 --- /dev/null +++ b/src/pose_estimator/README.md @@ -0,0 +1,2 @@ +## Human Pose Estimation: HRNet +This part is borrowed from [HRNet/HRNet-Human-Pose-Estimation](https://github.com/HRNet/HRNet-Human-Pose-Estimation) diff --git a/src/pose_estimator/config.py b/src/pose_estimator/config.py new file mode 100644 index 0000000..9433cc1 --- /dev/null +++ b/src/pose_estimator/config.py @@ -0,0 +1,155 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +import os + +from yacs.config import CfgNode as CN + + +_C = CN() + +_C.OUTPUT_DIR = '' +_C.LOG_DIR = '' +_C.DATA_DIR = '' +_C.GPUS = (0,) +_C.WORKERS = 4 +_C.PRINT_FREQ = 20 +_C.AUTO_RESUME = False +_C.PIN_MEMORY = True +_C.RANK = 0 + +# Cudnn related params +_C.CUDNN = CN() +_C.CUDNN.BENCHMARK = True +_C.CUDNN.DETERMINISTIC = False +_C.CUDNN.ENABLED = True + +# common params for NETWORK +_C.MODEL = CN() +_C.MODEL.NAME = 'pose_hrnet' +_C.MODEL.INIT_WEIGHTS = True +_C.MODEL.PRETRAINED = '' +_C.MODEL.NUM_JOINTS = 17 +_C.MODEL.TAG_PER_JOINT = True +_C.MODEL.TARGET_TYPE = 'gaussian' +_C.MODEL.IMAGE_SIZE = [256, 256] # width * height, ex: 192 * 256 +_C.MODEL.HEATMAP_SIZE = [64, 64] # width * height, ex: 24 * 32 +_C.MODEL.SIGMA = 2 +_C.MODEL.EXTRA = CN(new_allowed=True) + +_C.LOSS = CN() +_C.LOSS.USE_OHKM = False +_C.LOSS.TOPK = 8 +_C.LOSS.USE_TARGET_WEIGHT = True +_C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False + +# DATASET related params +_C.DATASET = CN() +_C.DATASET.ROOT = '' +_C.DATASET.DATASET = 'mpii' +_C.DATASET.TRAIN_SET = 'train' +_C.DATASET.TEST_SET = 'valid' +_C.DATASET.DATA_FORMAT = 'jpg' +_C.DATASET.HYBRID_JOINTS_TYPE = '' +_C.DATASET.SELECT_DATA = False + +# training data augmentation +_C.DATASET.FLIP = True +_C.DATASET.SCALE_FACTOR = 0.25 +_C.DATASET.ROT_FACTOR = 30 +_C.DATASET.PROB_HALF_BODY = 0.0 +_C.DATASET.NUM_JOINTS_HALF_BODY = 8 +_C.DATASET.COLOR_RGB = False + +# train +_C.TRAIN = CN() + +_C.TRAIN.LR_FACTOR = 0.1 +_C.TRAIN.LR_STEP = [90, 110] +_C.TRAIN.LR = 0.001 + +_C.TRAIN.OPTIMIZER = 'adam' +_C.TRAIN.MOMENTUM = 0.9 +_C.TRAIN.WD = 0.0001 +_C.TRAIN.NESTEROV = False +_C.TRAIN.GAMMA1 = 0.99 +_C.TRAIN.GAMMA2 = 0.0 + +_C.TRAIN.BEGIN_EPOCH = 0 +_C.TRAIN.END_EPOCH = 140 + +_C.TRAIN.RESUME = False +_C.TRAIN.CHECKPOINT = '' + +_C.TRAIN.BATCH_SIZE_PER_GPU = 32 +_C.TRAIN.SHUFFLE = True + +# testing +_C.TEST = CN() + +# size of images for each device +_C.TEST.BATCH_SIZE_PER_GPU = 32 +# Test Model Epoch +_C.TEST.FLIP_TEST = False +_C.TEST.POST_PROCESS = False +_C.TEST.SHIFT_HEATMAP = False + +_C.TEST.USE_GT_BBOX = False + +# nms +_C.TEST.IMAGE_THRE = 0.1 
+_C.TEST.NMS_THRE = 0.6 +_C.TEST.SOFT_NMS = False +_C.TEST.OKS_THRE = 0.5 +_C.TEST.IN_VIS_THRE = 0.0 +_C.TEST.COCO_BBOX_FILE = '' +_C.TEST.BBOX_THRE = 1.0 +_C.TEST.MODEL_FILE = '' + +# debug +_C.DEBUG = CN() +_C.DEBUG.DEBUG = False +_C.DEBUG.SAVE_BATCH_IMAGES_GT = False +_C.DEBUG.SAVE_BATCH_IMAGES_PRED = False +_C.DEBUG.SAVE_HEATMAPS_GT = False +_C.DEBUG.SAVE_HEATMAPS_PRED = False + + +def update_config(cfg, args): + cfg.defrost() + cfg.merge_from_file(args.cfg) + cfg.merge_from_list(args.opt) + + if args.modelDir: + cfg.OUTPUT_DIR = args.modelDir + + if args.logDir: + cfg.LOG_DIR = args.logDir + + if args.dataDir: + cfg.DATA_DIR = args.dataDir + + cfg.DATASET.ROOT = os.path.join( + cfg.DATA_DIR, cfg.DATASET.ROOT + ) + + cfg.MODEL.PRETRAINED = os.path.join( + cfg.DATA_DIR, cfg.MODEL.PRETRAINED + ) + + if cfg.TEST.MODEL_FILE: + cfg.TEST.MODEL_FILE = os.path.join( + cfg.DATA_DIR, cfg.TEST.MODEL_FILE + ) + + cfg.freeze() + + +if __name__ == '__main__': + import sys + with open(sys.argv[1], 'w') as f: + print(_C, file=f) + diff --git a/src/pose_estimator/inference-config.yaml b/src/pose_estimator/inference-config.yaml new file mode 100644 index 0000000..9e57cf2 --- /dev/null +++ b/src/pose_estimator/inference-config.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/src/pose_estimator/model_hrnet.py b/src/pose_estimator/model_hrnet.py new file mode 100644 index 0000000..4a0ebe3 --- /dev/null +++ b/src/pose_estimator/model_hrnet.py @@ -0,0 +1,497 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) 
Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +import os +import logging + +import torch +import torch.nn as nn + + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class HighResolutionModule(nn.Module): + def __init__(self, num_branches, blocks, num_blocks, num_inchannels, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_inchannels, num_channels) + + self.num_inchannels = num_inchannels + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(True) + + def _check_branches(self, num_branches, blocks, num_blocks, + num_inchannels, num_channels): + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( + num_branches, len(num_blocks)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( + num_branches, len(num_channels)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_inchannels): + error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( + num_branches, len(num_inchannels)) + logger.error(error_msg) + raise 
ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.num_inchannels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d( + num_channels[branch_index] * block.expansion, + momentum=BN_MOMENTUM + ), + ) + + layers = [] + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index], + stride, + downsample + ) + ) + self.num_inchannels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index] + ) + ) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels) + ) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + num_inchannels = self.num_inchannels + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_inchannels[i], + 1, 1, 0, bias=False + ), + nn.BatchNorm2d(num_inchannels[i]), + nn.Upsample(scale_factor=2**(j-i), mode='nearest') + ) + ) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i-j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3) + ) + ) + else: + num_outchannels_conv3x3 = num_inchannels[j] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3), + nn.ReLU(True) + ) + ) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + return self.num_inchannels + + def forward(self, x): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + else: + y = y + self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class PoseHighResolutionNet(nn.Module): + + def __init__(self, cfg, **kwargs): + self.inplanes = 64 + extra = cfg['MODEL']['EXTRA'] + super(PoseHighResolutionNet, self).__init__() + + # stem net + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self._make_layer(Bottleneck, 64, 4) + + self.stage2_cfg = 
extra['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition1 = self._make_transition_layer([256], num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + self.stage3_cfg = extra['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition2 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + self.stage4_cfg = extra['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition3 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=False) + + self.final_layer = nn.Conv2d( + in_channels=pre_stage_channels[0], + out_channels=cfg['MODEL']['NUM_JOINTS'], + kernel_size=extra['FINAL_CONV_KERNEL'], + stride=1, + padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0 + ) + + self.pretrained_layers = extra['PRETRAINED_LAYERS'] + + def _make_transition_layer( + self, num_channels_pre_layer, num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + nn.Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, 1, 1, bias=False + ), + nn.BatchNorm2d(num_channels_cur_layer[i]), + nn.ReLU(inplace=True) + ) + ) + else: + transition_layers.append(None) + else: + conv3x3s = [] + for j in range(i+1-num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = num_channels_cur_layer[i] \ + if j == i-num_branches_pre else inchannels + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + inchannels, outchannels, 3, 2, 1, bias=False + ), + nn.BatchNorm2d(outchannels), + nn.ReLU(inplace=True) + ) + ) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_inchannels, + multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last 
module
+            if not multi_scale_output and i == num_modules - 1:
+                reset_multi_scale_output = False
+            else:
+                reset_multi_scale_output = True
+
+            modules.append(
+                HighResolutionModule(
+                    num_branches,
+                    block,
+                    num_blocks,
+                    num_inchannels,
+                    num_channels,
+                    fuse_method,
+                    reset_multi_scale_output
+                )
+            )
+            num_inchannels = modules[-1].get_num_inchannels()
+
+        return nn.Sequential(*modules), num_inchannels
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+        x = self.layer1(x)
+
+        x_list = []
+        for i in range(self.stage2_cfg['NUM_BRANCHES']):
+            if self.transition1[i] is not None:
+                x_list.append(self.transition1[i](x))
+            else:
+                x_list.append(x)
+        y_list = self.stage2(x_list)
+
+        x_list = []
+        for i in range(self.stage3_cfg['NUM_BRANCHES']):
+            if self.transition2[i] is not None:
+                x_list.append(self.transition2[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage3(x_list)
+
+        x_list = []
+        for i in range(self.stage4_cfg['NUM_BRANCHES']):
+            if self.transition3[i] is not None:
+                x_list.append(self.transition3[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage4(x_list)
+
+        x = self.final_layer(y_list[0])
+
+        return x
+
+    def init_weights(self, pretrained=''):
+        logger.info('=> init weights from normal distribution')
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                nn.init.normal_(m.weight, std=0.001)
+                for name, _ in m.named_parameters():
+                    if name in ['bias']:
+                        nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.ConvTranspose2d):
+                nn.init.normal_(m.weight, std=0.001)
+                for name, _ in m.named_parameters():
+                    if name in ['bias']:
+                        nn.init.constant_(m.bias, 0)
+
+        if os.path.isfile(pretrained):
+            pretrained_state_dict = torch.load(pretrained)
+            logger.info('=> loading pretrained model {}'.format(pretrained))
+
+            need_init_state_dict = {}
+            for name, m in pretrained_state_dict.items():
+                if name.split('.')[0] in self.pretrained_layers \
+                        or self.pretrained_layers[0] == '*':
+                    need_init_state_dict[name] = m
+            self.load_state_dict(need_init_state_dict, strict=False)
+        elif pretrained:
+            logger.error('=> please download pre-trained models first!')
+            raise ValueError('{} does not exist!'.format(pretrained))
+
+
+def get_pose_net(cfg, is_train, **kwargs):
+    model = PoseHighResolutionNet(cfg, **kwargs)
+
+    if is_train and cfg['MODEL']['INIT_WEIGHTS']:
+        model.init_weights(cfg['MODEL']['PRETRAINED'])
+
+    return model
diff --git a/src/pose_estimator/pose_estimator_hrnet.py b/src/pose_estimator/pose_estimator_hrnet.py
new file mode 100644
index 0000000..c0a4ca2
--- /dev/null
+++ b/src/pose_estimator/pose_estimator_hrnet.py
@@ -0,0 +1,114 @@
+import argparse
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+
+from datasets.preparation import box_to_center_scale
+from pose_estimator import model_hrnet
+from pose_estimator.config import _C as config, update_config
+from utils import *
+
+
+class PoseEstimatorHRNet:
+    def __init__(self,
+                 config_path='inference-config.yaml',
+                 weights_path='../../models/pose_hrnet_w32_384x288.pth'):
+        self.config_path = config_path
+        self.weights_path = weights_path
+
+        # Options expected by update_config(), built locally so the class also works when imported.
+        args = argparse.Namespace()
+        args.cfg = self.config_path
+        args.opt = []
+        args.modelDir = ''
+        args.logDir = ''
+        args.dataDir = ''
+        args.prevModelDir = ''
+
+        update_config(config,
args) + self.config = config + + self.model = model_hrnet.get_pose_net(config, is_train=False) + self.model.load_state_dict(torch.load(weights_path), strict=False) + self.model = torch.nn.DataParallel(self.model).cuda() + + self.model.eval() # Set in evaluation mode + + def estimate_pose_from_image(self, img, box): + center, scale = box_to_center_scale(box, config.MODEL.IMAGE_SIZE[0], config.MODEL.IMAGE_SIZE[1]) + + rotation = 0 + + # pose estimation transformation + trans = get_affine_transform(center, scale, rotation, config.MODEL.IMAGE_SIZE) + model_input = cv2.warpAffine( + img, + trans, + (int(config.MODEL.IMAGE_SIZE[0]), int(config.MODEL.IMAGE_SIZE[1])), + flags=cv2.INTER_LINEAR) + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]) + + # pose estimation inference + input_img = transform(model_input).unsqueeze(0) + + with torch.no_grad(): + # compute output heatmap + output = self.model(input_img) + preds, _ = get_final_preds( + config, + output.clone().cpu().numpy(), + np.asarray([center]), + np.asarray([scale])) + + return preds + + +if __name__ == "__main__": + import argparse + import matplotlib.pyplot as plt + import matplotlib.patches as patches + from matplotlib.ticker import NullLocator + + parser = argparse.ArgumentParser() + # general + parser.add_argument('--cfg', type=str, default='inference-config.yaml') + parser.add_argument('opt', help='Modify config options using the command-line', default=None, + nargs=argparse.REMAINDER) + + args = parser.parse_args() + pose_estimator = PoseEstimatorHRNet() + + img = np.array(Image.open('../data/samples/messi.jpg')) + boxes = [ + [17.860302, 26.873545, 824.93115, 694.90466], + [1202.5271, 475.52982, 88.31201, 215.9581], + [648.0603, 104.8192, 492.93066, 621.0242] + ] + + # Create plot + plt.figure() + fig, ax = plt.subplots(1) + ax.imshow(img) + + for box in boxes: + pose_predictions = pose_estimator.estimate_pose_from_image(img, box) + for _, mat in enumerate(pose_predictions[0]): + x, y = int(mat[0]), int(mat[1]) + circle = patches.Circle((x, y), radius=5, linewidth=2, edgecolor=(1, 0, 0), facecolor="none") + # Add the pose points to the plot + ax.add_patch(circle) + + x1, y1, box_w, box_h = box + bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=(0, 1, 0), facecolor="none") + # Add the bbox to the plot + ax.add_patch(bbox) + + plt.axis("off") + plt.gca().xaxis.set_major_locator(NullLocator()) + plt.gca().yaxis.set_major_locator(NullLocator()) + plt.show() diff --git a/src/pose_estimator/utils.py b/src/pose_estimator/utils.py new file mode 100644 index 0000000..037d731 --- /dev/null +++ b/src/pose_estimator/utils.py @@ -0,0 +1,138 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +import math + +import numpy as np +import cv2 + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def get_affine_transform( + center, scale, rot, output_size, + shift=np.array([0, 0], dtype=np.float32), inv=0 +): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + print(scale) + scale = np.array([scale, scale]) + + scale_tmp = scale * 200.0 + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def get_max_preds(batch_heatmaps): + """ + get predictions from score maps + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + """ + assert isinstance(batch_heatmaps, np.ndarray), \ + 'batch_heatmaps should be numpy.ndarray' + assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = batch_heatmaps.shape[0] + num_joints = batch_heatmaps.shape[1] + width = batch_heatmaps.shape[3] + heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + return preds, maxvals + + +def get_final_preds(config, batch_heatmaps, center, scale): + coords, maxvals = get_max_preds(batch_heatmaps) + + heatmap_height = batch_heatmaps.shape[2] + heatmap_width = batch_heatmaps.shape[3] + + # post-processing + if config.TEST.POST_PROCESS: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = batch_heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1: + diff = np.array( + [ + hm[py][px+1] - hm[py][px-1], + 
hm[py+1][px]-hm[py-1][px] + ] + ) + coords[n][p] += np.sign(diff) * .25 + + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + preds[i] = transform_preds( + coords[i], center[i], scale[i], [heatmap_width, heatmap_height] + ) + + return preds, maxvals diff --git a/src/preparation/prepare_detection.py b/src/preparation/prepare_detection.py new file mode 100644 index 0000000..d600c4c --- /dev/null +++ b/src/preparation/prepare_detection.py @@ -0,0 +1,82 @@ +import csv + +import torch +from torch.utils.data import DataLoader + +from tqdm import tqdm + +from datasets import DatasetSimple +from detector.detector_yolov3 import DetectorYOLOv3 +from detector.detector_utils import preprocess_image +from detector.utils import non_max_suppression, rescale_boxes + +Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor + + +def detection(dataset_base_path, image_list, output_file): + dataset = DatasetSimple( + dataset_base_path, image_list, transform=preprocess_image + ) + data_loader = DataLoader(dataset, batch_size=30, shuffle=False, num_workers=8) + print(f"Data loaded: {len(data_loader)} batches") + + file = open(output_file, "w") + writer = csv.writer(file) + writer.writerow(["image_name", "x", "y", "w", "h"]) + + detector = DetectorYOLOv3( + model_def="../detector/config/yolov3.cfg", + weights_path="../../models/yolov3.weights", + ) + + human_candidates = dict() + for i, data in tqdm(enumerate(data_loader), total=len(data_loader)): + imgs = data[0].squeeze() + names = data[1] + + # Configure input + input_imgs = torch.autograd.Variable(imgs.type(Tensor)) + + # Get detections + with torch.no_grad(): + detections = detector.model(input_imgs) + detections = non_max_suppression( + detections, detector.conf_thres, detector.nms_thres + ) + + for j in range(imgs.shape[0]): + human_candidates[names[j]] = list() + if detections[j] is None: + continue + + detection = detections[j].data.cpu().numpy() + detection = rescale_boxes(detection, detector.img_size, (240, 320)) + + for x1, y1, x2, y2, conf, cls_conf, cls_pred in detection: + box_w = x2 - x1 + box_h = y2 - y1 + + if int(cls_pred) == 0: + human_candidates[names[j]].append([x1, y1, box_w, box_h]) + + if len(human_candidates[names[j]]) < 1: + print( + f"{names[j]}: Invalid detections ({len(human_candidates[names[j]])}), skipping" + ) + continue + + writer.writerow([names[j]] + human_candidates[names[j]][0]) + + file.close() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Detect people in dataset") + parser.add_argument("dataset_base_path") + parser.add_argument("image_list") + parser.add_argument("output_file") + + args = parser.parse_args() + detection(**vars(args)) diff --git a/src/preparation/prepare_pose_estimation.py b/src/preparation/prepare_pose_estimation.py new file mode 100644 index 0000000..5525d43 --- /dev/null +++ b/src/preparation/prepare_pose_estimation.py @@ -0,0 +1,77 @@ +import csv +import itertools + +import torch +from torch.utils.data import DataLoader +import torchvision.transforms as transforms + +from tqdm import tqdm + +from datasets import DatasetDetections, CropToBox +from pose_estimator.pose_estimator_hrnet import PoseEstimatorHRNet +from pose_estimator.utils import * +from visualization.utils import keypoints + +Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor + + +def pose_estimation(dataset_base_path, detection_list, output_file): + + pose_estimator = PoseEstimatorHRNet( + 
        config_path="../pose_estimator/inference-config.yaml",
+        weights_path="../../models/pose_hrnet_w32_384x288.pth",
+    )
+    transform_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+
+    dataset = DatasetDetections(
+        dataset_base_path,
+        detection_list,
+        sample_transform=CropToBox(pose_estimator.config),
+        transform=transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transform_normalize,
+            ]
+        ),
+    )
+    data_loader = DataLoader(dataset, batch_size=200, shuffle=False, num_workers=8)
+    print(f"Data loaded: {len(data_loader)} batches")
+
+    file = open(output_file, "w")
+    writer = csv.writer(file)
+    header = [[f"{k}_x", f"{k}_y", f"{k}_conf"] for k in keypoints.values()]
+    writer.writerow(["image_name"] + list(itertools.chain.from_iterable(header)))
+
+    poses = dict()
+    for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
+        imgs = data[0].squeeze()
+        names = data[1]
+        centers, scales = data[2]
+
+        with torch.no_grad():
+            # compute output heatmap
+            output = pose_estimator.model(imgs)
+            preds, maxvals = get_final_preds(
+                pose_estimator.config,
+                output.clone().cpu().numpy(),
+                np.asarray(centers),
+                np.asarray(scales),
+            )
+
+        result = np.append(preds, maxvals, axis=2)
+
+        for j in range(imgs.shape[0]):
+            poses[names[j]] = result[j]
+            writer.writerow([names[j]] + list(result[j].reshape(-1)))
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Detect poses in dataset")
+    parser.add_argument("dataset_base_path")
+    parser.add_argument("detection_list", nargs="?", default='../../data/casia-b_detections.csv')
+    parser.add_argument("output_file", nargs="?", default='../../data/casia-b_pose_coco.csv')
+
+    args = parser.parse_args()
+    pose_estimation(**vars(args))
diff --git a/src/preparation/split_casia-b.py b/src/preparation/split_casia-b.py
new file mode 100644
index 0000000..f54f412
--- /dev/null
+++ b/src/preparation/split_casia-b.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import csv
+from tqdm import tqdm
+
+# Split CASIA-B by subject id:
+# training/validation set: first 74 ids (1-59 train, 60-74 validation)
+# test set: remaining 50 ids (75-124)
+
+skeletons = pd.read_csv("../../data/casia-b_pose_coco.csv")
+
+header = list(skeletons)
+
+ids_train = list(range(1, 60))
+ids_valid = list(range(60, 75))
+ids_test = list(range(75, 125))
+
+balancing = {
+    "nm": 1,
+    "cl": 3,
+    "bg": 3
+}
+
+# Store the different sets in lists according to the indexes assigned above
+data = {"train": [], "valid": [], "train_valid": [], "test": [],
+        "train_balanced": [], "valid_balanced": [], "train_valid_balanced": []}
+
+for skeleton in tqdm(skeletons.values.tolist()):
+    label = skeleton[0].split('/')[1].split('-')
+    p_id = int(label[0])
+    p_ws = label[1]
+
+    if p_id in ids_train:
+        data["train"].append(skeleton)
+        for _ in range(balancing[p_ws]):
+            data["train_balanced"].append(skeleton)
+
+    if p_id in ids_valid:
+        data["valid"].append(skeleton)
+        for _ in range(balancing[p_ws]):
+            data["valid_balanced"].append(skeleton)
+
+    if p_id in ids_valid or p_id in ids_train:
+        data["train_valid"].append(skeleton)
+        for _ in range(balancing[p_ws]):
+            data["train_valid_balanced"].append(skeleton)
+
+    if p_id in ids_test:
+        data["test"].append(skeleton)
+
+for split, lines in data.items():
+    print(f"Saving {split}...")
+    with open(f"../../data/casia-b_pose_{split}.csv", "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        for line in lines:
+            writer.writerow(line)
diff --git a/src/train.py b/src/train.py
new file mode 100644
index 0000000..88420be
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,245 @@
+import
sys +import time + +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from torchvision import transforms +from ray import tune +from ray.tune.schedulers import HyperBandScheduler + +from datasets import dataset_factory +from datasets.augmentation import * +from datasets.graph import Graph +from evaluate import evaluate, _evaluate_casia_b +from losses import SupConLoss + +from common import * +from utils import AverageMeter + + +def train(train_loader, model, criterion, optimizer, scheduler, scaler, epoch, opt): + """one epoch training""" + model.train() + + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + + end = time.time() + for idx, (points, target) in enumerate(train_loader): + data_time.update(time.time() - end) + + points = torch.cat([points[0], points[1]], dim=0) + labels = target[0] + + if torch.cuda.is_available(): + points = points.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + bsz = labels.shape[0] + + with torch.cuda.amp.autocast(enabled=opt.use_amp): + # compute loss + features = model(points) + f1, f2 = torch.split(features, [bsz, bsz], dim=0) + features = torch.cat([f1.unsqueeze(1), f2.unsqueeze(1)], dim=1) + loss = criterion(features, labels) + + # update metric + losses.update(loss.item(), bsz) + + # SGD + scaler.scale(loss).backward() + scaler.step(optimizer) + scheduler.step() + scaler.update() + optimizer.zero_grad() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + # print info + if (idx + 1) % opt.log_interval == 0: + print( + f"Train: [{epoch}][{idx + 1}/{len(train_loader)}]\t" + f"BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + f"DT {data_time.val:.3f} ({data_time.avg:.3f})\t" + f"loss {losses.val:.3f} ({losses.avg:.3f})" + ) + sys.stdout.flush() + + return losses.avg + + +def main(opt): + opt = setup_environment(opt) + graph = Graph("coco") + + # Dataset + transform = transforms.Compose( + [ + MirrorPoses(opt.mirror_probability), + FlipSequence(opt.flip_probability), + RandomSelectSequence(opt.sequence_length), + ShuffleSequence(opt.shuffle), + PointNoise(std=opt.point_noise_std), + JointNoise(std=opt.joint_noise_std), + MultiInput(graph.connect_joint, opt.use_multi_branch), + ToTensor() + ], + ) + + dataset_class = dataset_factory(opt.dataset) + dataset = dataset_class( + opt.train_data_path, + train=True, + sequence_length=opt.sequence_length, + transform=TwoNoiseTransform(transform), + ) + + dataset_valid = dataset_class( + opt.valid_data_path, + sequence_length=opt.sequence_length, + transform=transforms.Compose( + [ + SelectSequenceCenter(opt.sequence_length), + MultiInput(graph.connect_joint, opt.use_multi_branch), + ToTensor() + ] + ), + ) + + train_loader = torch.utils.data.DataLoader( + dataset, + batch_size=opt.batch_size, + num_workers=opt.num_workers, + pin_memory=True, + shuffle=True, + ) + + val_loader = torch.utils.data.DataLoader( + dataset_valid, + batch_size=opt.batch_size_validation, + num_workers=opt.num_workers, + pin_memory=True, + ) + + # Model & criterion + model = get_model_resgcn(graph, opt) + criterion = SupConLoss(temperature=opt.temp) + + print("# parameters: ", count_parameters(model)) + + if opt.cuda: + model.cuda() + criterion.cuda() + + # Trainer + optimizer, scheduler, scaler = get_trainer(model, opt, len(train_loader)) + + # Load checkpoint or weights + load_checkpoint(model, optimizer, scheduler, scaler, opt) + + # Tensorboard + writer = SummaryWriter(log_dir=opt.tb_path) + + sample_input 
= torch.zeros(opt.batch_size, model_args["num_input"], model_args["num_channel"], + opt.sequence_length, graph.num_node).cuda() + writer.add_graph(model, input_to_model=sample_input) + + best_acc = 0 + loss = 0 + for epoch in range(opt.start_epoch, opt.epochs + 1): + # train for one epoch + time1 = time.time() + loss = train( + train_loader, model, criterion, optimizer, scheduler, scaler, epoch, opt + ) + + time2 = time.time() + print(f"epoch {epoch}, total time {time2 - time1:.2f}") + + # tensorboard logger + writer.add_scalar("loss/train", loss, epoch) + writer.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], epoch) + + # evaluation + result, accuracy_avg, sub_accuracies, dataframe = evaluate( + val_loader, model, opt.evaluation_fn, use_flip=True + ) + writer.add_text("accuracy/validation", dataframe.to_markdown(), epoch) + writer.add_scalar("accuracy/validation", accuracy_avg, epoch) + for key, sub_accuracy in sub_accuracies.items(): + writer.add_scalar(f"accuracy/validation/{key}", sub_accuracy, epoch) + + print(f"epoch {epoch}, avg accuracy {accuracy_avg:.4f}") + is_best = accuracy_avg > best_acc + if is_best: + best_acc = accuracy_avg + + if opt.tune: + tune.report(accuracy=accuracy_avg) + + if epoch % opt.save_interval == 0 or (is_best and epoch > opt.save_best_start * opt.epochs): + save_file = os.path.join(opt.save_folder, f"ckpt_epoch_{'best' if is_best else epoch}.pth") + save_model(model, optimizer, scheduler, scaler, opt, opt.epochs, save_file) + + # save the last model + save_file = os.path.join(opt.save_folder, "last.pth") + save_model(model, optimizer, scheduler, scaler, opt, opt.epochs, save_file) + + log_hyperparameter(writer, opt, best_acc, loss) + + print(f"best accuracy: {best_acc*100:.2f}") + + +def _inject_config(config): + opt_new = {k: config[k] if k in config.keys() else v for k, v in vars(opt).items()} + main(argparse.Namespace(**opt_new)) + + +def tune_(): + hyperband = HyperBandScheduler(metric="accuracy", mode="max") + + analysis = tune.run( + _inject_config, + config={}, + stop={"accuracy": 0.90, "training_iteration": 100}, + resources_per_trial={"gpu": 1}, + num_samples=10, + scheduler=hyperband + ) + + print("Best config: ", analysis.get_best_config(metric="accuracy", mode="max")) + + df = analysis.results_df + print(df) + + +if __name__ == "__main__": + import datetime + + opt = parse_option() + + date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + opt.model_name = f"{date}_{opt.dataset}_{opt.network_name}" \ + f"_lr_{opt.learning_rate}_decay_{opt.weight_decay}_bsz_{opt.batch_size}" + + if opt.exp_name: + opt.model_name += "_" + opt.exp_name + + opt.model_path = f"../save/supcon_{opt.dataset}_models" + opt.tb_path = f"../save/supcon_{opt.dataset}_tensorboard/{opt.model_name}" + + opt.save_folder = os.path.join(opt.model_path, opt.model_name) + if not os.path.isdir(opt.save_folder): + os.makedirs(opt.save_folder) + + opt.evaluation_fn = None + if opt.dataset == "casia-b": + opt.evaluation_fn = _evaluate_casia_b + + if opt.tune: + tune_() + else: + main(opt) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..0bdeab0 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,18 @@ + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count