diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f139efe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,54 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Project Specific
+models/*.weights
+models/*.pth
+data/*
+data/*.csv
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e189569
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Torben Teepe
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 43adf3c..4ba1a1f 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,27 @@
 This repository contains the PyTorch code for:
 GaitGraph: Graph Convolutional Network for Skeleton-Based Gait Recognition
 [Torben Teepe](https://github.com/tteepe), Ali Khan, Johannes Gilg, [Fabian Herzog](https://github.com/fubel)
+![Pipeline](images/pipeline.png)
+
+## Quick Start
+Quick Start & models coming soon!
+
+## Main Results
+Top-1 accuracy per probe angle, excluding identical-view cases, for the provided models on the
+[CASIA-B](http://www.cbsr.ia.ac.cn/english/Gait%20Databases.asp) dataset.
+
+|        |    0 |   18 |   36 |   54 |   72 |   90 |   108 |   126 |   144 |   162 |   180 |   mean |
+|:-------|-----:|-----:|-----:|-----:|-----:|-----:|------:|------:|------:|------:|------:|-------:|
+| NM#5-6 | 85.3 | 88.5 | 91.0 | 92.5 | 87.2 | 86.5 |  88.4 |  89.2 |  87.9 |  85.9 |  81.9 |   87.7 |
+| BG#1-2 | 75.8 | 76.7 | 75.9 | 76.1 | 71.4 | 73.9 |  78.0 |  74.7 |  75.4 |  75.4 |  69.2 |   74.8 |
+| CL#1-2 | 69.6 | 66.1 | 68.8 | 67.2 | 64.5 | 62.0 |  69.5 |  65.6 |  65.7 |  66.1 |  64.3 |   66.3 |
+
+## Licence & Acknowledgement
+GaitGraph itself is released under the MIT License (see LICENSE).
+
+The following parts of the code are borrowed from other projects. Thanks for their wonderful work!
+- Object Detector: [eriklindernoren/PyTorch-YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3)
+- Pose Estimator: [HRNet/HRNet-Human-Pose-Estimation](https://github.com/HRNet/HRNet-Human-Pose-Estimation)
+- ST-GCN Model: [yysijie/st-gcn](https://github.com/yysijie/st-gcn)
+- ResGCNv1 Model: [yfsong0709/ResGCNv1](https://github.com/yfsong0709/ResGCNv1)
+- SupCon Loss: [HobbitLong/SupContrast](https://github.com/HobbitLong/SupContrast)
diff --git a/images/pipeline.png b/images/pipeline.png
new file mode 100644
index 0000000..a49cb72
Binary files /dev/null and b/images/pipeline.png differ
diff --git a/models/download_weights.sh b/models/download_weights.sh
new file mode 100644
index 0000000..39aec9a
--- /dev/null
+++ b/models/download_weights.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Download weights for vanilla YOLOv3
+wget -c https://pjreddie.com/media/files/yolov3.weights
+# Download weights for tiny YOLOv3
+wget -c https://pjreddie.com/media/files/yolov3-tiny.weights
+## Download weights for backbone network
+#wget -c https://pjreddie.com/media/files/darknet53.conv.74
+
+echo "#############################################################"
+echo "######## Weights for HRNet Pose Estimation need to ##########"
+echo "######## be downloaded manually from here: ##########"
+echo "######## https://drive.google.com/drive/folders/1nzM_OBV9LbAEA7HClC0chEyf_7ECDXYA"
+echo "######## Files: pose_hrnet_*.pth ##########"
+echo "#############################################################"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..71200c6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+yacs==0.1.8
+numpy==1.19.5
+torch==1.7.1
+torchvision==0.8.2
+matplotlib==3.3.3
+tabulate==0.8.7
+tensorflow==2.4.0
+tensorboard==2.4.0
+pillow==8.1.0
+tqdm==4.56.0
+opencv-python~=4.5
+jupyter==1.0.0
+pandas==1.1.0
diff --git a/save/.gitkeep b/save/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/src/common.py b/src/common.py
new file mode 100644
index 0000000..6b7ba22
--- /dev/null
+++ b/src/common.py
@@ -0,0 +1,171 @@
+import os
+import argparse
+import torch
+from models.st_gcn.st_gcn import STGCNEmbedding
+import models.ResGCNv1
+
+
+def parse_option():
+    parser = argparse.ArgumentParser(description="Training model on gait sequence")
+    parser.add_argument("dataset", choices=["casia-b", "outdoor-gait", "tum-gaid"])
+    parser.add_argument("train_data_path", help="Path to train data CSV")
+    parser.add_argument("--valid_data_path", help="Path to validation data CSV")
+    parser.add_argument("--valid_split", type=float, default=0.2)
+
+    parser.add_argument("--checkpoint_path", help="Path to checkpoint to resume")
+    parser.add_argument("--weight_path", help="Path to weights for model")
+
+    # Optionals
+    parser.add_argument("--num_workers", type=int, default=8)
+    parser.add_argument(
+        "--gpus", default="0", help="-1 for CPU, use comma for multiple gpus"
+    )
+    parser.add_argument("--batch_size", type=int, default=64)
+    parser.add_argument("--batch_size_validation", type=int, default=64)
+    parser.add_argument("--epochs", type=int, default=500)
+    parser.add_argument("--start_epoch", type=int, default=1)
+    parser.add_argument("--log_interval", type=int, default=10)
+    parser.add_argument("--save_interval", type=int, default=50, help="save frequency")
+    parser.add_argument(
+        "--save_best_start", type=float, default=0.3, help="accuracy above which the best model is saved"
+    )
+    parser.add_argument("--use_amp", action="store_true")
+    parser.add_argument("--tune", action="store_true")
+
parser.add_argument("--shuffle", action="store_true") + parser.add_argument("--exp_name", help="Name of the experiment") + + parser.add_argument("--network_name", default="resgcn-n39-r4") + parser.add_argument("--sequence_length", type=int, default=60) + parser.add_argument("--embedding_layer_size", type=int, default=256) + parser.add_argument("--temporal_kernel_size", type=int, default=9) + parser.add_argument("--dropout", type=float, default=0.4) + parser.add_argument("--learning_rate", type=float, default=1e-3) + parser.add_argument( + "--lr_decay_rate", type=float, default=0.1, help="decay rate for learning rate" + ) + parser.add_argument("--point_noise_std", type=float, default=0.05) + parser.add_argument("--joint_noise_std", type=float, default=0.1) + parser.add_argument("--flip_probability", type=float, default=0.5) + parser.add_argument("--mirror_probability", type=float, default=0.5) + parser.add_argument("--weight_decay", type=float, default=1e-5) + parser.add_argument("--use_multi_branch", action="store_true") + parser.add_argument( + "--temp", type=float, default=0.07, help="temperature for loss function" + ) + opt = parser.parse_args() + + # Sanitize opts + opt.gpus_str = opt.gpus + opt.gpus = [int(gpu) for gpu in opt.gpus.split(",")] + + return opt + + +def log_hyperparameter(writer, opt, accuracy, loss): + writer.add_hparams( + { + "batch_size": opt.batch_size, + "sequence_length": opt.sequence_length, + "embedding_layer_size": opt.embedding_layer_size, + "dropout": opt.dropout, + "learning_rate": opt.learning_rate, + "lr_decay_rate": opt.lr_decay_rate, + "point_noise_std": opt.point_noise_std, + "weight_decay": opt.weight_decay, + "temp": opt.temp, + }, + { + "hparam/accuracy": accuracy, + "hparam/loss": loss, + }, + ) + + +def setup_environment(opt): + # HACK: Fix tensorboard + import tensorflow as tf + import tensorboard as tb + + tf.io.gfile = tb.compat.tensorflow_stub.io.gfile + + os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpus_str + opt.cuda = opt.gpus[0] >= 0 + torch.device("cuda" if opt.cuda else "cpu") + + return opt + + +def get_model_stgcn(opt): + # Model + input_channels = 3 + edge_importance_weighting = True + graph_args = {"strategy": "spatial"} + + embedding_net = STGCNEmbedding( + input_channels, + graph_args, + edge_importance_weighting=edge_importance_weighting, + embedding_layer_size=opt.embedding_layer_size, + temporal_kernel_size=opt.temporal_kernel_size, + dropout=opt.dropout, + ) + + return embedding_net + + +def get_model_resgcn(graph, opt): + model_args = { + "A": torch.tensor(graph.A, dtype=torch.float32, requires_grad=False), + "num_class": opt.embedding_layer_size, + "num_input": 1 if not opt.use_multi_branch else 3, + "num_channel": 3 if not opt.use_multi_branch else 6, + "parts": graph.parts, + } + return models.ResGCNv1.create(opt.network_name, **model_args) + + +def get_trainer(model, opt, steps_per_epoch): + optimizer = torch.optim.Adam( + model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay + ) + scheduler = torch.optim.lr_scheduler.OneCycleLR( + optimizer, opt.learning_rate, epochs=opt.epochs, steps_per_epoch=steps_per_epoch + ) + scaler = torch.cuda.amp.GradScaler(enabled=opt.use_amp) + + return optimizer, scheduler, scaler + + +def load_checkpoint(model, optimizer, scheduler, scaler, opt): + if opt.checkpoint_path is not None: + checkpoint = torch.load(opt.checkpoint_path) + model.load_state_dict(checkpoint["model"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + scheduler.load_state_dict(checkpoint["scheduler"]) 
+ scaler.load_state_dict(checkpoint["scaler"]) + opt.start_epoch = checkpoint["epoch"] + + if opt.weight_path is not None: + checkpoint = torch.load(opt.weight_path) + model.load_state_dict(checkpoint["model"], strict=False) + + +def save_model(model, optimizer, scheduler, scaler, opt, epoch, save_file): + print("==> Saving...") + state = { + "opt": opt, + "model": model.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "scaler": scaler.state_dict(), + "epoch": epoch, + } + torch.save(state, save_file) + del state + + +def count_parameters(model): + """ + Useful function to compute number of parameters in a model. + """ + return sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py new file mode 100644 index 0000000..149424a --- /dev/null +++ b/src/datasets/__init__.py @@ -0,0 +1,11 @@ +from .preparation import DatasetSimple, DatasetDetections +from .gait import ( + CasiaBPose, +) + + +def dataset_factory(name): + if name == "casia-b": + return CasiaBPose + + raise ValueError() diff --git a/src/datasets/augmentation.py b/src/datasets/augmentation.py new file mode 100644 index 0000000..be2d495 --- /dev/null +++ b/src/datasets/augmentation.py @@ -0,0 +1,282 @@ +import numpy as np +import cv2 +import torch + +from pose_estimator.utils import get_affine_transform + + +class ToTensor(object): + def __call__(self, data): + return torch.tensor(data, dtype=torch.float) + + +class MultiInput(object): + def __init__(self, connect_joint, enabled=False): + self.connect_joint = connect_joint + self.enabled = enabled + + def __call__(self, data): + # (C, T, V) -> (I, C * 2, T, V) + data = np.transpose(data, (2, 0, 1)) + + if not self.enabled: + return data[np.newaxis, ...] 
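+        # Multi-branch input (as in ResGCN): build joint, velocity and bone branches,
+        # each with 2 * C channels, giving an array of shape (3, 2 * C, T, V).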
+
+        C, T, V = data.shape
+        data_new = np.zeros((3, C * 2, T, V))
+        # Joints
+        data_new[0, :C, :, :] = data
+        for i in range(V):
+            data_new[0, C:, :, i] = data[:, :, i] - data[:, :, 1]
+        # Velocity
+        for i in range(T - 2):
+            data_new[1, :C, i, :] = data[:, i + 1, :] - data[:, i, :]
+            data_new[1, C:, i, :] = data[:, i + 2, :] - data[:, i, :]
+        # Bones
+        for i in range(len(self.connect_joint)):
+            data_new[2, :C, :, i] = data[:, :, i] - data[:, :, self.connect_joint[i]]
+        bone_length = 0
+        for i in range(C - 1):
+            bone_length += np.power(data_new[2, i, :, :], 2)
+        bone_length = np.sqrt(bone_length) + 0.0001
+        for i in range(C - 1):
+            data_new[2, C + i, :, :] = np.arccos(data_new[2, i, :, :] / bone_length)
+
+        return data_new
+
+
+class FlipSequence(object):
+    def __init__(self, probability=0.5):
+        self.probability = probability
+
+    def __call__(self, data):
+        if np.random.random() <= self.probability:
+            return np.flip(data, axis=0).copy()
+        return data
+
+
+class MirrorPoses(object):
+    def __init__(self, probability=0.5):
+        self.probability = probability
+
+    def __call__(self, data):
+        if np.random.random() <= self.probability:
+            center = np.mean(data[:, :, 0], axis=1, keepdims=True)
+            data[:, :, 0] = center - data[:, :, 0] + center
+
+        return data
+
+
+class RandomSelectSequence(object):
+    def __init__(self, sequence_length=10):
+        self.sequence_length = sequence_length
+
+    def __call__(self, data):
+        try:
+            start = np.random.randint(0, data.shape[0] - self.sequence_length)
+        except ValueError:
+            print(data.shape[0])
+            raise ValueError
+        end = start + self.sequence_length
+        return data[start:end]
+
+
+class SelectSequenceCenter(object):
+    def __init__(self, sequence_length=10):
+        self.sequence_length = sequence_length
+
+    def __call__(self, data):
+        try:
+            start = int((data.shape[0] / 2) - (self.sequence_length / 2))
+        except ValueError:
+            print(data.shape[0])
+            raise ValueError
+        end = start + self.sequence_length
+        return data[start:end]
+
+
+class ShuffleSequence(object):
+    def __init__(self, enabled=False):
+        self.enabled = enabled
+
+    def __call__(self, data):
+        if self.enabled:
+            np.random.shuffle(data)
+        return data
+
+
+class TwoNoiseTransform(object):
+    """Create two augmented views of the same sequence"""
+    def __init__(self, transform):
+        self.transform = transform
+
+    def __call__(self, x):
+        return [self.transform(x), self.transform(x)]
+
+
+class PointNoise(object):
+    """
+    Add Gaussian noise to pose points
+    std: standard deviation
+    """
+
+    def __init__(self, std=0.15):
+        self.std = std
+
+    def __call__(self, data):
+        noise = np.random.normal(0, self.std, data.shape).astype(np.float32)
+        return data + noise
+
+
+class JointNoise(object):
+    """
+    Add Gaussian noise to joint
+    std: standard deviation
+    """
+
+    def __init__(self, std=0.5):
+        self.std = std
+
+    def __call__(self, data):
+        # T, V, C
+        noise = np.hstack((
+            np.random.normal(0, 0.25, (data.shape[1], 2)),
+            np.zeros((data.shape[1], 1))
+        )).astype(np.float32)
+
+        return data + np.repeat(noise[np.newaxis, ...], data.shape[0], axis=0)
+
+
+class DropOutFrames(object):
+    """
+    Type of data augmentation. Randomly drop frames from a sequence.
+    Properties:
+     probability: Probability used when deciding whether to drop a frame.
+     sequence_length: Minimum number of frames to keep in the sequence.
+ """ + + def __init__(self, probability=0.1, sequence_length=60): + self.probability = probability + self.sequence_length = sequence_length + + def __call__(self, data): + T, V, C = data.shape + + new_data = [] + dropped = 0 + for i in range(T): + if np.random.random() <= self.probability: + new_data.append(data[i]) + else: + dropped += 1 + if T - dropped <= self.sequence_length: + break + + for j in range(i, T): + new_data.append(data[j]) + + return np.array(new_data) + + +class DropOutJoints(object): + """ + Type of data augmentation. Zero joints randomly from a pose. + Properties: + dropout_rate_range: + prob: Probability that this technique is applied on a sample. + """ + + def __init__( + self, prob=1, dropout_rate_range=0.1, + ): + self.dropout_rate_range = dropout_rate_range + self.prob = prob + + def __call__(self, data): + if np.random.binomial(1, self.prob, 1) != 1: + return data + + T, V, C = data.shape + data = data.reshape(T * V, C) + # Choose the dropout_rate randomly for every sample from 0 - dropout range + dropout_rate = np.random.uniform(0, self.dropout_rate_range, 1) + zero_indices = 1 - np.random.binomial(1, dropout_rate, T * V) + for i in range(3): + data[:, i] = zero_indices * data[:, i] + data = data.reshape(T, V, C) + return data + + +class InterpolateFrames(object): + """ + Type of data augmentation. Create more frames between adjacent frames by interpolation + """ + + def __init__(self, probability=0.1): + """ + :param probability: The probability with which this augmentation technique will be applied + """ + self.probability = probability + + def __call__(self, data): + # data shape is T,V,C = Frames, Joints, Channels (X,Y,conf) + T, V, C = data.shape + + # interpolated_data = np.zeros((T + T - 1, V, C), dtype=np.float32) + interpolated_data = [] + for i in range(T): + # Add original frame + interpolated_data.append(data[i]) + + # Skip last + if i == T - 1: + break + + if np.random.random() <= self.probability: + continue + + # Calculate difference between x and y points of each joint of current frame and current frame plus 1 + x_difference = data[i + 1, :, 0] - data[i, :, 0] + y_difference = data[i + 1, :, 1] - data[i, :, 1] + + new_frame_x = ( + data[i, :, 0] + (x_difference * np.random.normal(0.5, 1)) + ) + new_frame_y = ( + data[i, :, 1] + (y_difference * np.random.normal(0.5, 1)) + ) + # Take average of conf of current and next frame to find the conf of the interpolated frame + new_frame_conf = (data[i + 1, :, 2] + data[i, :, 2]) / 2 + interpolated_frame = np.array( + [new_frame_x, new_frame_y, new_frame_conf] + ).transpose() + + interpolated_data.append(interpolated_frame) + + return np.array(interpolated_data) + + +class CropToBox(object): + """Crop image to detection box + """ + + def __init__(self, config): + self.config = config + + def __call__(self, img, center, scale): + rotation = 0 + # pose estimation transformation + trans = get_affine_transform( + center, scale, rotation, self.config.MODEL.IMAGE_SIZE + ) + model_input = cv2.warpAffine( + np.array(img), + trans, + ( + int(self.config.MODEL.IMAGE_SIZE[0]), + int(self.config.MODEL.IMAGE_SIZE[1]), + ), + flags=cv2.INTER_LINEAR, + ) + + return model_input diff --git a/src/datasets/gait.py b/src/datasets/gait.py new file mode 100644 index 0000000..7753d51 --- /dev/null +++ b/src/datasets/gait.py @@ -0,0 +1,129 @@ +import numpy as np +from torch.utils.data import Dataset + + +class PoseDataset(Dataset): + """ + Args: + data_list_path (string): Path to pose data. 
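+            Each row is expected to hold a filename followed by 51 pose values
+            (17 joints x (x, y, confidence)); malformed rows are skipped.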
sequence_length: Number of pose frames returned for each data point.
+        train: Whether this is the training split (default: True).
+        transform: Transformation applied to the pose data.
+        target_transform: Transformation applied to the target.
+    """
+
+    def __init__(
+        self,
+        data_list_path,
+        sequence_length=1,
+        train=True,
+        transform=None,
+        target_transform=None,
+    ):
+        super(PoseDataset, self).__init__()
+        self.data_list = np.loadtxt(data_list_path, skiprows=1, dtype=str)
+        self.sequence_length = sequence_length
+        self.train = train
+
+        self.transform = transform
+        self.target_transform = target_transform
+
+        self.data_dict = {}
+
+        for row in self.data_list:
+            row = row.split(",")
+
+            target, frame_num = self._filename_to_target(row[0])
+
+            if target not in self.data_dict:
+                self.data_dict[target] = {}
+
+            if len(row[1:]) != 51:
+                print("Invalid pose data for: ", target, ", frame: ", frame_num)
+                continue
+            # Skip the frame if the joint values cannot be parsed.
+            try:
+                self.data_dict[target][frame_num] = np.array(
+                    row[1:], dtype=np.float32
+                ).reshape((-1, 3))
+            except ValueError:
+                print("Invalid pose data for: ", target, ", frame: ", frame_num)
+                continue
+
+        # Remove samples that have fewer than sequence_length + 1 frames.
+        for target, sequence in self.data_dict.copy().items():
+            if len(sequence) < self.sequence_length + 1:
+                del self.data_dict[target]
+
+        self.targets = list(self.data_dict.keys())
+
+        self.data = list(self.data_dict.values())
+
+    def _filename_to_target(self, filename):
+        raise NotImplementedError()
+
+    def __len__(self):
+        return len(self.targets)
+
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+
+        Returns:
+            tuple: (pose, target) where target identifies the sample (e.g. subject id and sequence metadata).
+        """
+        target = self.targets[index]
+        data = np.stack(list(self.data[index].values()))
+
+        if self.transform is not None:
+            data = self.transform(data)
+
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+
+        return data, target
+
+    def get_num_classes(self):
+        """
+        Returns the number of unique ids present in the dataset. Useful for classification networks.
+        """
+        if type(self.targets[0]) == int:
+            classes = set(self.targets)
+        else:
+            classes = set([target[0] for target in self.targets])
+        num_classes = len(classes)
+        return num_classes
+
+
+class CasiaBPose(PoseDataset):
+    """
+    CASIA-B Dataset
+    The format of the video filename in Dataset B is 'xxx-mm-nn-ttt.avi', where
+    xxx: subject id, from 001 to 124.
+    mm: walking status, can be 'nm' (normal), 'cl' (in a coat) or 'bg' (with a bag).
+    nn: sequence number.
+    ttt: view angle, can be '000', '018', ..., '180'.
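+    For example (illustrative path), a pose CSV row whose filename is
+    'frames/001-nm-01-090/000050.jpg' is parsed by _filename_to_target into the
+    target (subject 1, walking status 'nm' -> 0, sequence 1, view 90) and frame number 50.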
+ """ + + mapping_walking_status = { + 'nm': 0, + 'bg': 1, + 'cl': 2, + } + + def _filename_to_target(self, filename): + _, sequence_id, frame = filename.split("/") + subject_id, walking_status, sequence_num, view_angle = sequence_id.split("-") + walking_status = self.mapping_walking_status[walking_status] + return ( + (int(subject_id), int(walking_status), int(sequence_num), int(view_angle)), + int(frame[:-4]), + ) + + +class KinectGait(PoseDataset): + def _filename_to_target(self, filename): + subject_id, sequence_num, frame = filename.split("-") + return (int(subject_id), int(sequence_num)), int(frame) diff --git a/src/datasets/graph.py b/src/datasets/graph.py new file mode 100644 index 0000000..253cf57 --- /dev/null +++ b/src/datasets/graph.py @@ -0,0 +1,187 @@ +import logging, numpy as np + + +# Thanks to YAN Sijie for the released code on Github (https://github.com/yysijie/st-gcn) +class Graph(): + def __init__(self, dataset, max_hop=3, dilation=1): + self.dataset = dataset.split('-')[0] + self.max_hop = max_hop + self.dilation = dilation + + # get edges + self.num_node, self.edge, self.connect_joint, self.parts = self._get_edge() + + # get adjacency matrix + self.A = self._get_adjacency() + + def __str__(self): + return self.A + + def _get_edge(self): + if self.dataset == 'kinetics': + num_node = 18 + neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), + (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1), + (0, 1), (15, 0), (14, 0), (17, 15), (16, 14), (8, 11)] + connect_joint = np.array([1,1,1,2,3,1,5,6,2,8,9,5,11,12,0,0,14,15]) + parts = [ + np.array([5, 6, 7]), # left_arm + np.array([2, 3, 4]), # right_arm + np.array([11, 12, 13]), # left_leg + np.array([8, 9, 10]), # right_leg + np.array([0, 1, 14, 15, 16, 17]) # torso + ] + elif self.dataset == 'ntu': + num_node = 25 + neighbor_1base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), + (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), + (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), + (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), + (22, 23), (23, 8), (24, 25), (25, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([2,2,21,3,21,5,6,7,21,9,10,11,1,13,14,15,1,17,18,19,2,23,8,25,12]) - 1 + parts = [ + np.array([5, 6, 7, 8, 22, 23]) - 1, # left_arm + np.array([9, 10, 11, 12, 24, 25]) - 1, # right_arm + np.array([13, 14, 15, 16]) - 1, # left_leg + np.array([17, 18, 19, 20]) - 1, # right_leg + np.array([1, 2, 3, 4, 21]) - 1 # torso + ] + elif self.dataset == 'sysu': + num_node = 20 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (3, 5), (5, 6), + (6, 7), (7, 8), (3, 9), (9, 10), (10, 11), + (11, 12), (1, 13), (13, 14), (14, 15), (15, 16), + (1, 17), (17, 18), (18, 19), (19, 20)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([2,2,2,3,3,5,6,7,3,9,10,11,1,13,14,15,1,17,18,19]) - 1 + parts = [ + np.array([5, 6, 7, 8]) - 1, # left_arm + np.array([9, 10, 11, 12]) - 1, # right_arm + np.array([13, 14, 15, 16]) - 1, # left_leg + np.array([17, 18, 19, 20]) - 1, # right_leg + np.array([1, 2, 3, 4]) - 1 # torso + ] + elif self.dataset == 'ucla': + num_node = 20 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (3, 5), (5, 6), + (6, 7), (7, 8), (3, 9), (9, 10), (10, 11), + (11, 12), (1, 13), (13, 14), (14, 15), (15, 16), + (1, 17), (17, 18), (18, 19), (19, 20)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([2,2,2,3,3,5,6,7,3,9,10,11,1,13,14,15,1,17,18,19]) - 1 + parts = [ + np.array([5, 6, 7, 8]) - 1, # left_arm + 
np.array([9, 10, 11, 12]) - 1, # right_arm + np.array([13, 14, 15, 16]) - 1, # left_leg + np.array([17, 18, 19, 20]) - 1, # right_leg + np.array([1, 2, 3, 4]) - 1 # torso + ] + elif self.dataset == 'cmu': + num_node = 26 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (5, 6), (6, 7), + (7, 8), (1, 9), (5, 9), (9, 10), (10, 11), + (11, 12), (12, 13), (13, 14), (12, 15), (15, 16), + (16, 17), (17, 18), (18, 19), (17, 20), (12, 21), + (21, 22), (22, 23), (23, 24), (24, 25), (23, 26)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([9,1,2,3,9,5,6,7,10,10,10,11,12,13,12,15,16,17,18,17,12,21,22,23,24,23]) - 1 + parts = [ + np.array([15, 16, 17, 18, 19, 20]) - 1, # left_arm + np.array([21, 22, 23, 24, 25, 26]) - 1, # right_arm + np.array([1, 2, 3, 4]) - 1, # left_leg + np.array([5, 6, 7, 8]) - 1, # right_leg + np.array([9, 10, 11, 12, 13, 14]) - 1 # torso + ] + elif self.dataset == 'h36m': + num_node = 20 + neighbor_1base = [(1, 2), (2, 3), (3, 4), (5, 6), (6, 7), + (7, 8), (1, 9), (5, 9), (9, 10), (10, 11), + (11, 12), (10, 13), (13, 14), (14, 15), (15, 16), + (10, 17), (17, 18), (18, 19), (19, 20)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_1base] + connect_joint = np.array([9,1,2,3,9,5,6,7,9,9,10,11,10,13,14,15,10,17,18,19]) - 1 + parts = [ + np.array([13, 14, 15, 16]) - 1, # left_arm + np.array([17, 18, 19, 20]) - 1, # right_arm + np.array([1, 2, 3, 4]) - 1, # left_leg + np.array([5, 6, 7, 8]) - 1, # right_leg + np.array([9, 10, 11, 12]) - 1 # torso + ] + elif self.dataset == 'coco': + # keypoints = { + # 0: "nose", + # 1: "left_eye", + # 2: "right_eye", + # 3: "left_ear", + # 4: "right_ear", + # 5: "left_shoulder", + # 6: "right_shoulder", + # 7: "left_elbow", + # 8: "right_elbow", + # 9: "left_wrist", + # 10: "right_wrist", + # 11: "left_hip", + # 12: "right_hip", + # 13: "left_knee", + # 14: "right_knee", + # 15: "left_ankle", + # 16: "right_ankle" + # } + num_node = 17 + self_link = [(i, i) for i in range(num_node)] + neighbor_link = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 6), + (5, 7), (7, 9), (6, 8), (8, 10), (5, 11), (6, 12), (11, 12), + (11, 13), (13, 15), (12, 14), (14, 16)] + self.edge = self_link + neighbor_link + self.center = 0 + connect_joint = np.array([5,0,0,1,2,0,0,5,6,7,8,5,6,11,12,13,14]) + parts = [ + np.array([5, 7, 9]), # left_arm + np.array([6, 8, 10]), # right_arm + np.array([11, 13, 15]), # left_leg + np.array([12, 14, 16]), # right_leg + np.array([5, 6, 11, 12, 0, 1, 2, 3, 4]), # torso + head + ] + else: + num_node, neighbor_link, connect_joint, parts = 0, [], [], [] + logging.info('') + logging.error('Error: Do NOT exist this dataset: {}!'.format(self.dataset)) + raise ValueError() + self_link = [(i, i) for i in range(num_node)] + edge = self_link + neighbor_link + return num_node, edge, connect_joint, parts + + def _get_hop_distance(self): + A = np.zeros((self.num_node, self.num_node)) + for i, j in self.edge: + A[j, i] = 1 + A[i, j] = 1 + hop_dis = np.zeros((self.num_node, self.num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(self.max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(self.max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + def _get_adjacency(self): + hop_dis = self._get_hop_distance() + valid_hop = range(0, self.max_hop + 1, self.dilation) + adjacency = np.zeros((self.num_node, self.num_node)) + for hop in valid_hop: + adjacency[hop_dis == hop] = 1 + normalize_adjacency = self._normalize_digraph(adjacency) + A = 
np.zeros((len(valid_hop), self.num_node, self.num_node)) + for i, hop in enumerate(valid_hop): + A[i][hop_dis == hop] = normalize_adjacency[hop_dis == hop] + return A + + def _normalize_digraph(self, A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD diff --git a/src/datasets/preparation.py b/src/datasets/preparation.py new file mode 100644 index 0000000..4c4e703 --- /dev/null +++ b/src/datasets/preparation.py @@ -0,0 +1,99 @@ +import os + +import numpy as np +from PIL import Image +from torch.utils.data import Dataset + + +class DatasetSimple(Dataset): + """ + Args: + root (string): Root directory path. + frame_list_path (string): Frame list path. + transform (callable, optional): A function/transform that takes in an PIL image + and returns a transformed version. + E.g, ``transforms.RandomCrop`` for images. + sample_transform (callable, optional): A function/transform that takes + in the target and transforms it. + """ + + def __init__(self, root, frame_list_path, transform=None, sample_transform=None): + self.root = root + self.frame_list = np.loadtxt(frame_list_path, skiprows=1, dtype=str) + self.transform = transform + self.sample_transform = sample_transform + + def __len__(self): + return len(self.frame_list) + + def __getitem__(self, index): + image_name = self.frame_list[index] + image_path = os.path.join(self.root, image_name) + + with open(image_path, "rb") as f: + img = Image.open(f) + img.convert("RGB") + + if self.transform: + img = self.transform(img) + + return img, image_name + + +def box_to_center_scale(box, model_image_width, model_image_height): + """convert a box to center,scale information required for pose transformation + Parameters + ---------- + box : list | ndarray + model_image_width : int + model_image_height : int + + Returns + ------- + (numpy array, numpy array) + Two numpy arrays, coordinates for the center of the box and the scale of the box + """ + center = np.zeros(2, dtype=np.float32) + + top_left_corner = box[0:2] + box_width = box[2] + box_height = box[3] + center[0] = top_left_corner[0] + box_width * 0.5 + center[1] = top_left_corner[1] + box_height * 0.5 + + aspect_ratio = model_image_width * 1.0 / model_image_height + pixel_std = 200 + + if box_width > aspect_ratio * box_height: + box_height = box_width * 1.0 / aspect_ratio + elif box_width < aspect_ratio * box_height: + box_width = box_height * aspect_ratio + scale = np.array( + [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std], + dtype=np.float32) + if center[0] != -1: + scale = scale * 1.25 + + return center, scale + + +class DatasetDetections(DatasetSimple): + def __getitem__(self, index): + frame_info = self.frame_list[index].split(",") + image_name = frame_info[0] + image_path = os.path.join(self.root, image_name) + + box = np.array(frame_info[1:], dtype=np.float32) + center, scale = box_to_center_scale(box, 288, 384) + + with open(image_path, "rb") as f: + img = Image.open(f) + img.convert("RGB") + + if self.sample_transform: + img = self.sample_transform(img, center, scale) + + if self.transform: + img = self.transform(img) + + return img, image_name, (center, scale) diff --git a/src/detector/README.md b/src/detector/README.md new file mode 100644 index 0000000..71d2a4a --- /dev/null +++ b/src/detector/README.md @@ -0,0 +1,2 @@ +## Detector PyTorch-YOLOv3 +This part is borrowed from 
[eriklindernoren/PyTorch-YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) diff --git a/src/detector/config/yolov3-tiny.cfg b/src/detector/config/yolov3-tiny.cfg new file mode 100644 index 0000000..ade4969 --- /dev/null +++ b/src/detector/config/yolov3-tiny.cfg @@ -0,0 +1,206 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=2 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +# 0 +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +# 1 +[maxpool] +size=2 +stride=2 + +# 2 +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# 3 +[maxpool] +size=2 +stride=2 + +# 4 +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +# 5 +[maxpool] +size=2 +stride=2 + +# 6 +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +# 7 +[maxpool] +size=2 +stride=2 + +# 8 +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +# 9 +[maxpool] +size=2 +stride=2 + +# 10 +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +# 11 +[maxpool] +size=2 +stride=1 + +# 12 +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +# 13 +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +# 14 +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +# 15 +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +# 16 +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +# 17 +[route] +layers = -4 + +# 18 +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +# 19 +[upsample] +stride=2 + +# 20 +[route] +layers = -1, 8 + +# 21 +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +# 22 +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +# 23 +[yolo] +mask = 1,2,3 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/src/detector/config/yolov3.cfg b/src/detector/config/yolov3.cfg new file mode 100644 index 0000000..946e015 --- /dev/null +++ b/src/detector/config/yolov3.cfg @@ -0,0 +1,788 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=16 +subdivisions=1 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + 
+[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 
3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/src/detector/detector_yolov3.py b/src/detector/detector_yolov3.py new file mode 100644 index 0000000..f1af80b --- /dev/null +++ b/src/detector/detector_yolov3.py @@ -0,0 +1,98 @@ +import argparse +from PIL import Image + +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from matplotlib.ticker import NullLocator + +from detector.models import * +from utils import * + + +Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor + + +class DetectorYOLOv3: + def __init__(self, + model_def='config/yolov3.cfg', + weights_path='../weights/yolov3.weights', + conf_thres=0.8, + nms_thres=0.4, + img_size=416): + self.model_def = model_def + self.weights_path = weights_path + self.img_size = img_size + self.conf_thres = conf_thres + self.nms_thres = nms_thres + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = Darknet(self.model_def, img_size=self.img_size).to(device) + + if self.weights_path.endswith(".weights"): + # Load darknet weights + self.model.load_darknet_weights(self.weights_path) + else: + # Load checkpoint weights + self.model.load_state_dict(torch.load(self.weights_path)) + + self.model.eval() # Set in evaluation mode + + def detect_from_image(self, img): + input_img = preprocess_image(img) + + # Configure input + input_img = Variable(input_img.type(Tensor)) + + # Get detections + with torch.no_grad(): + detections = self.model(input_img) + detections = non_max_suppression(detections, self.conf_thres, self.nms_thres)[0] + if detections is None: + return [] + else: + detections = detections.data.cpu().numpy() + + # Draw bounding boxes and labels of detections + human_candidates = [] + if detections is not None: + # Rescale boxes to original img + detections = rescale_boxes(detections, self.img_size, img.shape[:2]) + + for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections: + box_w = x2 - x1 + box_h = y2 - y1 + + if int(cls_pred) == 0: + human_candidate = [x1, y1, box_w, box_h] + human_candidates.append(human_candidate) + return human_candidates + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file") + parser.add_argument("--weights_path", type=str, default="../../models/yolov3.weights", + help="path to weights file") + 
parser.add_argument("--conf_thres", type=float, default=0.8, help="object confidence threshold") + parser.add_argument("--nms_thres", type=float, default=0.4, help="iou threshold for non-maximum suppression") + opt = parser.parse_args() + + detector = DetectorYOLOv3(**vars(opt)) + + img = np.array(Image.open('../data/samples/messi.jpg')) + human_candidates = detector.detect_from_image(img) + + # Create plot + plt.figure() + fig, ax = plt.subplots(1) + ax.imshow(img) + + for x1, y1, box_w, box_h in human_candidates: + bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=(1, 0, 0), facecolor="none") + # Add the bbox to the plot + ax.add_patch(bbox) + + plt.axis("off") + plt.gca().xaxis.set_major_locator(NullLocator()) + plt.gca().yaxis.set_major_locator(NullLocator()) + plt.show() diff --git a/src/detector/models.py b/src/detector/models.py new file mode 100644 index 0000000..5da98fc --- /dev/null +++ b/src/detector/models.py @@ -0,0 +1,340 @@ +from __future__ import division + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from utils import build_targets, to_cpu, parse_model_config + + +def create_modules(module_defs): + """ + Constructs module list of layer blocks from module configuration in module_defs + """ + hyperparams = module_defs.pop(0) + output_filters = [int(hyperparams["channels"])] + module_list = nn.ModuleList() + for module_i, module_def in enumerate(module_defs): + modules = nn.Sequential() + + if module_def["type"] == "convolutional": + bn = int(module_def["batch_normalize"]) + filters = int(module_def["filters"]) + kernel_size = int(module_def["size"]) + pad = (kernel_size - 1) // 2 + modules.add_module( + f"conv_{module_i}", + nn.Conv2d( + in_channels=output_filters[-1], + out_channels=filters, + kernel_size=kernel_size, + stride=int(module_def["stride"]), + padding=pad, + bias=not bn, + ), + ) + if bn: + modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5)) + if module_def["activation"] == "leaky": + modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1)) + + elif module_def["type"] == "maxpool": + kernel_size = int(module_def["size"]) + stride = int(module_def["stride"]) + if kernel_size == 2 and stride == 1: + modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1))) + maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) + modules.add_module(f"maxpool_{module_i}", maxpool) + + elif module_def["type"] == "upsample": + upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest") + modules.add_module(f"upsample_{module_i}", upsample) + + elif module_def["type"] == "route": + layers = [int(x) for x in module_def["layers"].split(",")] + filters = sum([output_filters[1:][i] for i in layers]) + modules.add_module(f"route_{module_i}", EmptyLayer()) + + elif module_def["type"] == "shortcut": + filters = output_filters[1:][int(module_def["from"])] + modules.add_module(f"shortcut_{module_i}", EmptyLayer()) + + elif module_def["type"] == "yolo": + anchor_idxs = [int(x) for x in module_def["mask"].split(",")] + # Extract anchors + anchors = [int(x) for x in module_def["anchors"].split(",")] + anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors = [anchors[i] for i in anchor_idxs] + num_classes = int(module_def["classes"]) + img_size = int(hyperparams["height"]) + # Define detection layer + yolo_layer = YOLOLayer(anchors, num_classes, img_size) + 
modules.add_module(f"yolo_{module_i}", yolo_layer) + # Register module list and number of output filters + module_list.append(modules) + output_filters.append(filters) + + return hyperparams, module_list + + +class Upsample(nn.Module): + """ nn.Upsample is deprecated """ + + def __init__(self, scale_factor, mode="nearest"): + super(Upsample, self).__init__() + self.scale_factor = scale_factor + self.mode = mode + + def forward(self, x): + x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) + return x + + +class EmptyLayer(nn.Module): + """Placeholder for 'route' and 'shortcut' layers""" + + def __init__(self): + super(EmptyLayer, self).__init__() + + +class YOLOLayer(nn.Module): + """Detection layer""" + + def __init__(self, anchors, num_classes, img_dim=416): + super(YOLOLayer, self).__init__() + self.anchors = anchors + self.num_anchors = len(anchors) + self.num_classes = num_classes + self.ignore_thres = 0.5 + self.mse_loss = nn.MSELoss() + self.bce_loss = nn.BCELoss() + self.obj_scale = 1 + self.noobj_scale = 100 + self.metrics = {} + self.img_dim = img_dim + self.grid_size = 0 # grid size + + def compute_grid_offsets(self, grid_size, cuda=True): + self.grid_size = grid_size + g = self.grid_size + FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor + self.stride = self.img_dim / self.grid_size + # Calculate offsets for each grid + self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor) + self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor) + self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors]) + self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1)) + self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1)) + + def forward(self, x, targets=None, img_dim=None): + + # Tensors for cuda support + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor + + self.img_dim = img_dim + num_samples = x.size(0) + grid_size = x.size(2) + + prediction = ( + x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) + + # Get outputs + x = torch.sigmoid(prediction[..., 0]) # Center x + y = torch.sigmoid(prediction[..., 1]) # Center y + w = prediction[..., 2] # Width + h = prediction[..., 3] # Height + pred_conf = torch.sigmoid(prediction[..., 4]) # Conf + pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. 
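+        # Per anchor and grid cell, the prediction is laid out as
+        # (tx, ty, tw, th, objectness, class scores).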
+ + # If grid size does not match current we compute new offsets + if grid_size != self.grid_size: + self.compute_grid_offsets(grid_size, cuda=x.is_cuda) + + # Add offset and scale with anchors + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x.data + self.grid_x + pred_boxes[..., 1] = y.data + self.grid_y + pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w + pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h + + output = torch.cat( + ( + pred_boxes.view(num_samples, -1, 4) * self.stride, + pred_conf.view(num_samples, -1, 1), + pred_cls.view(num_samples, -1, self.num_classes), + ), + -1, + ) + + if targets is None: + return output, 0 + else: + iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets( + pred_boxes=pred_boxes, + pred_cls=pred_cls, + target=targets, + anchors=self.scaled_anchors, + ignore_thres=self.ignore_thres, + ) + + # Loss : Mask outputs to ignore non-existing objects (except with conf. loss) + loss_x = self.mse_loss(x[obj_mask], tx[obj_mask]) + loss_y = self.mse_loss(y[obj_mask], ty[obj_mask]) + loss_w = self.mse_loss(w[obj_mask], tw[obj_mask]) + loss_h = self.mse_loss(h[obj_mask], th[obj_mask]) + loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask]) + loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask]) + loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj + loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask]) + total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + + # Metrics + cls_acc = 100 * class_mask[obj_mask].mean() + conf_obj = pred_conf[obj_mask].mean() + conf_noobj = pred_conf[noobj_mask].mean() + conf50 = (pred_conf > 0.5).float() + iou50 = (iou_scores > 0.5).float() + iou75 = (iou_scores > 0.75).float() + detected_mask = conf50 * class_mask * tconf + precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16) + recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16) + recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16) + + self.metrics = { + "loss": to_cpu(total_loss).item(), + "x": to_cpu(loss_x).item(), + "y": to_cpu(loss_y).item(), + "w": to_cpu(loss_w).item(), + "h": to_cpu(loss_h).item(), + "conf": to_cpu(loss_conf).item(), + "cls": to_cpu(loss_cls).item(), + "cls_acc": to_cpu(cls_acc).item(), + "recall50": to_cpu(recall50).item(), + "recall75": to_cpu(recall75).item(), + "precision": to_cpu(precision).item(), + "conf_obj": to_cpu(conf_obj).item(), + "conf_noobj": to_cpu(conf_noobj).item(), + "grid_size": grid_size, + } + + return output, total_loss + + +class Darknet(nn.Module): + """YOLOv3 object detection model""" + + def __init__(self, config_path, img_size=416): + super(Darknet, self).__init__() + self.module_defs = parse_model_config(config_path) + self.hyperparams, self.module_list = create_modules(self.module_defs) + self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")] + self.img_size = img_size + self.seen = 0 + self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32) + + def forward(self, x, targets=None): + img_dim = x.shape[2] + loss = 0 + layer_outputs, yolo_outputs = [], [] + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if module_def["type"] in ["convolutional", "upsample", "maxpool"]: + x = module(x) + elif module_def["type"] == "route": + x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1) + elif 
module_def["type"] == "shortcut": + layer_i = int(module_def["from"]) + x = layer_outputs[-1] + layer_outputs[layer_i] + elif module_def["type"] == "yolo": + x, layer_loss = module[0](x, targets, img_dim) + loss += layer_loss + yolo_outputs.append(x) + layer_outputs.append(x) + yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1)) + return yolo_outputs if targets is None else (loss, yolo_outputs) + + def load_darknet_weights(self, weights_path): + """Parses and loads the weights stored in 'weights_path'""" + + # Open the weights file + with open(weights_path, "rb") as f: + header = np.fromfile(f, dtype=np.int32, count=5) # First five are header values + self.header_info = header # Needed to write header when saving weights + self.seen = header[3] # number of images seen during training + weights = np.fromfile(f, dtype=np.float32) # The rest are weights + + # Establish cutoff for loading backbone weights + cutoff = None + if "darknet53.conv.74" in weights_path: + cutoff = 75 + + ptr = 0 + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if i == cutoff: + break + if module_def["type"] == "convolutional": + conv_layer = module[0] + if module_def["batch_normalize"]: + # Load BN bias, weights, running mean and running variance + bn_layer = module[1] + num_b = bn_layer.bias.numel() # Number of biases + # Bias + bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) + bn_layer.bias.data.copy_(bn_b) + ptr += num_b + # Weight + bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) + bn_layer.weight.data.copy_(bn_w) + ptr += num_b + # Running Mean + bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) + bn_layer.running_mean.data.copy_(bn_rm) + ptr += num_b + # Running Var + bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) + bn_layer.running_var.data.copy_(bn_rv) + ptr += num_b + else: + # Load conv. bias + num_b = conv_layer.bias.numel() + conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) + conv_layer.bias.data.copy_(conv_b) + ptr += num_b + # Load conv. 
weights + num_w = conv_layer.weight.numel() + conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) + conv_layer.weight.data.copy_(conv_w) + ptr += num_w + + def save_darknet_weights(self, path, cutoff=-1): + """ + @:param path - path of the new weights file + @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) + """ + fp = open(path, "wb") + self.header_info[3] = self.seen + self.header_info.tofile(fp) + + # Iterate through layers + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def["type"] == "convolutional": + conv_layer = module[0] + # If batch norm, load bn first + if module_def["batch_normalize"]: + bn_layer = module[1] + bn_layer.bias.data.cpu().numpy().tofile(fp) + bn_layer.weight.data.cpu().numpy().tofile(fp) + bn_layer.running_mean.data.cpu().numpy().tofile(fp) + bn_layer.running_var.data.cpu().numpy().tofile(fp) + # Load conv bias + else: + conv_layer.bias.data.cpu().numpy().tofile(fp) + # Load conv weights + conv_layer.weight.data.cpu().numpy().tofile(fp) + + fp.close() diff --git a/src/detector/utils.py b/src/detector/utils.py new file mode 100644 index 0000000..7a01c4d --- /dev/null +++ b/src/detector/utils.py @@ -0,0 +1,239 @@ +import torch +import numpy as np +import cv2 + + +def pad_to_square(img, pad_value): + h, w, _ = img.shape + dim_diff = np.abs(h - w) + # Upper (left) and lower (right) padding + pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2 + # Determine padding + pad = ((pad1, pad2), (0, 0), (0, 0)) if h <= w else ((0, 0), (pad1, pad2), (0, 0)) + # Add padding + img = np.pad(img, pad, "constant", constant_values=pad_value) + return img, pad + + +def preprocess_image(img, img_size=416): + input_img, _ = pad_to_square(np.array(img), 127.5) + # Resize + input_img = cv2.resize( + input_img, (img_size, img_size), interpolation=cv2.INTER_AREA + ) + # Channels-first + input_img = np.transpose(input_img, (2, 0, 1)) + + # extend one dimension + input_img = np.expand_dims(input_img, axis=0) + + # As pytorch tensor + input_img = torch.from_numpy(input_img).float() / 255.0 + return input_img + + +def parse_model_config(path): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + file = open(path, 'r') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + module_defs = [] + for line in lines: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + module_defs[-1][key.rstrip()] = value.strip() + + return module_defs + + +def parse_data_config(path): + """Parses the data configuration file""" + options = dict() + options['gpus'] = '0,1,2,3' + options['num_workers'] = '10' + with open(path, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + key, value = line.split('=') + options[key.strip()] = value.strip() + return options + + +def xywh2xyxy(x): + y = x.new(x.shape) + y[..., 0] = x[..., 0] - x[..., 2] / 2 + y[..., 1] = x[..., 1] - x[..., 3] / 2 + y[..., 2] = x[..., 0] + x[..., 2] / 2 + y[..., 3] = x[..., 1] + x[..., 3] / 2 + return y + + +def bbox_wh_iou(wh1, wh2): + wh2 = wh2.t() + w1, h1 = 
wh1[0], wh1[1] + w2, h2 = wh2[0], wh2[1] + inter_area = torch.min(w1, w2) * torch.min(h1, h2) + union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area + return inter_area / union_area + + +def bbox_iou(box1, box2, x1y1x2y2=True): + """ + Returns the IoU of two bounding boxes + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # get the corrdinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1, b2_x1) + inter_rect_y1 = torch.max(b1_y1, b2_y1) + inter_rect_x2 = torch.min(b1_x2, b2_x2) + inter_rect_y2 = torch.min(b1_y2, b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp( + inter_rect_y2 - inter_rect_y1 + 1, min=0 + ) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + +def to_cpu(tensor): + return tensor.detach().cpu() + + +def rescale_boxes(boxes, current_dim, original_shape): + """ Rescales bounding boxes to the original shape """ + orig_h, orig_w = original_shape + # The amount of padding that was added + pad_x = max(orig_h - orig_w, 0) * (current_dim / max(original_shape)) + pad_y = max(orig_w - orig_h, 0) * (current_dim / max(original_shape)) + # Image height and width after padding is removed + unpad_h = current_dim - pad_y + unpad_w = current_dim - pad_x + # Rescale bounding boxes to dimension of original img + boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h + boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h + return boxes + + +def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4): + """ + Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
+ Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + """ + + # From (center x, center y, width, height) to (x1, y1, x2, y2) + prediction[..., :4] = xywh2xyxy(prediction[..., :4]) + output = [None for _ in range(len(prediction))] + for image_i, image_pred in enumerate(prediction): + # Filter out confidence scores below threshold + image_pred = image_pred[image_pred[:, 4] >= conf_thres] + # If none are remaining => process next img + if not image_pred.size(0): + continue + # Object confidence times class confidence + score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0] + # Sort by it + image_pred = image_pred[(-score).argsort()] + class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True) + detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1) + # Perform non-maximum suppression + keep_boxes = [] + while detections.size(0): + large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres + label_match = detections[0, -1] == detections[:, -1] + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + weights = detections[invalid, 4:5] + # Merge overlapping bboxes by order of confidence + detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum() + keep_boxes += [detections[0]] + detections = detections[~invalid] + if keep_boxes: + output[image_i] = torch.stack(keep_boxes) + + return output + + +def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres): + + ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor + FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor + + nB = pred_boxes.size(0) + nA = pred_boxes.size(1) + nC = pred_cls.size(-1) + nG = pred_boxes.size(2) + + # Output tensors + obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0) + noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1) + class_mask = FloatTensor(nB, nA, nG, nG).fill_(0) + iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0) + tx = FloatTensor(nB, nA, nG, nG).fill_(0) + ty = FloatTensor(nB, nA, nG, nG).fill_(0) + tw = FloatTensor(nB, nA, nG, nG).fill_(0) + th = FloatTensor(nB, nA, nG, nG).fill_(0) + tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0) + + # Convert to position relative to box + target_boxes = target[:, 2:6] * nG + gxy = target_boxes[:, :2] + gwh = target_boxes[:, 2:] + # Get anchors with best iou + ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors]) + best_ious, best_n = ious.max(0) + # Separate target values + b, target_labels = target[:, :2].long().t() + gx, gy = gxy.t() + gw, gh = gwh.t() + gi, gj = gxy.long().t() + # Set masks + obj_mask[b, best_n, gj, gi] = 1 + noobj_mask[b, best_n, gj, gi] = 0 + + # Set noobj mask to zero where iou exceeds ignore threshold + for i, anchor_ious in enumerate(ious.t()): + noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0 + + # Coordinates + tx[b, best_n, gj, gi] = gx - gx.floor() + ty[b, best_n, gj, gi] = gy - gy.floor() + # Width and height + tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16) + th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16) + # One-hot encoding of label + tcls[b, best_n, gj, gi, target_labels] = 1 + # Compute label correctness and iou at best anchor + class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float() + iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, 
x1y1x2y2=False) + + tconf = obj_mask.float() + return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf diff --git a/src/evaluate.py b/src/evaluate.py new file mode 100644 index 0000000..5890cd3 --- /dev/null +++ b/src/evaluate.py @@ -0,0 +1,192 @@ +import sys +import time + +import numpy as np +import pandas +import torch +from torchvision import transforms +from torch.utils.data import DataLoader + +from common import get_model_resgcn +from utils import AverageMeter +from datasets import dataset_factory +from datasets.augmentation import ShuffleSequence, SelectSequenceCenter, ToTensor, MultiInput +from datasets.graph import Graph + + +def _evaluate_casia_b(embeddings): + """ + Test dataset consists of sequences of last 50 ids from CASIA B Dataset. + Data is divided in the following way: + Gallery Set: + NM 1, NM 2, NM 3, NM 4 + Probe Set: + Subset 1: + NM 5, NM 6 + Subset 2: + BG 1, BG 2 + Subset 3: + CL 1, CL 2 + """ + + gallery = {k: v for (k, v) in embeddings.items() if k[1] == 0 and k[2] <= 4} + gallery_per_angle = {} + for angle in range(0, 181, 18): + gallery_per_angle[angle] = {k: v for (k, v) in gallery.items() if k[3] == angle} + + probe_nm = {k: v for (k, v) in embeddings.items() if k[1] == 0 and k[2] >= 5} + probe_bg = {k: v for (k, v) in embeddings.items() if k[1] == 1} + probe_cl = {k: v for (k, v) in embeddings.items() if k[1] == 2} + + correct = np.zeros((3, 11, 11)) + total = np.zeros((3, 11, 11)) + for gallery_angle in range(0, 181, 18): + gallery_embeddings = np.array(list(gallery_per_angle[gallery_angle].values())) + gallery_targets = list(gallery_per_angle[gallery_angle].keys()) + gallery_pos = int(gallery_angle / 18) + + probe_num = 0 + for probe in [probe_nm, probe_bg, probe_cl]: + for (target, embedding) in probe.items(): + subject_id, _, _, probe_angle = target + probe_pos = int(probe_angle / 18) + + distance = np.linalg.norm(gallery_embeddings - embedding, ord=2, axis=1) + min_pos = np.argmin(distance) + min_target = gallery_targets[int(min_pos)] + + if min_target[0] == subject_id: + correct[probe_num, gallery_pos, probe_pos] += 1 + total[probe_num, gallery_pos, probe_pos] += 1 + + probe_num += 1 + + accuracy = correct / total + + # Exclude same view + for i in range(3): + accuracy[i] -= np.diag(np.diag(accuracy[i])) + + accuracy_flat = np.sum(accuracy, 1) / 10 + + header = ["NM#5-6", "BG#1-2", "CL#1-2"] + + accuracy_avg = np.mean(accuracy) + sub_accuracies_avg = np.mean(accuracy_flat, 1) + sub_accuracies = dict(zip(header, list(sub_accuracies_avg))) + + dataframe = pandas.DataFrame( + np.concatenate((accuracy_flat, sub_accuracies_avg[..., np.newaxis]), 1), + header, + list(range(0, 181, 18)) + ["mean"], + ) + + return correct, accuracy_avg, sub_accuracies, dataframe + + +def evaluate(data_loader, model, evaluation_fn, log_interval=10, use_flip=False): + model.eval() + batch_time = AverageMeter() + + # Calculate embeddings + with torch.no_grad(): + end = time.time() + embeddings = dict() + for idx, (points, target) in enumerate(data_loader): + if use_flip: + bsz = points.shape[0] + data_flipped = torch.flip(points, dims=[1]) + points = torch.cat([points, data_flipped], dim=0) + + if torch.cuda.is_available(): + points = points.cuda(non_blocking=True) + + output = model(points) + + if use_flip: + f1, f2 = torch.split(output, [bsz, bsz], dim=0) + output = torch.mean(torch.stack([f1, f2]), dim=0) + + for i in range(output.shape[0]): + sequence = tuple( + int(t[i]) if type(t[i]) is torch.Tensor else t[i] for t in target + ) + 
embeddings[sequence] = output[i].cpu().numpy() + + batch_time.update(time.time() - end) + end = time.time() + + if idx % log_interval == 0: + print( + f"Test: [{idx}/{len(data_loader)}]\t" + f"Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + ) + sys.stdout.flush() + + return evaluation_fn(embeddings) + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Evaluate model on dataset") + parser.add_argument("dataset", choices=["casia-b"]) + parser.add_argument("weights_path") + parser.add_argument("data_path") + parser.add_argument("--network_name", default="resgcn-n39-r8") + parser.add_argument("--sequence_length", type=int, default=60) + parser.add_argument("--batch_size", type=int, default=256) + parser.add_argument("--embedding_layer_size", type=int, default=128) + parser.add_argument("--use_multi_branch", action="store_true") + parser.add_argument("--shuffle", action="store_true") + + opt = parser.parse_args() + + # Config for dataset + graph = Graph("coco") + dataset_class = dataset_factory(opt.dataset) + evaluation_fn = None + if opt.dataset == "casia-b": + evaluation_fn = _evaluate_casia_b + + # Load data + dataset = dataset_class( + opt.data_path, + train=False, + sequence_length=opt.sequence_length, + transform=transforms.Compose( + [ + SelectSequenceCenter(opt.sequence_length), + ShuffleSequence(opt.shuffle), + MultiInput(graph.connect_joint, opt.use_multi_branch), + ToTensor() + ] + ), + ) + data_loader = DataLoader(dataset, batch_size=opt.batch_size) + print(f"Data loaded: {len(data_loader)} batches") + + # Init model + model = get_model_resgcn(graph, opt) + + if torch.cuda.is_available(): + model.cuda() + + # Load weights + checkpoint = torch.load(opt.weights_path) + model.load_state_dict(checkpoint["model"]) + + result, accuracy_avg, sub_accuracies, dataframe = evaluate( + data_loader, model, evaluation_fn, use_flip=True + ) + + print("\n") + print((dataframe * 100).round(2)) + print(f"AVG: {accuracy_avg*100} %") + print("=================================") + print((dataframe * 100).round(1).to_latex()) + print((dataframe * 100).round(1).to_markdown()) + + +if __name__ == "__main__": + main() diff --git a/src/losses.py b/src/losses.py new file mode 100644 index 0000000..e643b8d --- /dev/null +++ b/src/losses.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn + +""" +Author: Yonglong Tian (yonglong@mit.edu) +Date: May 07, 2020 +""" + + +class SupConLoss(nn.Module): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + It also supports the unsupervised contrastive loss in SimCLR""" + def __init__(self, temperature=0.07, contrast_mode='all', + base_temperature=0.07): + super(SupConLoss, self).__init__() + self.temperature = temperature + self.contrast_mode = contrast_mode + self.base_temperature = base_temperature + + def forward(self, features, labels=None, mask=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. 
+ """ + device = (torch.device('cuda') + if features.is_cuda + else torch.device('cpu')) + + if len(features.shape) < 3: + raise ValueError('`features` needs to be [bsz, n_views, ...],' + 'at least 3 dimensions are required') + if len(features.shape) > 3: + features = features.view(features.shape[0], features.shape[1], -1) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError('Cannot define both `labels` and `mask`') + elif labels is None and mask is None: + mask = torch.eye(batch_size, dtype=torch.float32).to(device) + elif labels is not None: + labels = labels.contiguous().view(-1, 1) + if labels.shape[0] != batch_size: + raise ValueError('Num of labels does not match num of features') + mask = torch.eq(labels, labels.T).float().to(device) + else: + mask = mask.float().to(device) + + contrast_count = features.shape[1] + contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) + if self.contrast_mode == 'one': + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == 'all': + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) + + # compute logits + anchor_dot_contrast = torch.div( + torch.matmul(anchor_feature, contrast_feature.T), + self.temperature) + # for numerical stability + logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # tile mask + mask = mask.repeat(anchor_count, contrast_count) + # mask-out self-contrast cases + logits_mask = torch.scatter( + torch.ones_like(mask), + 1, + torch.arange(batch_size * anchor_count).view(-1, 1).to(device), + 0 + ) + mask = mask * logits_mask + + # compute log_prob + exp_logits = torch.exp(logits) * logits_mask + log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)) + + # compute mean of log-likelihood over positive + mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) + + # loss + loss = - (self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.view(anchor_count, batch_size).mean() + + return loss diff --git a/src/models/README.md b/src/models/README.md new file mode 100644 index 0000000..5d6fe0e --- /dev/null +++ b/src/models/README.md @@ -0,0 +1,4 @@ + + +__Spatial Temporal Graph Convolutional Networks (ST-GCN) for Skeleton-Based Action Recognition__ +ST-GCN network is borrowed from [yysijie/st-gcn](https://github.com/yysijie/st-gcn) diff --git a/src/models/ResGCNv1/__init__.py b/src/models/ResGCNv1/__init__.py new file mode 100644 index 0000000..a8411b6 --- /dev/null +++ b/src/models/ResGCNv1/__init__.py @@ -0,0 +1,56 @@ +import logging + +from . 
import blocks +from .nets import ResGCN +from .modules import ResGCN_Module, AttGCN_Module +from .attentions import * + + +__model = { + 'resgcn': ResGCN, +} + +__attention = { + 'pa': Part_Att, + 'ca': Channel_Att, + 'fa': Frame_Att, + 'ja': Joint_Att, + 'pca': Part_Conv_Att, + 'psa': Part_Share_Att, +} + +__structure = { + 'b15': {'structure': [1,2,2,2], 'block': 'Basic'}, + 'b19': {'structure': [1,2,3,3], 'block': 'Basic'}, + 'b23': {'structure': [1,3,4,3], 'block': 'Basic'}, + 'b29': {'structure': [1,3,6,4], 'block': 'Basic'}, + 'n39': {'structure': [1,2,2,2], 'block': 'Bottleneck'}, + 'n51': {'structure': [1,2,3,3], 'block': 'Bottleneck'}, + 'n57': {'structure': [1,3,4,3], 'block': 'Bottleneck'}, + 'n75': {'structure': [1,3,6,4], 'block': 'Bottleneck'}, +} + +__reduction = { + 'r1': {'reduction': 1}, + 'r2': {'reduction': 2}, + 'r4': {'reduction': 4}, + 'r8': {'reduction': 8}, +} + + +def create(model_type, **kwargs): + model_split = model_type.split('-') + if model_split[0] in __attention.keys(): + kwargs.update({'module': AttGCN_Module, 'attention': __attention[model_split[0]]}) + del(model_split[0]) + else: + kwargs.update({'module': ResGCN_Module, 'attention': None}) + try: + [model, structure, reduction] = model_split + except: + [model, structure], reduction = model_split, 'r1' + if not (model in __model.keys() and structure in __structure.keys() and reduction in __reduction.keys()): + logging.info('') + logging.error('Error: Do NOT exist this model_type: {}!'.format(model_type)) + raise ValueError() + return __model[model](**(__structure[structure]), **(__reduction[reduction]), **kwargs) diff --git a/src/models/ResGCNv1/attentions.py b/src/models/ResGCNv1/attentions.py new file mode 100644 index 0000000..4dc903b --- /dev/null +++ b/src/models/ResGCNv1/attentions.py @@ -0,0 +1,187 @@ +import torch +from torch import nn + + +class Part_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Part_Att, self).__init__() + + self.parts = parts + self.joints = get_corr_joints(parts) + + inter_channel = channel // 4 + + self.fcn = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channel, channel*len(self.parts), kernel_size=1), + ) + + self.softmax = nn.Softmax(dim=-1) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + N, C, T, V = x.size() + res = x + + x_att = self.softmax(self.fcn(x).view(N, C, len(self.parts))) + x_att = torch.split(x_att, 1, dim=-1) + x_att = [x_att[self.joints[i]].expand_as(x[:,:,:,i]) for i in range(V)] + x_att = torch.stack(x_att, dim=-1) + return self.relu(self.bn(x * x_att) + res) + + +class Part_Share_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Part_Share_Att, self).__init__() + + self.parts = parts + self.joints = get_corr_joints(parts) + + inter_channel = channel // 4 + + self.part_pool = nn.Sequential( + nn.Conv2d(channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.AdaptiveAvgPool2d(1), + ) + + self.fcn = nn.Sequential( + nn.Conv2d(inter_channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channel, channel*len(self.parts), kernel_size=1), + ) + + self.softmax = nn.Softmax(dim=-1) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + N, C, T, V = x.size() + res = x + + x_split = 
[self.part_pool(x[:,:,:,part]) for part in self.parts] + x_att = self.softmax(self.fcn(sum(x_split)).view(N, C, len(self.parts))) + x_att = torch.split(x_att, 1, dim=-1) + x_att = [x_att[self.joints[i]].expand_as(x[:,:,:,i]) for i in range(V)] + x_att = torch.stack(x_att, dim=-1) + return self.relu(self.bn(x * x_att) + res) + + +class Part_Conv_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Part_Conv_Att, self).__init__() + + self.parts = parts + self.joints = get_corr_joints(parts) + + inter_channel = channel // 4 + + self.part_pool = nn.ModuleList([nn.Sequential( + nn.Conv2d(channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.AdaptiveAvgPool2d(1), + ) for _ in range(len(self.parts))]) + + self.fcn = nn.Sequential( + nn.Conv2d(inter_channel, inter_channel, kernel_size=1), + nn.BatchNorm2d(inter_channel), + nn.ReLU(inplace=True), + nn.Conv2d(inter_channel, channel*len(self.parts), kernel_size=1), + ) + + self.softmax = nn.Softmax(dim=-1) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + N, C, T, V = x.size() + res = x + + x_split = [pool(x[:,:,:,part]) for part, pool in zip(self.parts, self.part_pool)] + x_att = self.softmax(self.fcn(sum(x_split)).view(N, C, len(self.parts))) + x_att = torch.split(x_att, 1, dim=-1) + x_att = [x_att[self.joints[i]].expand_as(x[:,:,:,i]) for i in range(V)] + x_att = torch.stack(x_att, dim=-1) + return self.relu(self.bn(x * x_att) + res) + + +class Channel_Att(nn.Module): + def __init__(self, channel, **kwargs): + super(Channel_Att, self).__init__() + + self.fcn = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(channel, channel//4, kernel_size=1), + nn.BatchNorm2d(channel//4), + nn.ReLU(inplace=True), + nn.Conv2d(channel//4, channel, kernel_size=1), + nn.Softmax(dim=1) + ) + + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + res = x + x_att = self.fcn(x).squeeze() + return self.relu(self.bn(x * x_att[:, :, None, None]) + res) + + +class Frame_Att(nn.Module): + def __init__(self, channel, **kwargs): + super(Frame_Att, self).__init__() + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + self.conv = nn.Conv2d(2, 1, kernel_size=(9,1), padding=(4,0)) + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + res = x + x_avg = torch.transpose(self.avg_pool(torch.transpose(x, 1, 2)), 1, 2) + x_max = torch.transpose(self.max_pool(torch.transpose(x, 1, 2)), 1, 2) + x_att = self.conv(torch.cat([x_avg, x_max], dim=1)).squeeze() + return self.relu(self.bn(x * x_att[:, None, :, None]) + res) + + +class Joint_Att(nn.Module): + def __init__(self, channel, parts, **kwargs): + super(Joint_Att, self).__init__() + + num_joint = sum([len(part) for part in parts]) + + self.fcn = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(num_joint, num_joint//2, kernel_size=1), + nn.BatchNorm2d(num_joint//2), + nn.ReLU(inplace=True), + nn.Conv2d(num_joint//2, num_joint, kernel_size=1), + nn.Softmax(dim=1) + ) + + self.bn = nn.BatchNorm2d(channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + res = x + x_att = self.fcn(torch.transpose(x, 1, 3)).squeeze() + return self.relu(self.bn(x * x_att[:, None, None, :]) + res) + + +def get_corr_joints(parts): + num_joints = max([max(part) for part in parts]) + 1 + res = [] + for i in range(num_joints): + for j in range(len(parts)): + if i in parts[j]: + res.append(j) + break + 
return torch.Tensor(res).long() diff --git a/src/models/ResGCNv1/blocks.py b/src/models/ResGCNv1/blocks.py new file mode 100644 index 0000000..14bb411 --- /dev/null +++ b/src/models/ResGCNv1/blocks.py @@ -0,0 +1,175 @@ +import torch +from torch import nn + + +class Spatial_Bottleneck_Block(nn.Module): + def __init__(self, in_channels, out_channels, max_graph_distance, residual=False, reduction=4, **kwargs): + super(Spatial_Bottleneck_Block, self).__init__() + + inter_channels = out_channels // reduction + + if not residual: + self.residual = lambda x: 0 + elif in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels), + ) + + self.conv_down = nn.Conv2d(in_channels, inter_channels, 1) + self.bn_down = nn.BatchNorm2d(inter_channels) + self.conv = SpatialGraphConv(inter_channels, inter_channels, max_graph_distance) + self.bn = nn.BatchNorm2d(inter_channels) + self.conv_up = nn.Conv2d(inter_channels, out_channels, 1) + self.bn_up = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, A): + + res_block = self.residual(x) + + x = self.conv_down(x) + x = self.bn_down(x) + x = self.relu(x) + + x = self.conv(x, A) + x = self.bn(x) + x = self.relu(x) + + x = self.conv_up(x) + x = self.bn_up(x) + x = self.relu(x + res_block) + + return x + + +class Temporal_Bottleneck_Block(nn.Module): + def __init__(self, channels, temporal_window_size, stride=1, residual=False, reduction=4, **kwargs): + super(Temporal_Bottleneck_Block, self).__init__() + + padding = ((temporal_window_size - 1) // 2, 0) + inter_channels = channels // reduction + + if not residual: + self.residual = lambda x: 0 + elif stride == 1: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(channels, channels, 1, (stride,1)), + nn.BatchNorm2d(channels), + ) + + self.conv_down = nn.Conv2d(channels, inter_channels, 1) + self.bn_down = nn.BatchNorm2d(inter_channels) + self.conv = nn.Conv2d(inter_channels, inter_channels, (temporal_window_size,1), (stride,1), padding) + self.bn = nn.BatchNorm2d(inter_channels) + self.conv_up = nn.Conv2d(inter_channels, channels, 1) + self.bn_up = nn.BatchNorm2d(channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, res_module): + + res_block = self.residual(x) + + x = self.conv_down(x) + x = self.bn_down(x) + x = self.relu(x) + + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + + x = self.conv_up(x) + x = self.bn_up(x) + x = self.relu(x + res_block + res_module) + + return x + + +class Spatial_Basic_Block(nn.Module): + def __init__(self, in_channels, out_channels, max_graph_distance, residual=False, **kwargs): + super(Spatial_Basic_Block, self).__init__() + + if not residual: + self.residual = lambda x: 0 + elif in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels), + ) + + self.conv = SpatialGraphConv(in_channels, out_channels, max_graph_distance) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, A): + + res_block = self.residual(x) + + x = self.conv(x, A) + x = self.bn(x) + x = self.relu(x + res_block) + + return x + + +class Temporal_Basic_Block(nn.Module): + def __init__(self, channels, temporal_window_size, stride=1, residual=False, **kwargs): + super(Temporal_Basic_Block, self).__init__() + + padding = ((temporal_window_size - 1) // 2, 
0) + + if not residual: + self.residual = lambda x: 0 + elif stride == 1: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(channels, channels, 1, (stride,1)), + nn.BatchNorm2d(channels), + ) + + self.conv = nn.Conv2d(channels, channels, (temporal_window_size,1), (stride,1), padding) + self.bn = nn.BatchNorm2d(channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, res_module): + + res_block = self.residual(x) + + x = self.conv(x) + x = self.bn(x) + x = self.relu(x + res_block + res_module) + + return x + + +# Thanks to YAN Sijie for the released code on Github (https://github.com/yysijie/st-gcn) +class SpatialGraphConv(nn.Module): + def __init__(self, in_channels, out_channels, max_graph_distance): + super(SpatialGraphConv, self).__init__() + + # spatial class number (distance = 0 for class 0, distance = 1 for class 1, ...) + self.s_kernel_size = max_graph_distance + 1 + + # weights of different spatial classes + self.gcn = nn.Conv2d(in_channels, out_channels*self.s_kernel_size, 1) + + def forward(self, x, A): + + # numbers in same class have same weight + x = self.gcn(x) + + # divide nodes into different classes + n, kc, t, v = x.size() + x = x.view(n, self.s_kernel_size, kc//self.s_kernel_size, t, v) + + # spatial graph convolution + x = torch.einsum('nkctv,kvw->nctw', (x, A[:self.s_kernel_size])).contiguous() + + return x diff --git a/src/models/ResGCNv1/modules.py b/src/models/ResGCNv1/modules.py new file mode 100644 index 0000000..6b0f996 --- /dev/null +++ b/src/models/ResGCNv1/modules.py @@ -0,0 +1,91 @@ +import logging, torch +from torch import nn + + +def import_class(name): + components = name.split('.') + mod = __import__(components[0]) + for comp in components[1:]: + mod = getattr(mod, comp) + return mod + + +class ResGCN_Module(nn.Module): + def __init__(self, in_channels, out_channels, block, A, initial=False, stride=1, kernel_size=[9,2], **kwargs): + super(ResGCN_Module, self).__init__() + + if not len(kernel_size) == 2: + logging.info('') + logging.error('Error: Please check whether len(kernel_size) == 2') + raise ValueError() + if not kernel_size[0] % 2 == 1: + logging.info('') + logging.error('Error: Please check whether kernel_size[0] % 2 == 1') + raise ValueError() + temporal_window_size, max_graph_distance = kernel_size + + if initial: + module_res, block_res = False, False + elif block == 'Basic': + module_res, block_res = True, False + else: + module_res, block_res = False, True + + if not module_res: + self.residual = lambda x: 0 + elif stride == 1 and in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, (stride,1)), + nn.BatchNorm2d(out_channels), + ) + + spatial_block = import_class('models.ResGCNv1.blocks.Spatial_{}_Block'.format(block)) + temporal_block = import_class('models.ResGCNv1.blocks.Temporal_{}_Block'.format(block)) + self.scn = spatial_block(in_channels, out_channels, max_graph_distance, block_res, **kwargs) + self.tcn = temporal_block(out_channels, temporal_window_size, stride, block_res, **kwargs) + self.edge = nn.Parameter(torch.ones_like(A)) + + def forward(self, x, A): + return self.tcn(self.scn(x, A*self.edge), self.residual(x)) + + +class AttGCN_Module(nn.Module): + def __init__(self, in_channels, out_channels, block, A, attention, stride=1, kernel_size=[9,2], **kwargs): + super(AttGCN_Module, self).__init__() + + if not len(kernel_size) == 2: + logging.info('') + logging.error('Error: Please check whether 
len(kernel_size) == 2') + raise ValueError() + if not kernel_size[0] % 2 == 1: + logging.info('') + logging.error('Error: Please check whether kernel_size[0] % 2 == 1') + raise ValueError() + temporal_window_size, max_graph_distance = kernel_size + + if block == 'Basic': + module_res, block_res = True, False + else: + module_res, block_res = False, True + + if not module_res: + self.residual = lambda x: 0 + elif stride == 1 and in_channels == out_channels: + self.residual = lambda x: x + else: + self.residual = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, (stride,1)), + nn.BatchNorm2d(out_channels), + ) + + spatial_block = import_class('models.ResGCNv1.blocks.Spatial_{}_Block'.format(block)) + temporal_block = import_class('models.ResGCNv1.blocks.Temporal_{}_Block'.format(block)) + self.scn = spatial_block(in_channels, out_channels, max_graph_distance, block_res, **kwargs) + self.tcn = temporal_block(out_channels, temporal_window_size, stride, block_res, **kwargs) + self.att = attention(out_channels, **kwargs) + self.edge = nn.Parameter(torch.ones_like(A)) + + def forward(self, x, A): + return self.att(self.tcn(self.scn(x, A*self.edge), self.residual(x))) diff --git a/src/models/ResGCNv1/nets.py b/src/models/ResGCNv1/nets.py new file mode 100644 index 0000000..a83fd30 --- /dev/null +++ b/src/models/ResGCNv1/nets.py @@ -0,0 +1,104 @@ +import torch +from torch import nn +import torch.nn.functional as F + +from .modules import ResGCN_Module + + +class ResGCN_Input_Branch(nn.Module): + def __init__(self, structure, block, num_channel, A, **kwargs): + super(ResGCN_Input_Branch, self).__init__() + + self.register_buffer('A', A) + + module_list = [ResGCN_Module(num_channel, 64, 'Basic', A, initial=True, **kwargs)] + module_list += [ResGCN_Module(64, 64, 'Basic', A, initial=True, **kwargs) for _ in range(structure[0] - 1)] + module_list += [ResGCN_Module(64, 64, block, A, **kwargs) for _ in range(structure[1] - 1)] + module_list += [ResGCN_Module(64, 32, block, A, **kwargs)] + + self.bn = nn.BatchNorm2d(num_channel) + self.layers = nn.ModuleList(module_list) + + def forward(self, x): + + x = self.bn(x) + for layer in self.layers: + x = layer(x, self.A) + + return x + + +class ResGCN(nn.Module): + def __init__(self, module, structure, block, num_input, num_channel, num_class, A, **kwargs): + super(ResGCN, self).__init__() + + self.register_buffer('A', A) + + # input branches + self.input_branches = nn.ModuleList([ + ResGCN_Input_Branch(structure, block, num_channel, A, **kwargs) + for _ in range(num_input) + ]) + + # main stream + module_list = [module(32*num_input, 128, block, A, stride=2, **kwargs)] + module_list += [module(128, 128, block, A, **kwargs) for _ in range(structure[2] - 1)] + module_list += [module(128, 256, block, A, stride=2, **kwargs)] + module_list += [module(256, 256, block, A, **kwargs) for _ in range(structure[3] - 1)] + self.main_stream = nn.ModuleList(module_list) + + # output + self.global_pooling = nn.AdaptiveAvgPool2d(1) + self.fcn = nn.Linear(256, num_class) + + # init parameters + init_param(self.modules()) + zero_init_lastBN(self.modules()) + + def forward(self, x): + + # N, I, C, T, V = x.size() + + # input branches + x_cat = [] + for i, branch in enumerate(self.input_branches): + x_cat.append(branch(x[:,i,:,:,:])) + x = torch.cat(x_cat, dim=1) + + # main stream + for layer in self.main_stream: + x = layer(x, self.A) + + # output + x = self.global_pooling(x) + x = self.fcn(x.squeeze()) + + # L2 normalization + x = F.normalize(x, dim=1, p=2) + + return x + 
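
A minimal usage sketch of the pieces above, assuming `src/` is on `PYTHONPATH` (as the scripts in this diff expect). The sizes passed to the factory (`num_input=3`, `num_channel=3`, `num_class=128`) are illustrative stand-ins, not necessarily what `get_model_resgcn` in `src/common.py` passes, and the adjacency is built with the ST-GCN `Graph` included further down in this diff, since only its `(K, V, V)` shape matters here.

```python
# Minimal sketch, assuming src/ is on PYTHONPATH; sizes are illustrative,
# not the values used by get_model_resgcn in src/common.py.
import torch

from models.ResGCNv1 import create
from models.st_gcn.utils.graph import Graph

# Spatial partitioning yields A with K=3 slices, matching the default
# kernel_size=[9, 2] (max_graph_distance=2) of ResGCN_Module.
graph = Graph(layout="coco", strategy="spatial")
A = torch.tensor(graph.A, dtype=torch.float32)        # (3, 17, 17)

# 'resgcn-n39-r8': Bottleneck blocks, structure [1, 2, 2, 2], reduction 8.
model = create(
    "resgcn-n39-r8",
    num_input=3,      # one input branch per feature stream (assumed)
    num_channel=3,    # channels per branch, e.g. (x, y, confidence) (assumed)
    num_class=128,    # embedding size, cf. --embedding_layer_size
    A=A,
)

x = torch.randn(8, 3, 3, 60, 17)   # (N, I, C, T, V): batch, branches, channels, frames, joints
emb = model(x)                     # L2-normalised embeddings, shape (8, 128)
```
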
+ +def init_param(modules): + for m in modules: + if isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + #m.bias = None + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, std=0.001) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +def zero_init_lastBN(modules): + for m in modules: + if isinstance(m, ResGCN_Module): + if hasattr(m.scn, 'bn_up'): + nn.init.constant_(m.scn.bn_up.weight, 0) + if hasattr(m.tcn, 'bn_up'): + nn.init.constant_(m.tcn.bn_up.weight, 0) diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..da2c068 --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1 @@ +from .ResGCNv1 import create diff --git a/src/models/st_gcn/__init__.py b/src/models/st_gcn/__init__.py new file mode 100644 index 0000000..9f9161b --- /dev/null +++ b/src/models/st_gcn/__init__.py @@ -0,0 +1 @@ +from . import utils \ No newline at end of file diff --git a/src/models/st_gcn/st_gcn.py b/src/models/st_gcn/st_gcn.py new file mode 100644 index 0000000..4c2cd21 --- /dev/null +++ b/src/models/st_gcn/st_gcn.py @@ -0,0 +1,180 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from models.st_gcn.utils.tgcn import ConvTemporalGraphical +from models.st_gcn.utils.graph import Graph + + +class STGCNEmbedding(nn.Module): + r"""Spatial temporal graph convolutional networks. + + Args: + in_channels (int): Number of channels in the input data + graph_args (dict): The arguments for building the graph + edge_importance_weighting (bool): If ``True``, adds a learnable + importance weighting to the edges of the graph + **kwargs (optional): Other parameters for graph convolution units + + Shape: + - Input: :math:`(N, in_channels, T_{in}, V_{in})` + - Output: :math:`(N, num_class)` where + :math:`N` is a batch size, + :math:`T_{in}` is a length of input sequence, + :math:`V_{in}` is the number of graph nodes + """ + + def __init__(self, in_channels, graph_args, edge_importance_weighting=False, temporal_kernel_size=9, + embedding_layer_size=256, **kwargs): + super().__init__() + + # load graph + self.graph = Graph(**graph_args) + A = torch.tensor(self.graph.A, dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + + # build networks + spatial_kernel_size = A.size(0) + # temporal_kernel_size = 9 + + kernel_size = (temporal_kernel_size, spatial_kernel_size) + + self.data_bn = nn.BatchNorm1d(in_channels * A.size(1)) + kwargs0 = {k: v for k, v in kwargs.items() if k != 'dropout'} + + self.st_gcn_networks = nn.ModuleList(( + st_gcn(in_channels, 64, kernel_size, 1, residual=False, **kwargs0), + st_gcn(64, 64, kernel_size, 1, **kwargs), + st_gcn(64, 64, kernel_size, 1, **kwargs), + st_gcn(64, 64, kernel_size, 1, **kwargs), + st_gcn(64, 128, kernel_size, 2, **kwargs), + st_gcn(128, 128, kernel_size, 1, **kwargs), + st_gcn(128, 128, kernel_size, 1, **kwargs), + st_gcn(128, 256, kernel_size, 2, **kwargs), + # st_gcn(256, 256, kernel_size, 1, **kwargs), + st_gcn(256, 256, kernel_size, 1, **kwargs), + )) + + # initialize parameters for edge importance weighting + if edge_importance_weighting: + self.edge_importance = nn.ParameterList([ + nn.Parameter(torch.ones(self.A.size()), requires_grad=True) + for _ in self.st_gcn_networks + ]) + else: + self.edge_importance = [1] * len(self.st_gcn_networks) + 
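+ # When edge_importance_weighting is enabled, each st_gcn block above owns a
+ # learnable (K, V, V) mask that forward() multiplies onto the adjacency as
+ # self.A * importance; otherwise the constant 1 leaves A unchanged.
+ # The 1x1 convolution below projects the 256-channel ST-GCN output to the
+ # requested embedding_layer_size before L2 normalisation.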
+ self.fcn = nn.Conv2d(256, embedding_layer_size, kernel_size=1) + + def forward(self, x, hint=None): + # data normalization + N, C, T, V = x.size() + x = x.permute(0, 3, 1, 2).contiguous() + x = x.view(N, V * C, T) + + x = self.data_bn(x) + x = x.view(N, V, C, T) + + x = x.permute(0, 2, 3, 1).contiguous() + x = x.view(N, C, T, V) + # forward + for gcn, importance in zip(self.st_gcn_networks, self.edge_importance): + x, _ = gcn(x, self.A * importance) + + # Adding average pooling as in the original model + x = F.avg_pool2d(x, x.size()[2:]) + + feature = self.fcn(x) + + # L2 normalization + feature = F.normalize(feature, dim=1, p=2) + + feature = feature.view(N, -1) + + return feature + + # Alias for model.forward() + def get_embedding(self, x): + return self.forward(x) + + +class st_gcn(nn.Module): + r"""Applies a spatial temporal graph convolution over an input graph sequence. + + Args: + in_channels (int): Number of channels in the input sequence data + out_channels (int): Number of channels produced by the convolution + kernel_size (tuple): Size of the temporal convolving kernel and graph convolving kernel + stride (int, optional): Stride of the temporal convolution. Default: 1 + dropout (int, optional): Dropout rate of the final output. Default: 0 + residual (bool, optional): If ``True``, applies a residual mechanism. Default: ``True`` + + Shape: + - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format + - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format + - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format + - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format + + where + :math:`N` is a batch size, + :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`, + :math:`T_{in}/T_{out}` is a length of input/output sequence, + :math:`V` is the number of graph nodes. 
+ + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dropout=0, + residual=True): + super().__init__() + + assert len(kernel_size) == 2 + assert kernel_size[0] % 2 == 1 + padding = ((kernel_size[0] - 1) // 2, 0) + + self.gcn = ConvTemporalGraphical(in_channels, out_channels, + kernel_size[1]) + + self.tcn = nn.Sequential( + nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True), + nn.Conv2d( + out_channels, + out_channels, + (kernel_size[0], 1), + (stride, 1), + padding, + ), + nn.BatchNorm2d(out_channels), + nn.Dropout(dropout, inplace=True), + ) + + if not residual: + self.residual = lambda x: 0 + + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + + else: + self.residual = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=(stride, 1)), + nn.BatchNorm2d(out_channels), + ) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, A): + + res = self.residual(x) + x, A = self.gcn(x, A) + x = self.tcn(x) + res + + return self.relu(x), A diff --git a/src/models/st_gcn/utils/__init__.py b/src/models/st_gcn/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/st_gcn/utils/graph.py b/src/models/st_gcn/utils/graph.py new file mode 100644 index 0000000..649a9e1 --- /dev/null +++ b/src/models/st_gcn/utils/graph.py @@ -0,0 +1,207 @@ +import numpy as np + + +class Graph: + """ The Graph to model the skeletons extracted by the openpose + + Args: + strategy (string): must be one of the follow candidates + - uniform: Uniform Labeling + - distance: Distance Partitioning + - spatial: Spatial Configuration + For more information, please refer to the section 'Partition Strategies' + in our paper (https://arxiv.org/abs/1801.07455). + + layout (string): must be one of the follow candidates + - openpose: Is consists of 18 joints. For more information, please + refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose#output + - ntu-rgb+d: Is consists of 25 joints. 
For more information, please + refer to https://github.com/shahroudy/NTURGB-D + + max_hop (int): the maximal distance between two connected nodes + dilation (int): controls the spacing between the kernel points + + """ + + def __init__(self, + layout='coco', + strategy='uniform', + max_hop=1, + dilation=1): + self.max_hop = max_hop + self.dilation = dilation + + self.get_edge(layout) + self.hop_dis = get_hop_distance( + self.num_node, self.edge, max_hop=max_hop) + self.get_adjacency(strategy) + + def __str__(self): + return self.A + + def get_edge(self, layout): + if layout == 'openpose': + self.num_node = 18 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_link = [(4, 3), (3, 2), (7, 6), (6, 5), (13, 12), (12, 11), + (10, 9), (9, 8), (11, 5), (8, 2), (5, 1), (2, 1), + (0, 1), (15, 0), (14, 0), (17, 15), (16, 14)] + self.edge = self_link + neighbor_link + self.center = 1 + elif layout == 'ntu-rgb+d': + self.num_node = 25 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_base = [(1, 2), (2, 21), (3, 21), (4, 3), (5, 21), + (6, 5), (7, 6), (8, 7), (9, 21), (10, 9), + (11, 10), (12, 11), (13, 1), (14, 13), (15, 14), + (16, 15), (17, 1), (18, 17), (19, 18), (20, 19), + (22, 23), (23, 8), (24, 25), (25, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_base] + self.edge = self_link + neighbor_link + self.center = 21 - 1 + elif layout == 'ntu_edge': + self.num_node = 24 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_base = [(1, 2), (3, 2), (4, 3), (5, 2), (6, 5), (7, 6), + (8, 7), (9, 2), (10, 9), (11, 10), (12, 11), + (13, 1), (14, 13), (15, 14), (16, 15), (17, 1), + (18, 17), (19, 18), (20, 19), (21, 22), (22, 8), + (23, 24), (24, 12)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_base] + self.edge = self_link + neighbor_link + self.center = 2 + elif layout == 'coco': + # keypoints = { + # 0: "nose", + # 1: "left_eye", + # 2: "right_eye", + # 3: "left_ear", + # 4: "right_ear", + # 5: "left_shoulder", + # 6: "right_shoulder", + # 7: "left_elbow", + # 8: "right_elbow", + # 9: "left_wrist", + # 10: "right_wrist", + # 11: "left_hip", + # 12: "right_hip", + # 13: "left_knee", + # 14: "right_knee", + # 15: "left_ankle", + # 16: "right_ankle" + # } + self.num_node = 17 + self_link = [(i, i) for i in range(self.num_node)] + neighbor_base = [(0,1), (0,2), (1,3), (2,4), (3,5), (4,6), (5,6), + (5,7), (7,9), (6,8), (8,10), (5,11), (6, 12), (11, 12), + (11, 13), (13, 15), (12, 14), (14, 16)] + neighbor_link = [(i - 1, j - 1) for (i, j) in neighbor_base] + self.edge = self_link + neighbor_link + self.center = 0 + elif layout == 'nonlocal-coco': + self.num_node = 17 + self_link = [(i, i) for i in range(self.num_node)] + edge=[] + for i in range(0, self.num_node ): + for j in range(0, self.num_node ): + edge.append((i, j)) + self.edge = edge + self.center = 1 + # elif layout=='customer settings' + # pass + else: + raise ValueError("Do Not Exist This Layout.") + + def get_adjacency(self, strategy): + valid_hop = range(0, self.max_hop + 1, self.dilation) + adjacency = np.zeros((self.num_node, self.num_node)) + for hop in valid_hop: + adjacency[self.hop_dis == hop] = 1 + + normalize_adjacency = normalize_digraph(adjacency) + + if strategy == 'uniform': + A = np.zeros((1, self.num_node, self.num_node)) + A[0] = normalize_adjacency + self.A = A + elif strategy == 'distance': + A = np.zeros((len(valid_hop), self.num_node, self.num_node)) + for i, hop in enumerate(valid_hop): + A[i][self.hop_dis == hop] = normalize_adjacency[self.hop_dis == + hop] 
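+ # Distance partitioning: one (V, V) slice of A per hop distance in valid_hop,
+ # each filled from the degree-normalised adjacency computed above.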
+ self.A = A + elif strategy == 'spatial': + A = [] + for hop in valid_hop: + a_root = np.zeros((self.num_node, self.num_node)) + a_close = np.zeros((self.num_node, self.num_node)) + a_further = np.zeros((self.num_node, self.num_node)) + for i in range(self.num_node): + for j in range(self.num_node): + if self.hop_dis[j, i] == hop: + if self.hop_dis[j, self.center] == self.hop_dis[ + i, self.center]: + a_root[j, i] = normalize_adjacency[j, i] + elif self.hop_dis[j, self. + center] > self.hop_dis[i, self. + center]: + a_close[j, i] = normalize_adjacency[j, i] + else: + a_further[j, i] = normalize_adjacency[j, i] + if hop == 0: + A.append(a_root) + else: + A.append(a_root + a_close) + A.append(a_further) + A = np.stack(A) + self.A = A + else: + raise ValueError("Do Not Exist This Strategy") + + +def get_hop_distance(num_node, edge, max_hop=1): + A = np.zeros((num_node, num_node)) + for i, j in edge: + A[j, i] = 1 + A[i, j] = 1 + + # compute hop steps + hop_dis = np.zeros((num_node, num_node)) + np.inf + transfer_mat = [np.linalg.matrix_power(A, d) for d in range(max_hop + 1)] + arrive_mat = (np.stack(transfer_mat) > 0) + for d in range(max_hop, -1, -1): + hop_dis[arrive_mat[d]] = d + return hop_dis + + +def normalize_digraph(A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-1) + AD = np.dot(A, Dn) + return AD + + +def normalize_undigraph(A): + Dl = np.sum(A, 0) + num_node = A.shape[0] + Dn = np.zeros((num_node, num_node)) + for i in range(num_node): + if Dl[i] > 0: + Dn[i, i] = Dl[i]**(-0.5) + DAD = np.dot(np.dot(Dn, A), Dn) + return DAD + + +if __name__ == '__main__': + import matplotlib.pyplot as plt + + out = Graph(strategy='spatial', layout='coco').A + for a in out: + plt.imshow(a, cmap='gray') + plt.show() + print(out) + diff --git a/src/models/st_gcn/utils/tgcn.py b/src/models/st_gcn/utils/tgcn.py new file mode 100644 index 0000000..f51ae28 --- /dev/null +++ b/src/models/st_gcn/utils/tgcn.py @@ -0,0 +1,67 @@ +# The based unit of graph convolutional networks. + +import torch +import torch.nn as nn + + +class ConvTemporalGraphical(nn.Module): + + r"""The basic module for applying a graph convolution. + + Args: + in_channels (int): Number of channels in the input sequence data + out_channels (int): Number of channels produced by the convolution + kernel_size (int): Size of the graph convolving kernel + t_kernel_size (int): Size of the temporal convolving kernel + t_stride (int, optional): Stride of the temporal convolution. Default: 1 + t_padding (int, optional): Temporal zero-padding added to both sides of + the input. Default: 0 + t_dilation (int, optional): Spacing between temporal kernel elements. + Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the output. + Default: ``True`` + + Shape: + - Input[0]: Input graph sequence in :math:`(N, in_channels, T_{in}, V)` format + - Input[1]: Input graph adjacency matrix in :math:`(K, V, V)` format + - Output[0]: Output graph sequence in :math:`(N, out_channels, T_{out}, V)` format + - Output[1]: Graph adjacency matrix for output data in :math:`(K, V, V)` format + + where + :math:`N` is a batch size, + :math:`K` is the spatial kernel size, as :math:`K == kernel_size[1]`, + :math:`T_{in}/T_{out}` is a length of input/output sequence, + :math:`V` is the number of graph nodes. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + t_kernel_size=1, + t_stride=1, + t_padding=0, + t_dilation=1, + bias=True): + super().__init__() + + self.kernel_size = kernel_size + self.conv = nn.Conv2d( + in_channels, + out_channels * kernel_size, + kernel_size=(t_kernel_size, 1), + padding=(t_padding, 0), + stride=(t_stride, 1), + dilation=(t_dilation, 1), + bias=bias) + + def forward(self, x, A): + assert A.size(0) == self.kernel_size + + x = self.conv(x) + + n, kc, t, v = x.size() + x = x.view(n, self.kernel_size, kc//self.kernel_size, t, v) + x = torch.einsum('nkctv,kvw->nctw', (x, A)) + + return x.contiguous(), A diff --git a/src/pose_estimator/README.md b/src/pose_estimator/README.md new file mode 100644 index 0000000..b0cb770 --- /dev/null +++ b/src/pose_estimator/README.md @@ -0,0 +1,2 @@ +## Human Pose Estimation: HRNet +This part is borrowed from [HRNet/HRNet-Human-Pose-Estimation](https://github.com/HRNet/HRNet-Human-Pose-Estimation) diff --git a/src/pose_estimator/config.py b/src/pose_estimator/config.py new file mode 100644 index 0000000..9433cc1 --- /dev/null +++ b/src/pose_estimator/config.py @@ -0,0 +1,155 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +import os + +from yacs.config import CfgNode as CN + + +_C = CN() + +_C.OUTPUT_DIR = '' +_C.LOG_DIR = '' +_C.DATA_DIR = '' +_C.GPUS = (0,) +_C.WORKERS = 4 +_C.PRINT_FREQ = 20 +_C.AUTO_RESUME = False +_C.PIN_MEMORY = True +_C.RANK = 0 + +# Cudnn related params +_C.CUDNN = CN() +_C.CUDNN.BENCHMARK = True +_C.CUDNN.DETERMINISTIC = False +_C.CUDNN.ENABLED = True + +# common params for NETWORK +_C.MODEL = CN() +_C.MODEL.NAME = 'pose_hrnet' +_C.MODEL.INIT_WEIGHTS = True +_C.MODEL.PRETRAINED = '' +_C.MODEL.NUM_JOINTS = 17 +_C.MODEL.TAG_PER_JOINT = True +_C.MODEL.TARGET_TYPE = 'gaussian' +_C.MODEL.IMAGE_SIZE = [256, 256] # width * height, ex: 192 * 256 +_C.MODEL.HEATMAP_SIZE = [64, 64] # width * height, ex: 24 * 32 +_C.MODEL.SIGMA = 2 +_C.MODEL.EXTRA = CN(new_allowed=True) + +_C.LOSS = CN() +_C.LOSS.USE_OHKM = False +_C.LOSS.TOPK = 8 +_C.LOSS.USE_TARGET_WEIGHT = True +_C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False + +# DATASET related params +_C.DATASET = CN() +_C.DATASET.ROOT = '' +_C.DATASET.DATASET = 'mpii' +_C.DATASET.TRAIN_SET = 'train' +_C.DATASET.TEST_SET = 'valid' +_C.DATASET.DATA_FORMAT = 'jpg' +_C.DATASET.HYBRID_JOINTS_TYPE = '' +_C.DATASET.SELECT_DATA = False + +# training data augmentation +_C.DATASET.FLIP = True +_C.DATASET.SCALE_FACTOR = 0.25 +_C.DATASET.ROT_FACTOR = 30 +_C.DATASET.PROB_HALF_BODY = 0.0 +_C.DATASET.NUM_JOINTS_HALF_BODY = 8 +_C.DATASET.COLOR_RGB = False + +# train +_C.TRAIN = CN() + +_C.TRAIN.LR_FACTOR = 0.1 +_C.TRAIN.LR_STEP = [90, 110] +_C.TRAIN.LR = 0.001 + +_C.TRAIN.OPTIMIZER = 'adam' +_C.TRAIN.MOMENTUM = 0.9 +_C.TRAIN.WD = 0.0001 +_C.TRAIN.NESTEROV = False +_C.TRAIN.GAMMA1 = 0.99 +_C.TRAIN.GAMMA2 = 0.0 + +_C.TRAIN.BEGIN_EPOCH = 0 +_C.TRAIN.END_EPOCH = 140 + +_C.TRAIN.RESUME = False +_C.TRAIN.CHECKPOINT = '' + +_C.TRAIN.BATCH_SIZE_PER_GPU = 32 +_C.TRAIN.SHUFFLE = True + +# testing +_C.TEST = CN() + +# size of images for each device +_C.TEST.BATCH_SIZE_PER_GPU = 32 +# Test Model Epoch +_C.TEST.FLIP_TEST = False +_C.TEST.POST_PROCESS = False +_C.TEST.SHIFT_HEATMAP = False + +_C.TEST.USE_GT_BBOX = False + +# nms +_C.TEST.IMAGE_THRE = 0.1 
+_C.TEST.NMS_THRE = 0.6 +_C.TEST.SOFT_NMS = False +_C.TEST.OKS_THRE = 0.5 +_C.TEST.IN_VIS_THRE = 0.0 +_C.TEST.COCO_BBOX_FILE = '' +_C.TEST.BBOX_THRE = 1.0 +_C.TEST.MODEL_FILE = '' + +# debug +_C.DEBUG = CN() +_C.DEBUG.DEBUG = False +_C.DEBUG.SAVE_BATCH_IMAGES_GT = False +_C.DEBUG.SAVE_BATCH_IMAGES_PRED = False +_C.DEBUG.SAVE_HEATMAPS_GT = False +_C.DEBUG.SAVE_HEATMAPS_PRED = False + + +def update_config(cfg, args): + cfg.defrost() + cfg.merge_from_file(args.cfg) + cfg.merge_from_list(args.opt) + + if args.modelDir: + cfg.OUTPUT_DIR = args.modelDir + + if args.logDir: + cfg.LOG_DIR = args.logDir + + if args.dataDir: + cfg.DATA_DIR = args.dataDir + + cfg.DATASET.ROOT = os.path.join( + cfg.DATA_DIR, cfg.DATASET.ROOT + ) + + cfg.MODEL.PRETRAINED = os.path.join( + cfg.DATA_DIR, cfg.MODEL.PRETRAINED + ) + + if cfg.TEST.MODEL_FILE: + cfg.TEST.MODEL_FILE = os.path.join( + cfg.DATA_DIR, cfg.TEST.MODEL_FILE + ) + + cfg.freeze() + + +if __name__ == '__main__': + import sys + with open(sys.argv[1], 'w') as f: + print(_C, file=f) + diff --git a/src/pose_estimator/inference-config.yaml b/src/pose_estimator/inference-config.yaml new file mode 100644 index 0000000..9e57cf2 --- /dev/null +++ b/src/pose_estimator/inference-config.yaml @@ -0,0 +1,127 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + PRETRAINED: 'models/pytorch/imagenet/hrnet_w32-36af842e.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 288 + - 384 + HEATMAP_SIZE: + - 72 + - 96 + SIGMA: 3 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/src/pose_estimator/model_hrnet.py b/src/pose_estimator/model_hrnet.py new file mode 100644 index 0000000..4a0ebe3 --- /dev/null +++ b/src/pose_estimator/model_hrnet.py @@ -0,0 +1,497 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) 
Microsoft +# Licensed under the MIT License. +# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +import os +import logging + +import torch +import torch.nn as nn + + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class HighResolutionModule(nn.Module): + def __init__(self, num_branches, blocks, num_blocks, num_inchannels, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_inchannels, num_channels) + + self.num_inchannels = num_inchannels + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(True) + + def _check_branches(self, num_branches, blocks, num_blocks, + num_inchannels, num_channels): + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( + num_branches, len(num_blocks)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( + num_branches, len(num_channels)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_inchannels): + error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( + num_branches, len(num_inchannels)) + logger.error(error_msg) + raise 
ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.num_inchannels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d( + num_channels[branch_index] * block.expansion, + momentum=BN_MOMENTUM + ), + ) + + layers = [] + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index], + stride, + downsample + ) + ) + self.num_inchannels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index] + ) + ) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels) + ) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + num_inchannels = self.num_inchannels + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_inchannels[i], + 1, 1, 0, bias=False + ), + nn.BatchNorm2d(num_inchannels[i]), + nn.Upsample(scale_factor=2**(j-i), mode='nearest') + ) + ) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i-j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3) + ) + ) + else: + num_outchannels_conv3x3 = num_inchannels[j] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3), + nn.ReLU(True) + ) + ) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + return self.num_inchannels + + def forward(self, x): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + else: + y = y + self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class PoseHighResolutionNet(nn.Module): + + def __init__(self, cfg, **kwargs): + self.inplanes = 64 + extra = cfg['MODEL']['EXTRA'] + super(PoseHighResolutionNet, self).__init__() + + # stem net + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self._make_layer(Bottleneck, 64, 4) + + self.stage2_cfg = 
extra['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition1 = self._make_transition_layer([256], num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + self.stage3_cfg = extra['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition2 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + self.stage4_cfg = extra['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition3 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=False) + + self.final_layer = nn.Conv2d( + in_channels=pre_stage_channels[0], + out_channels=cfg['MODEL']['NUM_JOINTS'], + kernel_size=extra['FINAL_CONV_KERNEL'], + stride=1, + padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0 + ) + + self.pretrained_layers = extra['PRETRAINED_LAYERS'] + + def _make_transition_layer( + self, num_channels_pre_layer, num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + nn.Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, 1, 1, bias=False + ), + nn.BatchNorm2d(num_channels_cur_layer[i]), + nn.ReLU(inplace=True) + ) + ) + else: + transition_layers.append(None) + else: + conv3x3s = [] + for j in range(i+1-num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = num_channels_cur_layer[i] \ + if j == i-num_branches_pre else inchannels + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + inchannels, outchannels, 3, 2, 1, bias=False + ), + nn.BatchNorm2d(outchannels), + nn.ReLU(inplace=True) + ) + ) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_inchannels, + multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last 
module
+            if not multi_scale_output and i == num_modules - 1:
+                reset_multi_scale_output = False
+            else:
+                reset_multi_scale_output = True
+
+            modules.append(
+                HighResolutionModule(
+                    num_branches,
+                    block,
+                    num_blocks,
+                    num_inchannels,
+                    num_channels,
+                    fuse_method,
+                    reset_multi_scale_output
+                )
+            )
+            num_inchannels = modules[-1].get_num_inchannels()
+
+        return nn.Sequential(*modules), num_inchannels
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu(x)
+        x = self.layer1(x)
+
+        x_list = []
+        for i in range(self.stage2_cfg['NUM_BRANCHES']):
+            if self.transition1[i] is not None:
+                x_list.append(self.transition1[i](x))
+            else:
+                x_list.append(x)
+        y_list = self.stage2(x_list)
+
+        x_list = []
+        for i in range(self.stage3_cfg['NUM_BRANCHES']):
+            if self.transition2[i] is not None:
+                x_list.append(self.transition2[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage3(x_list)
+
+        x_list = []
+        for i in range(self.stage4_cfg['NUM_BRANCHES']):
+            if self.transition3[i] is not None:
+                x_list.append(self.transition3[i](y_list[-1]))
+            else:
+                x_list.append(y_list[i])
+        y_list = self.stage4(x_list)
+
+        x = self.final_layer(y_list[0])
+
+        return x
+
+    def init_weights(self, pretrained=''):
+        logger.info('=> init weights from normal distribution')
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                nn.init.normal_(m.weight, std=0.001)
+                for name, _ in m.named_parameters():
+                    if name in ['bias']:
+                        nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.ConvTranspose2d):
+                nn.init.normal_(m.weight, std=0.001)
+                for name, _ in m.named_parameters():
+                    if name in ['bias']:
+                        nn.init.constant_(m.bias, 0)
+
+        if os.path.isfile(pretrained):
+            pretrained_state_dict = torch.load(pretrained)
+            logger.info('=> loading pretrained model {}'.format(pretrained))
+
+            need_init_state_dict = {}
+            for name, m in pretrained_state_dict.items():
+                if name.split('.')[0] in self.pretrained_layers \
+                        or self.pretrained_layers[0] == '*':
+                    need_init_state_dict[name] = m
+            self.load_state_dict(need_init_state_dict, strict=False)
+        elif pretrained:
+            logger.error('=> please download pre-trained models first!')
+            raise ValueError('{} does not exist!'.format(pretrained))
+
+
+def get_pose_net(cfg, is_train, **kwargs):
+    model = PoseHighResolutionNet(cfg, **kwargs)
+
+    if is_train and cfg['MODEL']['INIT_WEIGHTS']:
+        model.init_weights(cfg['MODEL']['PRETRAINED'])
+
+    return model
diff --git a/src/pose_estimator/pose_estimator_hrnet.py b/src/pose_estimator/pose_estimator_hrnet.py
new file mode 100644
index 0000000..c0a4ca2
--- /dev/null
+++ b/src/pose_estimator/pose_estimator_hrnet.py
@@ -0,0 +1,114 @@
+import argparse
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
+
+from datasets.preparation import box_to_center_scale
+from pose_estimator import model_hrnet
+from pose_estimator.config import _C as config, update_config
+from utils import *
+
+
+class PoseEstimatorHRNet:
+    def __init__(self,
+                 config_path='inference-config.yaml',
+                 weights_path='../../models/pose_hrnet_w32_384x288.pth'):
+        self.config_path = config_path
+        self.weights_path = weights_path
+
+        # Options expected by update_config(), built locally so the class also works when imported.
+        args = argparse.Namespace()
+        args.cfg = self.config_path
+        args.opt = []
+        args.modelDir = ''
+        args.logDir = ''
+        args.dataDir = ''
+        args.prevModelDir = ''
+
+        update_config(config,
args) + self.config = config + + self.model = model_hrnet.get_pose_net(config, is_train=False) + self.model.load_state_dict(torch.load(weights_path), strict=False) + self.model = torch.nn.DataParallel(self.model).cuda() + + self.model.eval() # Set in evaluation mode + + def estimate_pose_from_image(self, img, box): + center, scale = box_to_center_scale(box, config.MODEL.IMAGE_SIZE[0], config.MODEL.IMAGE_SIZE[1]) + + rotation = 0 + + # pose estimation transformation + trans = get_affine_transform(center, scale, rotation, config.MODEL.IMAGE_SIZE) + model_input = cv2.warpAffine( + img, + trans, + (int(config.MODEL.IMAGE_SIZE[0]), int(config.MODEL.IMAGE_SIZE[1])), + flags=cv2.INTER_LINEAR) + + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]) + + # pose estimation inference + input_img = transform(model_input).unsqueeze(0) + + with torch.no_grad(): + # compute output heatmap + output = self.model(input_img) + preds, _ = get_final_preds( + config, + output.clone().cpu().numpy(), + np.asarray([center]), + np.asarray([scale])) + + return preds + + +if __name__ == "__main__": + import argparse + import matplotlib.pyplot as plt + import matplotlib.patches as patches + from matplotlib.ticker import NullLocator + + parser = argparse.ArgumentParser() + # general + parser.add_argument('--cfg', type=str, default='inference-config.yaml') + parser.add_argument('opt', help='Modify config options using the command-line', default=None, + nargs=argparse.REMAINDER) + + args = parser.parse_args() + pose_estimator = PoseEstimatorHRNet() + + img = np.array(Image.open('../data/samples/messi.jpg')) + boxes = [ + [17.860302, 26.873545, 824.93115, 694.90466], + [1202.5271, 475.52982, 88.31201, 215.9581], + [648.0603, 104.8192, 492.93066, 621.0242] + ] + + # Create plot + plt.figure() + fig, ax = plt.subplots(1) + ax.imshow(img) + + for box in boxes: + pose_predictions = pose_estimator.estimate_pose_from_image(img, box) + for _, mat in enumerate(pose_predictions[0]): + x, y = int(mat[0]), int(mat[1]) + circle = patches.Circle((x, y), radius=5, linewidth=2, edgecolor=(1, 0, 0), facecolor="none") + # Add the pose points to the plot + ax.add_patch(circle) + + x1, y1, box_w, box_h = box + bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=(0, 1, 0), facecolor="none") + # Add the bbox to the plot + ax.add_patch(bbox) + + plt.axis("off") + plt.gca().xaxis.set_major_locator(NullLocator()) + plt.gca().yaxis.set_major_locator(NullLocator()) + plt.show() diff --git a/src/pose_estimator/utils.py b/src/pose_estimator/utils.py new file mode 100644 index 0000000..037d731 --- /dev/null +++ b/src/pose_estimator/utils.py @@ -0,0 +1,138 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# ------------------------------------------------------------------------------ + +import math + +import numpy as np +import cv2 + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def get_affine_transform( + center, scale, rot, output_size, + shift=np.array([0, 0], dtype=np.float32), inv=0 +): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + print(scale) + scale = np.array([scale, scale]) + + scale_tmp = scale * 200.0 + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def get_max_preds(batch_heatmaps): + """ + get predictions from score maps + heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) + """ + assert isinstance(batch_heatmaps, np.ndarray), \ + 'batch_heatmaps should be numpy.ndarray' + assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + batch_size = batch_heatmaps.shape[0] + num_joints = batch_heatmaps.shape[1] + width = batch_heatmaps.shape[3] + heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 2) + maxvals = np.amax(heatmaps_reshaped, 2) + + maxvals = maxvals.reshape((batch_size, num_joints, 1)) + idx = idx.reshape((batch_size, num_joints, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + + preds[:, :, 0] = (preds[:, :, 0]) % width + preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + return preds, maxvals + + +def get_final_preds(config, batch_heatmaps, center, scale): + coords, maxvals = get_max_preds(batch_heatmaps) + + heatmap_height = batch_heatmaps.shape[2] + heatmap_width = batch_heatmaps.shape[3] + + # post-processing + if config.TEST.POST_PROCESS: + for n in range(coords.shape[0]): + for p in range(coords.shape[1]): + hm = batch_heatmaps[n][p] + px = int(math.floor(coords[n][p][0] + 0.5)) + py = int(math.floor(coords[n][p][1] + 0.5)) + if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1: + diff = np.array( + [ + hm[py][px+1] - hm[py][px-1], + 
hm[py+1][px]-hm[py-1][px] + ] + ) + coords[n][p] += np.sign(diff) * .25 + + preds = coords.copy() + + # Transform back + for i in range(coords.shape[0]): + preds[i] = transform_preds( + coords[i], center[i], scale[i], [heatmap_width, heatmap_height] + ) + + return preds, maxvals diff --git a/src/preparation/prepare_detection.py b/src/preparation/prepare_detection.py new file mode 100644 index 0000000..d600c4c --- /dev/null +++ b/src/preparation/prepare_detection.py @@ -0,0 +1,82 @@ +import csv + +import torch +from torch.utils.data import DataLoader + +from tqdm import tqdm + +from datasets import DatasetSimple +from detector.detector_yolov3 import DetectorYOLOv3 +from detector.detector_utils import preprocess_image +from detector.utils import non_max_suppression, rescale_boxes + +Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor + + +def detection(dataset_base_path, image_list, output_file): + dataset = DatasetSimple( + dataset_base_path, image_list, transform=preprocess_image + ) + data_loader = DataLoader(dataset, batch_size=30, shuffle=False, num_workers=8) + print(f"Data loaded: {len(data_loader)} batches") + + file = open(output_file, "w") + writer = csv.writer(file) + writer.writerow(["image_name", "x", "y", "w", "h"]) + + detector = DetectorYOLOv3( + model_def="../detector/config/yolov3.cfg", + weights_path="../../models/yolov3.weights", + ) + + human_candidates = dict() + for i, data in tqdm(enumerate(data_loader), total=len(data_loader)): + imgs = data[0].squeeze() + names = data[1] + + # Configure input + input_imgs = torch.autograd.Variable(imgs.type(Tensor)) + + # Get detections + with torch.no_grad(): + detections = detector.model(input_imgs) + detections = non_max_suppression( + detections, detector.conf_thres, detector.nms_thres + ) + + for j in range(imgs.shape[0]): + human_candidates[names[j]] = list() + if detections[j] is None: + continue + + detection = detections[j].data.cpu().numpy() + detection = rescale_boxes(detection, detector.img_size, (240, 320)) + + for x1, y1, x2, y2, conf, cls_conf, cls_pred in detection: + box_w = x2 - x1 + box_h = y2 - y1 + + if int(cls_pred) == 0: + human_candidates[names[j]].append([x1, y1, box_w, box_h]) + + if len(human_candidates[names[j]]) < 1: + print( + f"{names[j]}: Invalid detections ({len(human_candidates[names[j]])}), skipping" + ) + continue + + writer.writerow([names[j]] + human_candidates[names[j]][0]) + + file.close() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Detect people in dataset") + parser.add_argument("dataset_base_path") + parser.add_argument("image_list") + parser.add_argument("output_file") + + args = parser.parse_args() + detection(**vars(args)) diff --git a/src/preparation/prepare_pose_estimation.py b/src/preparation/prepare_pose_estimation.py new file mode 100644 index 0000000..5525d43 --- /dev/null +++ b/src/preparation/prepare_pose_estimation.py @@ -0,0 +1,77 @@ +import csv +import itertools + +import torch +from torch.utils.data import DataLoader +import torchvision.transforms as transforms + +from tqdm import tqdm + +from datasets import DatasetDetections, CropToBox +from pose_estimator.pose_estimator_hrnet import PoseEstimatorHRNet +from pose_estimator.utils import * +from visualization.utils import keypoints + +Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor + + +def pose_estimation(dataset_base_path, detection_list, output_file): + + pose_estimator = PoseEstimatorHRNet( + 
        config_path="../pose_estimator/inference-config.yaml",
+        weights_path="../../models/pose_hrnet_w32_384x288.pth",
+    )
+    transform_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+
+    dataset = DatasetDetections(
+        dataset_base_path,
+        detection_list,
+        sample_transform=CropToBox(pose_estimator.config),
+        transform=transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transform_normalize,
+            ]
+        ),
+    )
+    data_loader = DataLoader(dataset, batch_size=200, shuffle=False, num_workers=8)
+    print(f"Data loaded: {len(data_loader)} batches")
+
+    file = open(output_file, "w")
+    writer = csv.writer(file)
+    header = [[f"{k}_x", f"{k}_y", f"{k}_conf"] for k in keypoints.values()]
+    writer.writerow(["image_name"] + list(itertools.chain.from_iterable(header)))
+
+    poses = dict()
+    for i, data in tqdm(enumerate(data_loader), total=len(data_loader)):
+        imgs = data[0].squeeze()
+        names = data[1]
+        centers, scales = data[2]
+
+        with torch.no_grad():
+            # compute output heatmap
+            output = pose_estimator.model(imgs)
+            preds, maxvals = get_final_preds(
+                pose_estimator.config,
+                output.clone().cpu().numpy(),
+                np.asarray(centers),
+                np.asarray(scales),
+            )
+
+        result = np.append(preds, maxvals, axis=2)
+
+        for j in range(imgs.shape[0]):
+            poses[names[j]] = result[j]
+            writer.writerow([names[j]] + list(result[j].reshape(-1)))
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Detect poses in dataset")
+    parser.add_argument("dataset_base_path")
+    parser.add_argument("detection_list", nargs="?", default='../../data/casia-b_detections.csv')
+    parser.add_argument("output_file", nargs="?", default='../../data/casia-b_pose_coco.csv')
+
+    args = parser.parse_args()
+    pose_estimation(**vars(args))
diff --git a/src/preparation/split_casia-b.py b/src/preparation/split_casia-b.py
new file mode 100644
index 0000000..f54f412
--- /dev/null
+++ b/src/preparation/split_casia-b.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import csv
+from tqdm import tqdm
+
+# Split CASIA-B by subject id:
+# training/validation set: first 74 ids (1-59 train, 60-74 validation)
+# test set: remaining 50 ids (75-124)
+
+skeletons = pd.read_csv("../../data/casia-b_pose_coco.csv")
+
+header = list(skeletons)
+
+ids_train = list(range(1, 60))
+ids_valid = list(range(60, 75))
+ids_test = list(range(75, 125))
+
+balancing = {
+    "nm": 1,
+    "cl": 3,
+    "bg": 3
+}
+
+# Store the different sets in lists according to the indexes assigned above
+data = {"train": [], "valid": [], "train_valid": [], "test": [],
+        "train_balanced": [], "valid_balanced": [], "train_valid_balanced": []}
+
+for skeleton in tqdm(skeletons.values.tolist()):
+    label = skeleton[0].split('/')[1].split('-')
+    p_id = int(label[0])
+    p_ws = label[1]
+
+    if p_id in ids_train:
+        data["train"].append(skeleton)
+        for _ in range(balancing[p_ws]):
+            data["train_balanced"].append(skeleton)
+
+    if p_id in ids_valid:
+        data["valid"].append(skeleton)
+        for _ in range(balancing[p_ws]):
+            data["valid_balanced"].append(skeleton)
+
+    if p_id in ids_valid or p_id in ids_train:
+        data["train_valid"].append(skeleton)
+        for _ in range(balancing[p_ws]):
+            data["train_valid_balanced"].append(skeleton)
+
+    if p_id in ids_test:
+        data["test"].append(skeleton)
+
+for split, lines in data.items():
+    print(f"Saving {split}...")
+    with open(f"../../data/casia-b_pose_{split}.csv", "w") as f:
+        writer = csv.writer(f)
+        writer.writerow(header)
+        for line in lines:
+            writer.writerow(line)
diff --git a/src/train.py b/src/train.py
new file mode 100644
index 0000000..88420be
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,245 @@
+import
sys +import time + +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +from torchvision import transforms +from ray import tune +from ray.tune.schedulers import HyperBandScheduler + +from datasets import dataset_factory +from datasets.augmentation import * +from datasets.graph import Graph +from evaluate import evaluate, _evaluate_casia_b +from losses import SupConLoss + +from common import * +from utils import AverageMeter + + +def train(train_loader, model, criterion, optimizer, scheduler, scaler, epoch, opt): + """one epoch training""" + model.train() + + batch_time = AverageMeter() + data_time = AverageMeter() + losses = AverageMeter() + + end = time.time() + for idx, (points, target) in enumerate(train_loader): + data_time.update(time.time() - end) + + points = torch.cat([points[0], points[1]], dim=0) + labels = target[0] + + if torch.cuda.is_available(): + points = points.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + bsz = labels.shape[0] + + with torch.cuda.amp.autocast(enabled=opt.use_amp): + # compute loss + features = model(points) + f1, f2 = torch.split(features, [bsz, bsz], dim=0) + features = torch.cat([f1.unsqueeze(1), f2.unsqueeze(1)], dim=1) + loss = criterion(features, labels) + + # update metric + losses.update(loss.item(), bsz) + + # SGD + scaler.scale(loss).backward() + scaler.step(optimizer) + scheduler.step() + scaler.update() + optimizer.zero_grad() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + # print info + if (idx + 1) % opt.log_interval == 0: + print( + f"Train: [{epoch}][{idx + 1}/{len(train_loader)}]\t" + f"BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t" + f"DT {data_time.val:.3f} ({data_time.avg:.3f})\t" + f"loss {losses.val:.3f} ({losses.avg:.3f})" + ) + sys.stdout.flush() + + return losses.avg + + +def main(opt): + opt = setup_environment(opt) + graph = Graph("coco") + + # Dataset + transform = transforms.Compose( + [ + MirrorPoses(opt.mirror_probability), + FlipSequence(opt.flip_probability), + RandomSelectSequence(opt.sequence_length), + ShuffleSequence(opt.shuffle), + PointNoise(std=opt.point_noise_std), + JointNoise(std=opt.joint_noise_std), + MultiInput(graph.connect_joint, opt.use_multi_branch), + ToTensor() + ], + ) + + dataset_class = dataset_factory(opt.dataset) + dataset = dataset_class( + opt.train_data_path, + train=True, + sequence_length=opt.sequence_length, + transform=TwoNoiseTransform(transform), + ) + + dataset_valid = dataset_class( + opt.valid_data_path, + sequence_length=opt.sequence_length, + transform=transforms.Compose( + [ + SelectSequenceCenter(opt.sequence_length), + MultiInput(graph.connect_joint, opt.use_multi_branch), + ToTensor() + ] + ), + ) + + train_loader = torch.utils.data.DataLoader( + dataset, + batch_size=opt.batch_size, + num_workers=opt.num_workers, + pin_memory=True, + shuffle=True, + ) + + val_loader = torch.utils.data.DataLoader( + dataset_valid, + batch_size=opt.batch_size_validation, + num_workers=opt.num_workers, + pin_memory=True, + ) + + # Model & criterion + model = get_model_resgcn(graph, opt) + criterion = SupConLoss(temperature=opt.temp) + + print("# parameters: ", count_parameters(model)) + + if opt.cuda: + model.cuda() + criterion.cuda() + + # Trainer + optimizer, scheduler, scaler = get_trainer(model, opt, len(train_loader)) + + # Load checkpoint or weights + load_checkpoint(model, optimizer, scheduler, scaler, opt) + + # Tensorboard + writer = SummaryWriter(log_dir=opt.tb_path) + + sample_input 
= torch.zeros(opt.batch_size, model_args["num_input"], model_args["num_channel"], + opt.sequence_length, graph.num_node).cuda() + writer.add_graph(model, input_to_model=sample_input) + + best_acc = 0 + loss = 0 + for epoch in range(opt.start_epoch, opt.epochs + 1): + # train for one epoch + time1 = time.time() + loss = train( + train_loader, model, criterion, optimizer, scheduler, scaler, epoch, opt + ) + + time2 = time.time() + print(f"epoch {epoch}, total time {time2 - time1:.2f}") + + # tensorboard logger + writer.add_scalar("loss/train", loss, epoch) + writer.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], epoch) + + # evaluation + result, accuracy_avg, sub_accuracies, dataframe = evaluate( + val_loader, model, opt.evaluation_fn, use_flip=True + ) + writer.add_text("accuracy/validation", dataframe.to_markdown(), epoch) + writer.add_scalar("accuracy/validation", accuracy_avg, epoch) + for key, sub_accuracy in sub_accuracies.items(): + writer.add_scalar(f"accuracy/validation/{key}", sub_accuracy, epoch) + + print(f"epoch {epoch}, avg accuracy {accuracy_avg:.4f}") + is_best = accuracy_avg > best_acc + if is_best: + best_acc = accuracy_avg + + if opt.tune: + tune.report(accuracy=accuracy_avg) + + if epoch % opt.save_interval == 0 or (is_best and epoch > opt.save_best_start * opt.epochs): + save_file = os.path.join(opt.save_folder, f"ckpt_epoch_{'best' if is_best else epoch}.pth") + save_model(model, optimizer, scheduler, scaler, opt, opt.epochs, save_file) + + # save the last model + save_file = os.path.join(opt.save_folder, "last.pth") + save_model(model, optimizer, scheduler, scaler, opt, opt.epochs, save_file) + + log_hyperparameter(writer, opt, best_acc, loss) + + print(f"best accuracy: {best_acc*100:.2f}") + + +def _inject_config(config): + opt_new = {k: config[k] if k in config.keys() else v for k, v in vars(opt).items()} + main(argparse.Namespace(**opt_new)) + + +def tune_(): + hyperband = HyperBandScheduler(metric="accuracy", mode="max") + + analysis = tune.run( + _inject_config, + config={}, + stop={"accuracy": 0.90, "training_iteration": 100}, + resources_per_trial={"gpu": 1}, + num_samples=10, + scheduler=hyperband + ) + + print("Best config: ", analysis.get_best_config(metric="accuracy", mode="max")) + + df = analysis.results_df + print(df) + + +if __name__ == "__main__": + import datetime + + opt = parse_option() + + date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + opt.model_name = f"{date}_{opt.dataset}_{opt.network_name}" \ + f"_lr_{opt.learning_rate}_decay_{opt.weight_decay}_bsz_{opt.batch_size}" + + if opt.exp_name: + opt.model_name += "_" + opt.exp_name + + opt.model_path = f"../save/supcon_{opt.dataset}_models" + opt.tb_path = f"../save/supcon_{opt.dataset}_tensorboard/{opt.model_name}" + + opt.save_folder = os.path.join(opt.model_path, opt.model_name) + if not os.path.isdir(opt.save_folder): + os.makedirs(opt.save_folder) + + opt.evaluation_fn = None + if opt.dataset == "casia-b": + opt.evaluation_fn = _evaluate_casia_b + + if opt.tune: + tune_() + else: + main(opt) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..0bdeab0 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,18 @@ + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count