From 7a6ba15980f043f317227dc2ff4da03c02f120c7 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Fri, 29 May 2020 18:43:32 +0200 Subject: [PATCH 01/20] Add predicted box as output --- tf2_yolov4/heads/yolov3_head.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tf2_yolov4/heads/yolov3_head.py b/tf2_yolov4/heads/yolov3_head.py index f3fa32f..ff28a6b 100644 --- a/tf2_yolov4/heads/yolov3_head.py +++ b/tf2_yolov4/heads/yolov3_head.py @@ -178,6 +178,7 @@ def yolov3_boxes_regression(feats_per_stage, anchors_per_stage): box_xy = tf.sigmoid(box_xy) objectness = tf.sigmoid(objectness) class_probs = tf.sigmoid(class_probs) + predicted_box = tf.concat((box_xy, box_wh), axis=-1) grid = tf.meshgrid(tf.range(grid_size_y), tf.range(grid_size_x)) grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2) # [gy, gx, 1, 2] @@ -191,7 +192,7 @@ def yolov3_boxes_regression(feats_per_stage, anchors_per_stage): box_x2y2 = box_xy + box_wh / 2 bbox = tf.concat([box_x1y1, box_x2y2], axis=-1) - return bbox, objectness, class_probs + return bbox, objectness, class_probs, predicted_box def yolo_nms(yolo_feats, yolo_max_boxes, yolo_iou_threshold, yolo_score_threshold): From 6290708aa11213a4c86803a4bf966380e0981aee Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Fri, 29 May 2020 18:43:40 +0200 Subject: [PATCH 02/20] Add ugly training script --- scripts/train.py | 201 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 scripts/train.py diff --git a/scripts/train.py b/scripts/train.py new file mode 100644 index 0000000..65070e2 --- /dev/null +++ b/scripts/train.py @@ -0,0 +1,201 @@ +""" +Training script for Pascal VOC using tf2-yolov4 +""" +import numpy as np +import tensorflow as tf +import tensorflow_datasets as tfds + +from tf2_yolov4.anchors import YOLOV4_ANCHORS +from tf2_yolov4.heads.yolov3_head import yolov3_boxes_regression +from tf2_yolov4.model import YOLOv4 + +YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + +INPUT_SHAPE = (608, 608, 3) +BATCH_SIZE = 1 +BOUNDING_BOXES_FIXED_NUMBER = 10 + + +def broadcast_iou(box_1, box_2): + # box_1: (..., (x1, y1, x2, y2)) + # box_2: (N, (x1, y1, x2, y2)) + + # broadcast boxes + box_1 = tf.expand_dims(box_1, -2) + box_2 = tf.expand_dims(box_2, 0) + # new_shape: (..., N, (x1, y1, x2, y2)) + new_shape = tf.broadcast_dynamic_shape(tf.shape(box_1), tf.shape(box_2)) + box_1 = tf.broadcast_to(box_1, new_shape) + box_2 = tf.broadcast_to(box_2, new_shape) + + int_w = tf.maximum(tf.minimum(box_1[..., 2], box_2[..., 2]) - + tf.maximum(box_1[..., 0], box_2[..., 0]), 0) + int_h = tf.maximum(tf.minimum(box_1[..., 3], box_2[..., 3]) - + tf.maximum(box_1[..., 1], box_2[..., 1]), 0) + int_area = int_w * int_h + box_1_area = (box_1[..., 2] - box_1[..., 0]) * (box_1[..., 3] - box_1[..., 1]) + box_2_area = (box_2[..., 2] - box_2[..., 0]) * (box_2[..., 3] - box_2[..., 1]) + return int_area / (box_1_area + box_2_area - int_area) + + +def YoloLoss(anchors, ignore_thresh=0.5): + def yolo_loss(y_true, y_pred): + # 1. transform all pred outputs + # y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls)) + pred_box, pred_obj, pred_class, pred_xywh = yolov3_boxes_regression(y_pred, anchors) + pred_xy = pred_xywh[..., 0:2] + pred_wh = pred_xywh[..., 2:4] + + # 2. 
transform all true outputs + # y_true: (batch_size, grid, grid, anchors, (x1, y1, x2, y2, obj, cls)) + true_box, true_obj, true_class_idx = tf.split( + y_true, (4, 1, 1), axis=-1) + true_xy = (true_box[..., 0:2] + true_box[..., 2:4]) / 2 + true_wh = true_box[..., 2:4] - true_box[..., 0:2] + + # give higher weights to small boxes + box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1] + + # 3. inverting the pred box equations + grid_size = tf.shape(y_true)[1] + grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size)) + grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2) + true_xy = true_xy * tf.cast(grid_size, tf.float32) - tf.cast(grid, tf.float32) + true_wh = tf.math.log(true_wh / anchors) + true_wh = tf.where(tf.math.is_inf(true_wh), + tf.zeros_like(true_wh), true_wh) + + # 4. calculate all masks + obj_mask = tf.squeeze(true_obj, -1) + # ignore false positive when iou is over threshold + best_iou = tf.map_fn( + lambda x: tf.reduce_max(broadcast_iou(x[0], tf.boolean_mask( + x[1], tf.cast(x[2], tf.bool))), axis=-1), + (pred_box, true_box, obj_mask), + tf.float32) + ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32) + + # 5. calculate all losses + xy_loss = obj_mask * box_loss_scale * tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1) + wh_loss = obj_mask * box_loss_scale * tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1) + obj_loss = tf.keras.losses.binary_crossentropy(true_obj, pred_obj) + obj_loss = obj_mask * obj_loss + (1 - obj_mask) * ignore_mask * obj_loss + # TODO: use binary_crossentropy instead + class_loss = obj_mask * tf.keras.losses.sparse_categorical_crossentropy(true_class_idx, pred_class) + + # 6. sum over (batch, gridx, gridy, anchors) => (batch, 1) + xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3)) + wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3)) + obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3)) + class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3)) + + return xy_loss + wh_loss + obj_loss + class_loss + return yolo_loss + + +@tf.function +def transform_targets_for_output(y_true, grid_size, anchor_idxs): + # y_true: (N, boxes, (x1, y1, x2, y2, class, best_anchor)) + N = tf.shape(y_true)[0] + + # y_true_out: (N, grid, grid, anchors, [x, y, w, h, obj, class]) + y_true_out = tf.zeros( + (N, grid_size, grid_size, tf.shape(anchor_idxs)[0], 6)) + + anchor_idxs = tf.cast(anchor_idxs, tf.int32) + + indexes = tf.TensorArray(tf.int32, 1, dynamic_size=True) + updates = tf.TensorArray(tf.float32, 1, dynamic_size=True) + idx = 0 + for i in tf.range(N): + for j in tf.range(tf.shape(y_true)[1]): + if tf.equal(y_true[i][j][2], 0): + continue + anchor_eq = tf.equal( + anchor_idxs, tf.cast(y_true[i][j][5], tf.int32)) + + if tf.reduce_any(anchor_eq): + box = y_true[i][j][0:4] + box_xy = (y_true[i][j][0:2] + y_true[i][j][2:4]) / 2 + + anchor_idx = tf.cast(tf.where(anchor_eq), tf.int32) + grid_xy = tf.cast(box_xy // (1/grid_size), tf.int32) + + # grid[y][x][anchor] = (tx, ty, bw, bh, obj, class) + indexes = indexes.write( + idx, [i, grid_xy[1], grid_xy[0], anchor_idx[0][0]]) + updates = updates.write( + idx, [box[0], box[1], box[2], box[3], 1, y_true[i][j][4]]) + idx += 1 + + return tf.tensor_scatter_nd_update(y_true_out, indexes.stack(), updates.stack()) + + +def transform_targets(y_train, anchors, anchor_masks, size): + y_outs = [] + grid_size = size // 32 + + # calculate anchor index for true boxes + anchors = tf.cast(anchors, tf.float32) + anchor_area = anchors[..., 0] * anchors[..., 1] + box_wh = y_train[..., 2:4] - y_train[..., 0:2] + box_wh = 
tf.tile(tf.expand_dims(box_wh, -2), + (1, 1, tf.shape(anchors)[0], 1)) + box_area = box_wh[..., 0] * box_wh[..., 1] + intersection = tf.minimum(box_wh[..., 0], anchors[..., 0]) * tf.minimum(box_wh[..., 1], anchors[..., 1]) + iou = intersection / (box_area + anchor_area - intersection) + anchor_idx = tf.cast(tf.argmax(iou, axis=-1), tf.float32) + anchor_idx = tf.expand_dims(anchor_idx, axis=-1) + + y_train = tf.concat([y_train, anchor_idx], axis=-1) + + for anchor_idxs in anchor_masks: + y_outs.append(transform_targets_for_output( + y_train, grid_size, anchor_idxs)) + grid_size *= 2 + + return tuple(y_outs) + + +def pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_number): + box_number = tf.shape(bounding_boxes)[0] + paddings = [[0, pad_number - box_number], [0, 0]] + + return tf.pad(bounding_boxes, paddings, constant_values=0.) + + +ds_train = tfds.load('voc', split='train', shuffle_files=True) +ds_train = ds_train.map(lambda el: (el["image"], el["objects"])) +ds_train = ds_train.map( + lambda image, object: ( + image, + tf.concat([object["bbox"], tf.expand_dims(tf.cast(object["label"], tf.float32), axis=-1)], axis=-1) + ) +) +ds_train = ds_train.map( + lambda image, bounding_boxes: ( + image, + pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_number=BOUNDING_BOXES_FIXED_NUMBER), + ) +) +ds_train = ds_train.batch(BATCH_SIZE) +ds_train = ds_train.map( + lambda image, bounding_box_with_class: ( + tf.image.resize(image, INPUT_SHAPE[:2]) / 255., + transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ + bounding_box_with_class, + np.concatenate(YOLOV4_ANCHORS, axis=0), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of + YOLOV4_ANCHORS_MASKS, + INPUT_SHAPE[0], # Assumes square input + ) + ) +) + +model = YOLOv4(input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=80, training=True) + +optimizer = tf.keras.optimizers.Adam(lr=1e-4) +loss = [YoloLoss(np.concatenate(YOLOV4_ANCHORS, axis=0)[mask]) for mask in YOLOV4_ANCHORS_MASKS] + +model.compile(optimizer=optimizer, loss=loss) + +history = model.fit(ds_train, epochs=2) From 399d7659f16a9295140ae909a9a6c6c320b20d5a Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sun, 31 May 2020 14:28:21 +0200 Subject: [PATCH 03/20] Black --- scripts/train.py | 74 ++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 65070e2..8c65ab3 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -28,10 +28,16 @@ def broadcast_iou(box_1, box_2): box_1 = tf.broadcast_to(box_1, new_shape) box_2 = tf.broadcast_to(box_2, new_shape) - int_w = tf.maximum(tf.minimum(box_1[..., 2], box_2[..., 2]) - - tf.maximum(box_1[..., 0], box_2[..., 0]), 0) - int_h = tf.maximum(tf.minimum(box_1[..., 3], box_2[..., 3]) - - tf.maximum(box_1[..., 1], box_2[..., 1]), 0) + int_w = tf.maximum( + tf.minimum(box_1[..., 2], box_2[..., 2]) + - tf.maximum(box_1[..., 0], box_2[..., 0]), + 0, + ) + int_h = tf.maximum( + tf.minimum(box_1[..., 3], box_2[..., 3]) + - tf.maximum(box_1[..., 1], box_2[..., 1]), + 0, + ) int_area = int_w * int_h box_1_area = (box_1[..., 2] - box_1[..., 0]) * (box_1[..., 3] - box_1[..., 1]) box_2_area = (box_2[..., 2] - box_2[..., 0]) * (box_2[..., 3] - box_2[..., 1]) @@ -42,14 +48,15 @@ def YoloLoss(anchors, ignore_thresh=0.5): def yolo_loss(y_true, y_pred): # 1. 
transform all pred outputs # y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls)) - pred_box, pred_obj, pred_class, pred_xywh = yolov3_boxes_regression(y_pred, anchors) + pred_box, pred_obj, pred_class, pred_xywh = yolov3_boxes_regression( + y_pred, anchors + ) pred_xy = pred_xywh[..., 0:2] pred_wh = pred_xywh[..., 2:4] # 2. transform all true outputs # y_true: (batch_size, grid, grid, anchors, (x1, y1, x2, y2, obj, cls)) - true_box, true_obj, true_class_idx = tf.split( - y_true, (4, 1, 1), axis=-1) + true_box, true_obj, true_class_idx = tf.split(y_true, (4, 1, 1), axis=-1) true_xy = (true_box[..., 0:2] + true_box[..., 2:4]) / 2 true_wh = true_box[..., 2:4] - true_box[..., 0:2] @@ -62,26 +69,38 @@ def yolo_loss(y_true, y_pred): grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2) true_xy = true_xy * tf.cast(grid_size, tf.float32) - tf.cast(grid, tf.float32) true_wh = tf.math.log(true_wh / anchors) - true_wh = tf.where(tf.math.is_inf(true_wh), - tf.zeros_like(true_wh), true_wh) + true_wh = tf.where(tf.math.is_inf(true_wh), tf.zeros_like(true_wh), true_wh) # 4. calculate all masks obj_mask = tf.squeeze(true_obj, -1) # ignore false positive when iou is over threshold best_iou = tf.map_fn( - lambda x: tf.reduce_max(broadcast_iou(x[0], tf.boolean_mask( - x[1], tf.cast(x[2], tf.bool))), axis=-1), + lambda x: tf.reduce_max( + broadcast_iou(x[0], tf.boolean_mask(x[1], tf.cast(x[2], tf.bool))), + axis=-1, + ), (pred_box, true_box, obj_mask), - tf.float32) + tf.float32, + ) ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32) # 5. calculate all losses - xy_loss = obj_mask * box_loss_scale * tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1) - wh_loss = obj_mask * box_loss_scale * tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1) + xy_loss = ( + obj_mask + * box_loss_scale + * tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1) + ) + wh_loss = ( + obj_mask + * box_loss_scale + * tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1) + ) obj_loss = tf.keras.losses.binary_crossentropy(true_obj, pred_obj) obj_loss = obj_mask * obj_loss + (1 - obj_mask) * ignore_mask * obj_loss # TODO: use binary_crossentropy instead - class_loss = obj_mask * tf.keras.losses.sparse_categorical_crossentropy(true_class_idx, pred_class) + class_loss = obj_mask * tf.keras.losses.sparse_categorical_crossentropy( + true_class_idx, pred_class + ) # 6. 
sum over (batch, gridx, gridy, anchors) => (batch, 1) xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3)) @@ -90,6 +109,7 @@ def yolo_loss(y_true, y_pred): class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3)) return xy_loss + wh_loss + obj_loss + class_loss + return yolo_loss @@ -99,8 +119,7 @@ def transform_targets_for_output(y_true, grid_size, anchor_idxs): N = tf.shape(y_true)[0] # y_true_out: (N, grid, grid, anchors, [x, y, w, h, obj, class]) - y_true_out = tf.zeros( - (N, grid_size, grid_size, tf.shape(anchor_idxs)[0], 6)) + y_true_out = tf.zeros((N, grid_size, grid_size, tf.shape(anchor_idxs)[0], 6)) anchor_idxs = tf.cast(anchor_idxs, tf.int32) @@ -111,21 +130,22 @@ def transform_targets_for_output(y_true, grid_size, anchor_idxs): for j in tf.range(tf.shape(y_true)[1]): if tf.equal(y_true[i][j][2], 0): continue - anchor_eq = tf.equal( - anchor_idxs, tf.cast(y_true[i][j][5], tf.int32)) + anchor_eq = tf.equal(anchor_idxs, tf.cast(y_true[i][j][5], tf.int32)) if tf.reduce_any(anchor_eq): box = y_true[i][j][0:4] box_xy = (y_true[i][j][0:2] + y_true[i][j][2:4]) / 2 anchor_idx = tf.cast(tf.where(anchor_eq), tf.int32) - grid_xy = tf.cast(box_xy // (1/grid_size), tf.int32) + grid_xy = tf.cast(box_xy // (1 / grid_size), tf.int32) # grid[y][x][anchor] = (tx, ty, bw, bh, obj, class) indexes = indexes.write( - idx, [i, grid_xy[1], grid_xy[0], anchor_idx[0][0]]) + idx, [i, grid_xy[1], grid_xy[0], anchor_idx[0][0]] + ) updates = updates.write( - idx, [box[0], box[1], box[2], box[3], 1, y_true[i][j][4]]) + idx, [box[0], box[1], box[2], box[3], 1, y_true[i][j][4]] + ) idx += 1 return tf.tensor_scatter_nd_update(y_true_out, indexes.stack(), updates.stack()) @@ -139,10 +159,11 @@ def transform_targets(y_train, anchors, anchor_masks, size): anchors = tf.cast(anchors, tf.float32) anchor_area = anchors[..., 0] * anchors[..., 1] box_wh = y_train[..., 2:4] - y_train[..., 0:2] - box_wh = tf.tile(tf.expand_dims(box_wh, -2), - (1, 1, tf.shape(anchors)[0], 1)) + box_wh = tf.tile(tf.expand_dims(box_wh, -2), (1, 1, tf.shape(anchors)[0], 1)) box_area = box_wh[..., 0] * box_wh[..., 1] - intersection = tf.minimum(box_wh[..., 0], anchors[..., 0]) * tf.minimum(box_wh[..., 1], anchors[..., 1]) + intersection = tf.minimum(box_wh[..., 0], anchors[..., 0]) * tf.minimum( + box_wh[..., 1], anchors[..., 1] + ) iou = intersection / (box_area + anchor_area - intersection) anchor_idx = tf.cast(tf.argmax(iou, axis=-1), tf.float32) anchor_idx = tf.expand_dims(anchor_idx, axis=-1) @@ -150,8 +171,7 @@ def transform_targets(y_train, anchors, anchor_masks, size): y_train = tf.concat([y_train, anchor_idx], axis=-1) for anchor_idxs in anchor_masks: - y_outs.append(transform_targets_for_output( - y_train, grid_size, anchor_idxs)) + y_outs.append(transform_targets_for_output(y_train, grid_size, anchor_idxs)) grid_size *= 2 return tuple(y_outs) From 88e235e58e1931b0de5e74946541ad342ac31b7a Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sun, 31 May 2020 14:28:36 +0200 Subject: [PATCH 04/20] Add callbacks and validation data --- scripts/train.py | 90 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 8c65ab3..1fc9e4d 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -181,41 +181,75 @@ def pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_num box_number = tf.shape(bounding_boxes)[0] paddings = [[0, pad_number - box_number], [0, 0]] - return tf.pad(bounding_boxes, paddings, constant_values=0.) 
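# A minimal illustrative sketch of what the padding helper above does, with
# made-up rows of four box coordinates plus a class id: three boxes are padded
# with all-zero rows up to the requested fixed size.
example_boxes = tf.constant(
    [[0.1, 0.2, 0.4, 0.5, 7.0],
     [0.3, 0.1, 0.9, 0.8, 11.0],
     [0.0, 0.0, 0.2, 0.2, 3.0]]
)
padded_boxes = pad_bounding_boxes_to_fixed_number_of_bounding_boxes(
    example_boxes, pad_number=10
)
assert padded_boxes.shape == (10, 5)  # 3 real boxes followed by 7 zero rows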
- - -ds_train = tfds.load('voc', split='train', shuffle_files=True) -ds_train = ds_train.map(lambda el: (el["image"], el["objects"])) -ds_train = ds_train.map( - lambda image, object: ( - image, - tf.concat([object["bbox"], tf.expand_dims(tf.cast(object["label"], tf.float32), axis=-1)], axis=-1) + return tf.pad(bounding_boxes, paddings, constant_values=0.0) + + +def prepare_dataset(dataset): + dataset = dataset.map(lambda el: (el["image"], el["objects"])) + dataset = dataset.map( + lambda image, object: ( + image, + tf.concat( + [ + object["bbox"], + tf.expand_dims(tf.cast(object["label"], tf.float32), axis=-1), + ], + axis=-1, + ), + ) ) -) -ds_train = ds_train.map( - lambda image, bounding_boxes: ( - image, - pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_number=BOUNDING_BOXES_FIXED_NUMBER), + dataset = dataset.map( + lambda image, bounding_boxes: ( + image, + pad_bounding_boxes_to_fixed_number_of_bounding_boxes( + bounding_boxes, pad_number=BOUNDING_BOXES_FIXED_NUMBER + ), + ) ) -) -ds_train = ds_train.batch(BATCH_SIZE) -ds_train = ds_train.map( - lambda image, bounding_box_with_class: ( - tf.image.resize(image, INPUT_SHAPE[:2]) / 255., - transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ - bounding_box_with_class, - np.concatenate(YOLOV4_ANCHORS, axis=0), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of - YOLOV4_ANCHORS_MASKS, - INPUT_SHAPE[0], # Assumes square input + dataset = dataset.batch(BATCH_SIZE) + dataset = dataset.map( + lambda image, bounding_box_with_class: ( + tf.image.resize(image, INPUT_SHAPE[:2]) / 255.0, + transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ + bounding_box_with_class, + np.concatenate( + YOLOV4_ANCHORS, axis=0 + ), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of anchors + YOLOV4_ANCHORS_MASKS, + INPUT_SHAPE[0], # Assumes square input + ), ) ) -) -model = YOLOv4(input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=80, training=True) + return dataset + + +voc_dataset = tfds.load("voc", shuffle_files=True) +ds_train, ds_test = voc_dataset["train"], voc_dataset["test"] +ds_train = prepare_dataset(ds_train) +ds_test = prepare_dataset(ds_test) + +model = YOLOv4( + input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=80, training=True +) optimizer = tf.keras.optimizers.Adam(lr=1e-4) -loss = [YoloLoss(np.concatenate(YOLOV4_ANCHORS, axis=0)[mask]) for mask in YOLOV4_ANCHORS_MASKS] +loss = [ + YoloLoss(np.concatenate(YOLOV4_ANCHORS, axis=0)[mask]) + for mask in YOLOV4_ANCHORS_MASKS +] model.compile(optimizer=optimizer, loss=loss) -history = model.fit(ds_train, epochs=2) +history = model.fit( + ds_train, + validation_data=ds_test, + validation_steps=100, + epochs=2, + callbacks=[ + tf.keras.callbacks.TensorBoard(log_dir="./logs"), + tf.keras.callbacks.ModelCheckpoint( + "yolov4_best.h5", save_best_only=True, save_weights_only=True + ), + ], +) From 3aced1e0ed62a4afe3728079f3c419edeb1f0a71 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sun, 31 May 2020 16:03:02 +0200 Subject: [PATCH 05/20] Update training script --- scripts/train.py | 35 ++++++++++++++++++++++++----------- tests/test_model.py | 2 +- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 1fc9e4d..e9d834a 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -12,8 +12,9 @@ YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] INPUT_SHAPE = (608, 608, 3) -BATCH_SIZE = 1 -BOUNDING_BOXES_FIXED_NUMBER = 10 
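# Sketch of how YOLOV4_ANCHORS_MASKS (defined a few lines above) is used later
# in this script: the three anchor groups are flattened into one array and each
# mask picks the three anchors of one output stage, which is why model.compile
# receives a list of three YoloLoss instances. The (9, 2) shape is an
# assumption (three groups of three (width, height) anchors).
flat_anchors = np.concatenate(YOLOV4_ANCHORS, axis=0)          # expected (9, 2)
coarse_stage_anchors = flat_anchors[YOLOV4_ANCHORS_MASKS[0]]   # mask [6, 7, 8]
fine_stage_anchors = flat_anchors[YOLOV4_ANCHORS_MASKS[2]]     # mask [0, 1, 2]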
+BATCH_SIZE = 16 +BOUNDING_BOXES_FIXED_NUMBER = 50 +PASCAL_VOC_NUM_CLASSES = 20 def broadcast_iou(box_1, box_2): @@ -184,7 +185,7 @@ def pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_num return tf.pad(bounding_boxes, paddings, constant_values=0.0) -def prepare_dataset(dataset): +def prepare_dataset(dataset, shuffle=True): dataset = dataset.map(lambda el: (el["image"], el["objects"])) dataset = dataset.map( lambda image, object: ( @@ -196,7 +197,8 @@ def prepare_dataset(dataset): ], axis=-1, ), - ) + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, ) dataset = dataset.map( lambda image, bounding_boxes: ( @@ -204,8 +206,18 @@ def prepare_dataset(dataset): pad_bounding_boxes_to_fixed_number_of_bounding_boxes( bounding_boxes, pad_number=BOUNDING_BOXES_FIXED_NUMBER ), - ) + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + dataset = dataset.map( + lambda image, bounding_box: ( + tf.image.resize(image, INPUT_SHAPE[:2]) / 255.0, + bounding_box, + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, ) + if shuffle: + dataset = dataset.shuffle(buffer_size=1000) dataset = dataset.batch(BATCH_SIZE) dataset = dataset.map( lambda image, bounding_box_with_class: ( @@ -218,7 +230,8 @@ def prepare_dataset(dataset): YOLOV4_ANCHORS_MASKS, INPUT_SHAPE[0], # Assumes square input ), - ) + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, ) return dataset @@ -226,11 +239,11 @@ def prepare_dataset(dataset): voc_dataset = tfds.load("voc", shuffle_files=True) ds_train, ds_test = voc_dataset["train"], voc_dataset["test"] -ds_train = prepare_dataset(ds_train) -ds_test = prepare_dataset(ds_test) +ds_train = prepare_dataset(ds_train, shuffle=True) +ds_test = prepare_dataset(ds_test, shuffle=False) model = YOLOv4( - input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=80, training=True + input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=PASCAL_VOC_NUM_CLASSES, training=True ) optimizer = tf.keras.optimizers.Adam(lr=1e-4) @@ -244,8 +257,8 @@ def prepare_dataset(dataset): history = model.fit( ds_train, validation_data=ds_test, - validation_steps=100, - epochs=2, + validation_steps=10, + epochs=100, callbacks=[ tf.keras.callbacks.TensorBoard(log_dir="./logs"), tf.keras.callbacks.ModelCheckpoint( diff --git a/tests/test_model.py b/tests/test_model.py index 945a86a..9e6be5c 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -37,7 +37,7 @@ def test_model_should_predict_valid_shapes_at_inference( @pytest.mark.parametrize("input_shape", [(32, 33, 3), (33, 32, 3)]) def test_model_instanciation_should_fail_with_input_shapes_not_multiple_of_32( - input_shape + input_shape, ): with pytest.raises(ValueError): YOLOv4(input_shape, 80, []) From 3c3921245a63fdace9faa9c11012d6440c3d0b27 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Wed, 3 Jun 2020 18:08:29 +0200 Subject: [PATCH 06/20] Progressive training with frozen layers --- scripts/train.py | 60 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index e9d834a..c62a48f 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,6 +1,9 @@ """ Training script for Pascal VOC using tf2-yolov4 """ +from datetime import datetime +from pathlib import Path + import numpy as np import tensorflow as tf import tensorflow_datasets as tfds @@ -12,10 +15,12 @@ YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] INPUT_SHAPE = (608, 608, 3) -BATCH_SIZE = 16 +BATCH_SIZE = 8 BOUNDING_BOXES_FIXED_NUMBER = 50 
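# Hypothetical helper (name and usage are an illustrative assumption) to
# sanity-check the freeze/unfreeze phases added further down: the
# trainable-parameter count should shrink once the backbone and neck are
# frozen and grow back when they are unfrozen and the model is recompiled.
def count_trainable_params(keras_model):
    return int(
        sum(tf.keras.backend.count_params(w) for w in keras_model.trainable_weights)
    )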
PASCAL_VOC_NUM_CLASSES = 20 +LOG_DIR = Path("./logs") / datetime.now().strftime("%m-%d-%Y %H:%M:%S") + def broadcast_iou(box_1, box_2): # box_1: (..., (x1, y1, x2, y2)) @@ -245,24 +250,67 @@ def prepare_dataset(dataset, shuffle=True): model = YOLOv4( input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=PASCAL_VOC_NUM_CLASSES, training=True ) +darknet_weights = Path("./yolov4.h5") +if darknet_weights.exists(): + model.load_weights(str(darknet_weights), by_name=True, skip_mismatch=True) + print("Darknet weights loaded.") -optimizer = tf.keras.optimizers.Adam(lr=1e-4) +optimizer = tf.keras.optimizers.Adam(1e-4) loss = [ YoloLoss(np.concatenate(YOLOV4_ANCHORS, axis=0)[mask]) for mask in YOLOV4_ANCHORS_MASKS ] +model.summary() +# Start training: 5 epochs with backbone + neck frozen +ALL_FROZEN_EPOCH_NUMBER = 10 +for layer in model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers: + layer.trainable = False +model.compile(optimizer=optimizer, loss=loss) +history = model.fit( + ds_train, + validation_data=ds_test, + validation_steps=10, + epochs=ALL_FROZEN_EPOCH_NUMBER, + callbacks=[ + tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), + tf.keras.callbacks.ModelCheckpoint( + "yolov4_all_frozen.h5", save_best_only=True, save_weights_only=True + ), + ], +) +# Keep training: 10 epochs with backbone frozen -- unfreeze neck +BACKBONE_FROZEN_EPOCH_NUMBER = 10 +for layer in model.get_layer("YOLOv4_neck").layers: + layer.trainable = False +model.compile(optimizer=optimizer, loss=loss) +history = model.fit( + ds_train, + validation_data=ds_test, + validation_steps=10, + epochs=BACKBONE_FROZEN_EPOCH_NUMBER + ALL_FROZEN_EPOCH_NUMBER, + initial_epoch=ALL_FROZEN_EPOCH_NUMBER, + callbacks=[ + tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), + tf.keras.callbacks.ModelCheckpoint( + "yolov4_backbone_frozen.h5", save_best_only=True, save_weights_only=True, verbose=True, + ), + ], +) +# Final training: 35 epochs with all weights unfrozen +for layer in model.get_layer("CSPDarknet53").layers: + layer.trainable = True model.compile(optimizer=optimizer, loss=loss) - history = model.fit( ds_train, validation_data=ds_test, validation_steps=10, - epochs=100, + epochs=50, + initial_epoch=ALL_FROZEN_EPOCH_NUMBER + BACKBONE_FROZEN_EPOCH_NUMBER, callbacks=[ - tf.keras.callbacks.TensorBoard(log_dir="./logs"), + tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( - "yolov4_best.h5", save_best_only=True, save_weights_only=True + "yolov4_full.h5", save_best_only=True, save_weights_only=True ), ], ) From b4afbc38671e8e7fc61d501f6c1497a3ef433eca Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Wed, 3 Jun 2020 18:14:49 +0200 Subject: [PATCH 07/20] Stop resizing and dividing by 255. 
twice --- scripts/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train.py b/scripts/train.py index c62a48f..a8ac34f 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -226,7 +226,7 @@ def prepare_dataset(dataset, shuffle=True): dataset = dataset.batch(BATCH_SIZE) dataset = dataset.map( lambda image, bounding_box_with_class: ( - tf.image.resize(image, INPUT_SHAPE[:2]) / 255.0, + image, transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ bounding_box_with_class, np.concatenate( From 1ebef2a42a32438e8281e5c5bed32f527f16c64e Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Wed, 3 Jun 2020 18:25:15 +0200 Subject: [PATCH 08/20] Script executed if __main__ --- scripts/train.py | 145 ++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 72 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index a8ac34f..db9f95c 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -242,75 +242,76 @@ def prepare_dataset(dataset, shuffle=True): return dataset -voc_dataset = tfds.load("voc", shuffle_files=True) -ds_train, ds_test = voc_dataset["train"], voc_dataset["test"] -ds_train = prepare_dataset(ds_train, shuffle=True) -ds_test = prepare_dataset(ds_test, shuffle=False) - -model = YOLOv4( - input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=PASCAL_VOC_NUM_CLASSES, training=True -) -darknet_weights = Path("./yolov4.h5") -if darknet_weights.exists(): - model.load_weights(str(darknet_weights), by_name=True, skip_mismatch=True) - print("Darknet weights loaded.") - -optimizer = tf.keras.optimizers.Adam(1e-4) -loss = [ - YoloLoss(np.concatenate(YOLOV4_ANCHORS, axis=0)[mask]) - for mask in YOLOV4_ANCHORS_MASKS -] - -model.summary() -# Start training: 5 epochs with backbone + neck frozen -ALL_FROZEN_EPOCH_NUMBER = 10 -for layer in model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers: - layer.trainable = False -model.compile(optimizer=optimizer, loss=loss) -history = model.fit( - ds_train, - validation_data=ds_test, - validation_steps=10, - epochs=ALL_FROZEN_EPOCH_NUMBER, - callbacks=[ - tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), - tf.keras.callbacks.ModelCheckpoint( - "yolov4_all_frozen.h5", save_best_only=True, save_weights_only=True - ), - ], -) -# Keep training: 10 epochs with backbone frozen -- unfreeze neck -BACKBONE_FROZEN_EPOCH_NUMBER = 10 -for layer in model.get_layer("YOLOv4_neck").layers: - layer.trainable = False -model.compile(optimizer=optimizer, loss=loss) -history = model.fit( - ds_train, - validation_data=ds_test, - validation_steps=10, - epochs=BACKBONE_FROZEN_EPOCH_NUMBER + ALL_FROZEN_EPOCH_NUMBER, - initial_epoch=ALL_FROZEN_EPOCH_NUMBER, - callbacks=[ - tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), - tf.keras.callbacks.ModelCheckpoint( - "yolov4_backbone_frozen.h5", save_best_only=True, save_weights_only=True, verbose=True, - ), - ], -) -# Final training: 35 epochs with all weights unfrozen -for layer in model.get_layer("CSPDarknet53").layers: - layer.trainable = True -model.compile(optimizer=optimizer, loss=loss) -history = model.fit( - ds_train, - validation_data=ds_test, - validation_steps=10, - epochs=50, - initial_epoch=ALL_FROZEN_EPOCH_NUMBER + BACKBONE_FROZEN_EPOCH_NUMBER, - callbacks=[ - tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), - tf.keras.callbacks.ModelCheckpoint( - "yolov4_full.h5", save_best_only=True, save_weights_only=True - ), - ], -) +if __name__ == "__main__": + voc_dataset = tfds.load("voc", shuffle_files=True) + 
ds_train, ds_test = voc_dataset["train"], voc_dataset["test"] + ds_train = prepare_dataset(ds_train, shuffle=True) + ds_test = prepare_dataset(ds_test, shuffle=False) + + model = YOLOv4( + input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, num_classes=PASCAL_VOC_NUM_CLASSES, training=True + ) + darknet_weights = Path("./yolov4.h5") + if darknet_weights.exists(): + model.load_weights(str(darknet_weights), by_name=True, skip_mismatch=True) + print("Darknet weights loaded.") + + optimizer = tf.keras.optimizers.Adam(1e-4) + loss = [ + YoloLoss(np.concatenate(YOLOV4_ANCHORS, axis=0)[mask]) + for mask in YOLOV4_ANCHORS_MASKS + ] + + model.summary() + # Start training: 5 epochs with backbone + neck frozen + ALL_FROZEN_EPOCH_NUMBER = 10 + for layer in model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers: + layer.trainable = False + model.compile(optimizer=optimizer, loss=loss) + history = model.fit( + ds_train, + validation_data=ds_test, + validation_steps=10, + epochs=ALL_FROZEN_EPOCH_NUMBER, + callbacks=[ + tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), + tf.keras.callbacks.ModelCheckpoint( + "yolov4_all_frozen.h5", save_best_only=True, save_weights_only=True + ), + ], + ) + # Keep training: 10 epochs with backbone frozen -- unfreeze neck + BACKBONE_FROZEN_EPOCH_NUMBER = 10 + for layer in model.get_layer("YOLOv4_neck").layers: + layer.trainable = False + model.compile(optimizer=optimizer, loss=loss) + history = model.fit( + ds_train, + validation_data=ds_test, + validation_steps=10, + epochs=BACKBONE_FROZEN_EPOCH_NUMBER + ALL_FROZEN_EPOCH_NUMBER, + initial_epoch=ALL_FROZEN_EPOCH_NUMBER, + callbacks=[ + tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), + tf.keras.callbacks.ModelCheckpoint( + "yolov4_backbone_frozen.h5", save_best_only=True, save_weights_only=True, verbose=True, + ), + ], + ) + # Final training: 35 epochs with all weights unfrozen + for layer in model.get_layer("CSPDarknet53").layers: + layer.trainable = True + model.compile(optimizer=optimizer, loss=loss) + history = model.fit( + ds_train, + validation_data=ds_test, + validation_steps=10, + epochs=50, + initial_epoch=ALL_FROZEN_EPOCH_NUMBER + BACKBONE_FROZEN_EPOCH_NUMBER, + callbacks=[ + tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), + tf.keras.callbacks.ModelCheckpoint( + "yolov4_full.h5", save_best_only=True, save_weights_only=True + ), + ], + ) From d3184bb5c92803a9cbae5d535aff5ee4d0ef5416 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sat, 6 Jun 2020 10:38:05 +0200 Subject: [PATCH 09/20] Change bbox order to match what YOLO expects --- scripts/train.py | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index db9f95c..1e747fc 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -21,6 +21,10 @@ LOG_DIR = Path("./logs") / datetime.now().strftime("%m-%d-%Y %H:%M:%S") +ALL_FROZEN_EPOCH_NUMBER = 10 +BACKBONE_FROZEN_EPOCH_NUMBER = 10 +TOTAL_NUMBER_OF_EPOCHS = 50 + def broadcast_iou(box_1, box_2): # box_1: (..., (x1, y1, x2, y2)) @@ -197,7 +201,15 @@ def prepare_dataset(dataset, shuffle=True): image, tf.concat( [ - object["bbox"], + tf.stack( + [ + object["bbox"][:, 1], + object["bbox"][:, 0], + object["bbox"][:, 3], + object["bbox"][:, 2], + ], + axis=-1, + ), tf.expand_dims(tf.cast(object["label"], tf.float32), axis=-1), ], axis=-1, @@ -249,7 +261,10 @@ def prepare_dataset(dataset, shuffle=True): ds_test = prepare_dataset(ds_test, shuffle=False) model = YOLOv4( - input_shape=INPUT_SHAPE, 
anchors=YOLOV4_ANCHORS, num_classes=PASCAL_VOC_NUM_CLASSES, training=True + input_shape=INPUT_SHAPE, + anchors=YOLOV4_ANCHORS, + num_classes=PASCAL_VOC_NUM_CLASSES, + training=True, ) darknet_weights = Path("./yolov4.h5") if darknet_weights.exists(): @@ -264,8 +279,9 @@ def prepare_dataset(dataset, shuffle=True): model.summary() # Start training: 5 epochs with backbone + neck frozen - ALL_FROZEN_EPOCH_NUMBER = 10 - for layer in model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers: + for layer in ( + model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers + ): layer.trainable = False model.compile(optimizer=optimizer, loss=loss) history = model.fit( @@ -281,9 +297,8 @@ def prepare_dataset(dataset, shuffle=True): ], ) # Keep training: 10 epochs with backbone frozen -- unfreeze neck - BACKBONE_FROZEN_EPOCH_NUMBER = 10 for layer in model.get_layer("YOLOv4_neck").layers: - layer.trainable = False + layer.trainable = True model.compile(optimizer=optimizer, loss=loss) history = model.fit( ds_train, @@ -294,7 +309,10 @@ def prepare_dataset(dataset, shuffle=True): callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( - "yolov4_backbone_frozen.h5", save_best_only=True, save_weights_only=True, verbose=True, + "yolov4_backbone_frozen.h5", + save_best_only=True, + save_weights_only=True, + verbose=True, ), ], ) @@ -306,12 +324,18 @@ def prepare_dataset(dataset, shuffle=True): ds_train, validation_data=ds_test, validation_steps=10, - epochs=50, + epochs=TOTAL_NUMBER_OF_EPOCHS, initial_epoch=ALL_FROZEN_EPOCH_NUMBER + BACKBONE_FROZEN_EPOCH_NUMBER, callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( "yolov4_full.h5", save_best_only=True, save_weights_only=True ), + tf.keras.callbacks.ModelCheckpoint( + "yolov4_train_loss.h5", + save_best_only=True, + save_weights_only=True, + monitor="loss", + ), ], ) From 0da167ea273f257d1d3adbe2eeb6f9c9618df8ac Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sat, 6 Jun 2020 10:46:54 +0200 Subject: [PATCH 10/20] Try reversing the anchors --- scripts/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 1e747fc..44ae69f 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -242,7 +242,7 @@ def prepare_dataset(dataset, shuffle=True): transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ bounding_box_with_class, np.concatenate( - YOLOV4_ANCHORS, axis=0 + list(reversed(YOLOV4_ANCHORS)), axis=0 ), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of anchors YOLOV4_ANCHORS_MASKS, INPUT_SHAPE[0], # Assumes square input @@ -273,7 +273,7 @@ def prepare_dataset(dataset, shuffle=True): optimizer = tf.keras.optimizers.Adam(1e-4) loss = [ - YoloLoss(np.concatenate(YOLOV4_ANCHORS, axis=0)[mask]) + YoloLoss(np.concatenate(list(reversed(YOLOV4_ANCHORS)), axis=0)[mask]) for mask in YOLOV4_ANCHORS_MASKS ] From b7436abec528a847c37fd0dd289920a2f034fc01 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sat, 6 Jun 2020 15:04:35 +0200 Subject: [PATCH 11/20] Normalized anchors --- scripts/train.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 44ae69f..ed3e0f5 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -8,17 +8,18 @@ import tensorflow as tf import tensorflow_datasets as tfds -from tf2_yolov4.anchors import YOLOV4_ANCHORS +from tf2_yolov4.anchors import 
YOLOV4_ANCHORS, compute_normalized_anchors from tf2_yolov4.heads.yolov3_head import yolov3_boxes_regression from tf2_yolov4.model import YOLOv4 -YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] - INPUT_SHAPE = (608, 608, 3) BATCH_SIZE = 8 BOUNDING_BOXES_FIXED_NUMBER = 50 PASCAL_VOC_NUM_CLASSES = 20 +YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] +YOLOV4_ANCHORS_NORMALIZED = compute_normalized_anchors(YOLOV4_ANCHORS, INPUT_SHAPE) + LOG_DIR = Path("./logs") / datetime.now().strftime("%m-%d-%Y %H:%M:%S") ALL_FROZEN_EPOCH_NUMBER = 10 @@ -242,7 +243,7 @@ def prepare_dataset(dataset, shuffle=True): transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ bounding_box_with_class, np.concatenate( - list(reversed(YOLOV4_ANCHORS)), axis=0 + list(reversed(YOLOV4_ANCHORS_NORMALIZED)), axis=0 ), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of anchors YOLOV4_ANCHORS_MASKS, INPUT_SHAPE[0], # Assumes square input @@ -273,7 +274,9 @@ def prepare_dataset(dataset, shuffle=True): optimizer = tf.keras.optimizers.Adam(1e-4) loss = [ - YoloLoss(np.concatenate(list(reversed(YOLOV4_ANCHORS)), axis=0)[mask]) + YoloLoss( + np.concatenate(list(reversed(YOLOV4_ANCHORS_NORMALIZED)), axis=0)[mask] + ) for mask in YOLOV4_ANCHORS_MASKS ] From b6f362092794e3491904a5ed7e4fce700d21524c Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sat, 6 Jun 2020 15:04:48 +0200 Subject: [PATCH 12/20] Save models.h5 inside log dir --- scripts/train.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index ed3e0f5..6a3190e 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -295,7 +295,7 @@ def prepare_dataset(dataset, shuffle=True): callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( - "yolov4_all_frozen.h5", save_best_only=True, save_weights_only=True + str(LOG_DIR / "yolov4_all_frozen.h5"), save_best_only=True, save_weights_only=True ), ], ) @@ -312,7 +312,7 @@ def prepare_dataset(dataset, shuffle=True): callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( - "yolov4_backbone_frozen.h5", + str(LOG_DIR / "yolov4_backbone_frozen.h5"), save_best_only=True, save_weights_only=True, verbose=True, @@ -332,10 +332,12 @@ def prepare_dataset(dataset, shuffle=True): callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( - "yolov4_full.h5", save_best_only=True, save_weights_only=True + str(LOG_DIR / "yolov4_full.h5"), + save_best_only=True, + save_weights_only=True, ), tf.keras.callbacks.ModelCheckpoint( - "yolov4_train_loss.h5", + str(LOG_DIR / "yolov4_train_loss.h5"), save_best_only=True, save_weights_only=True, monitor="loss", From 41b9370433a232688bc24a09fc87816b75c53a85 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sat, 6 Jun 2020 15:23:19 +0200 Subject: [PATCH 13/20] Add test script to plot box, run black --- scripts/test.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ scripts/train.py | 4 +- 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 scripts/test.py diff --git a/scripts/test.py b/scripts/test.py new file mode 100644 index 0000000..a3b80ec --- /dev/null +++ b/scripts/test.py @@ -0,0 +1,95 @@ +import matplotlib.pyplot as plt +import tensorflow as tf + +from tf2_yolov4.anchors import YOLOV4_ANCHORS +from tf2_yolov4.model import YOLOv4 + +HEIGHT, WIDTH = (608, 608) + +image = tf.io.read_file("../notebooks/images/cars.jpg") +image = 
tf.image.decode_image(image) +image = tf.image.resize(image, (HEIGHT, WIDTH)) +images = tf.expand_dims(image, axis=0) / 255.0 + +model = YOLOv4( + input_shape=(HEIGHT, WIDTH, 3), + anchors=YOLOV4_ANCHORS, + num_classes=20, + training=False, + yolo_max_boxes=100, + yolo_iou_threshold=0.5, + yolo_score_threshold=0.5, +) + +model.load_weights("../yolov4_full.h5") +model.summary() + +boxes, scores, classes, valid_detections = model.predict(images) + +CLASSES = [ + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +] + +# colors for visualization +COLORS = [ + [0.000, 0.447, 0.741], + [0.850, 0.325, 0.098], + [0.929, 0.694, 0.125], + [0.494, 0.184, 0.556], + [0.466, 0.674, 0.188], + [0.301, 0.745, 0.933], +] + + +def plot_results(pil_img, boxes, scores, classes): + plt.figure(figsize=(16, 10)) + plt.imshow(pil_img) + ax = plt.gca() + + for (xmin, ymin, xmax, ymax), score, cl in zip( + boxes.tolist(), scores.tolist(), classes.tolist() + ): + if score > 0: + ax.add_patch( + plt.Rectangle( + (xmin, ymin), + xmax - xmin, + ymax - ymin, + fill=False, + color=COLORS[cl % 6], + linewidth=3, + ) + ) + text = f"{CLASSES[cl]}: {score:0.2f}" + ax.text( + xmin, ymin, text, fontsize=15, bbox=dict(facecolor="yellow", alpha=0.5) + ) + plt.axis("off") + plt.show() + + +plot_results( + images[0], + boxes[0] * [WIDTH, HEIGHT, WIDTH, HEIGHT], + scores[0], + classes[0].astype(int), +) diff --git a/scripts/train.py b/scripts/train.py index 6a3190e..71ace58 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -295,7 +295,9 @@ def prepare_dataset(dataset, shuffle=True): callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( - str(LOG_DIR / "yolov4_all_frozen.h5"), save_best_only=True, save_weights_only=True + str(LOG_DIR / "yolov4_all_frozen.h5"), + save_best_only=True, + save_weights_only=True, ), ], ) From ab7762893ffe1ca221423ecd887f9e87487c7912 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Fri, 12 Jun 2020 11:13:42 +0200 Subject: [PATCH 14/20] Impact output order change in network on training script --- scripts/test.py | 2 +- scripts/train.py | 109 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 83 insertions(+), 28 deletions(-) diff --git a/scripts/test.py b/scripts/test.py index a3b80ec..3bc898a 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -18,7 +18,7 @@ training=False, yolo_max_boxes=100, yolo_iou_threshold=0.5, - yolo_score_threshold=0.5, + yolo_score_threshold=0.15, ) model.load_weights("../yolov4_full.h5") diff --git a/scripts/train.py b/scripts/train.py index 71ace58..c0b301b 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -12,7 +12,7 @@ from tf2_yolov4.heads.yolov3_head import yolov3_boxes_regression from tf2_yolov4.model import YOLOv4 -INPUT_SHAPE = (608, 608, 3) +INPUT_SHAPE = (416, 416, 3) BATCH_SIZE = 8 BOUNDING_BOXES_FIXED_NUMBER = 50 PASCAL_VOC_NUM_CLASSES = 20 @@ -22,7 +22,7 @@ LOG_DIR = Path("./logs") / datetime.now().strftime("%m-%d-%Y %H:%M:%S") -ALL_FROZEN_EPOCH_NUMBER = 10 +ALL_FROZEN_EPOCH_NUMBER = 15 BACKBONE_FROZEN_EPOCH_NUMBER = 10 TOTAL_NUMBER_OF_EPOCHS = 50 @@ -185,7 +185,7 @@ def transform_targets(y_train, anchors, anchor_masks, size): y_outs.append(transform_targets_for_output(y_train, grid_size, anchor_idxs)) grid_size *= 2 - return tuple(y_outs) + return tuple(reversed(y_outs)) def 
pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_number): @@ -195,7 +195,43 @@ def pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_num return tf.pad(bounding_boxes, paddings, constant_values=0.0) -def prepare_dataset(dataset, shuffle=True): +def random_flip_right_with_bounding_boxes(images, bounding_boxes): + apply_flip = tf.random.uniform(shape=[]) > 0.5 + if apply_flip: + images = tf.image.flip_left_right(images) + bounding_boxes = tf.stack( + [ + 1.0 - bounding_boxes[..., 2], + bounding_boxes[..., 1], + 1.0 - bounding_boxes[..., 0], + bounding_boxes[..., 3], + bounding_boxes[..., 4], + ], + axis=-1, + ) + + return images, bounding_boxes + + +def augment_images(images, bounding_boxes): + # Image transformations that do not affect bounding boxes + images = tf.image.random_hue(images, 0.15) + images = tf.image.random_brightness(images, 0.15) + + # Transformations that affect bounding boxes + images, bounding_boxes = random_flip_right_with_bounding_boxes( + images, bounding_boxes + ) + + return images, bounding_boxes + + +def prepare_dataset( + dataset, + shuffle=True, + apply_data_augmentation=False, + transform_to_bbox_by_stage=True, +): dataset = dataset.map(lambda el: (el["image"], el["objects"])) dataset = dataset.map( lambda image, object: ( @@ -218,15 +254,6 @@ def prepare_dataset(dataset, shuffle=True): ), num_parallel_calls=tf.data.experimental.AUTOTUNE, ) - dataset = dataset.map( - lambda image, bounding_boxes: ( - image, - pad_bounding_boxes_to_fixed_number_of_bounding_boxes( - bounding_boxes, pad_number=BOUNDING_BOXES_FIXED_NUMBER - ), - ), - num_parallel_calls=tf.data.experimental.AUTOTUNE, - ) dataset = dataset.map( lambda image, bounding_box: ( tf.image.resize(image, INPUT_SHAPE[:2]) / 255.0, @@ -234,32 +261,57 @@ def prepare_dataset(dataset, shuffle=True): ), num_parallel_calls=tf.data.experimental.AUTOTUNE, ) + if apply_data_augmentation: + dataset = dataset.map( + augment_images, num_parallel_calls=tf.data.experimental.AUTOTUNE + ) if shuffle: dataset = dataset.shuffle(buffer_size=1000) - dataset = dataset.batch(BATCH_SIZE) dataset = dataset.map( - lambda image, bounding_box_with_class: ( + lambda image, bounding_boxes: ( image, - transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ - bounding_box_with_class, - np.concatenate( - list(reversed(YOLOV4_ANCHORS_NORMALIZED)), axis=0 - ), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of anchors - YOLOV4_ANCHORS_MASKS, - INPUT_SHAPE[0], # Assumes square input + pad_bounding_boxes_to_fixed_number_of_bounding_boxes( + bounding_boxes, pad_number=BOUNDING_BOXES_FIXED_NUMBER ), ), num_parallel_calls=tf.data.experimental.AUTOTUNE, ) + dataset = dataset.batch(BATCH_SIZE) - return dataset + if transform_to_bbox_by_stage: + dataset = dataset.map( + lambda image, bounding_box_with_class: ( + image, + transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ + bounding_box_with_class, + np.concatenate( + list(YOLOV4_ANCHORS_NORMALIZED), axis=0 + ), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of anchors + YOLOV4_ANCHORS_MASKS, + INPUT_SHAPE[0], # Assumes square input + ), + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + return dataset.repeat() if __name__ == "__main__": - voc_dataset = tfds.load("voc", shuffle_files=True) + voc_dataset, infos = tfds.load("voc", with_info=True, shuffle_files=True) ds_train, ds_test = 
voc_dataset["train"], voc_dataset["test"] - ds_train = prepare_dataset(ds_train, shuffle=True) - ds_test = prepare_dataset(ds_test, shuffle=False) + ds_train = prepare_dataset( + ds_train, + shuffle=True, + apply_data_augmentation=True, + transform_to_bbox_by_stage=True, + ) + ds_test = prepare_dataset( + ds_test, + shuffle=False, + apply_data_augmentation=False, + transform_to_bbox_by_stage=True, + ) model = YOLOv4( input_shape=INPUT_SHAPE, @@ -275,7 +327,7 @@ def prepare_dataset(dataset, shuffle=True): optimizer = tf.keras.optimizers.Adam(1e-4) loss = [ YoloLoss( - np.concatenate(list(reversed(YOLOV4_ANCHORS_NORMALIZED)), axis=0)[mask] + np.concatenate(list(YOLOV4_ANCHORS_NORMALIZED), axis=0)[mask] ) for mask in YOLOV4_ANCHORS_MASKS ] @@ -289,6 +341,7 @@ def prepare_dataset(dataset, shuffle=True): model.compile(optimizer=optimizer, loss=loss) history = model.fit( ds_train, + steps_per_epoch=infos.splits["train"].num_examples // BATCH_SIZE, validation_data=ds_test, validation_steps=10, epochs=ALL_FROZEN_EPOCH_NUMBER, @@ -307,6 +360,7 @@ def prepare_dataset(dataset, shuffle=True): model.compile(optimizer=optimizer, loss=loss) history = model.fit( ds_train, + steps_per_epoch=infos.splits["train"].num_examples // BATCH_SIZE, validation_data=ds_test, validation_steps=10, epochs=BACKBONE_FROZEN_EPOCH_NUMBER + ALL_FROZEN_EPOCH_NUMBER, @@ -327,6 +381,7 @@ def prepare_dataset(dataset, shuffle=True): model.compile(optimizer=optimizer, loss=loss) history = model.fit( ds_train, + steps_per_epoch=infos.splits["train"].num_examples // BATCH_SIZE, validation_data=ds_test, validation_steps=10, epochs=TOTAL_NUMBER_OF_EPOCHS, From 3a4f70cc7b2c2214a80c3e0f438c8d515903fe5e Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Sun, 21 Jun 2020 16:45:07 +0200 Subject: [PATCH 15/20] Pascal VOC 2012 --- scripts/test.py | 6 +++++- scripts/train.py | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/scripts/test.py b/scripts/test.py index 3bc898a..91efe95 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -20,8 +20,12 @@ yolo_iou_threshold=0.5, yolo_score_threshold=0.15, ) +for layer in ( + model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers +): + layer.trainable = False -model.load_weights("../yolov4_full.h5") +model.load_weights("../yolov4_all_frozen.h5") model.summary() boxes, scores, classes, valid_detections = model.predict(images) diff --git a/scripts/train.py b/scripts/train.py index c0b301b..cab7c39 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -14,7 +14,7 @@ INPUT_SHAPE = (416, 416, 3) BATCH_SIZE = 8 -BOUNDING_BOXES_FIXED_NUMBER = 50 +BOUNDING_BOXES_FIXED_NUMBER = 60 PASCAL_VOC_NUM_CLASSES = 20 YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] @@ -298,8 +298,8 @@ def prepare_dataset( if __name__ == "__main__": - voc_dataset, infos = tfds.load("voc", with_info=True, shuffle_files=True) - ds_train, ds_test = voc_dataset["train"], voc_dataset["test"] + voc_dataset, infos = tfds.load("voc/2012", with_info=True, shuffle_files=True) + ds_train, ds_test = voc_dataset["train"], voc_dataset["validation"] ds_train = prepare_dataset( ds_train, shuffle=True, @@ -313,6 +313,9 @@ def prepare_dataset( transform_to_bbox_by_stage=True, ) + steps_per_epoch = infos.splits["train"].num_examples // BATCH_SIZE + validation_steps = infos.splits["validation"].num_examples // BATCH_SIZE + model = YOLOv4( input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, @@ -341,9 +344,9 @@ def prepare_dataset( model.compile(optimizer=optimizer, loss=loss) history 
= model.fit( ds_train, - steps_per_epoch=infos.splits["train"].num_examples // BATCH_SIZE, + steps_per_epoch=steps_per_epoch, validation_data=ds_test, - validation_steps=10, + validation_steps=validation_steps, epochs=ALL_FROZEN_EPOCH_NUMBER, callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), @@ -360,9 +363,9 @@ def prepare_dataset( model.compile(optimizer=optimizer, loss=loss) history = model.fit( ds_train, - steps_per_epoch=infos.splits["train"].num_examples // BATCH_SIZE, + steps_per_epoch=steps_per_epoch, validation_data=ds_test, - validation_steps=10, + validation_steps=validation_steps, epochs=BACKBONE_FROZEN_EPOCH_NUMBER + ALL_FROZEN_EPOCH_NUMBER, initial_epoch=ALL_FROZEN_EPOCH_NUMBER, callbacks=[ @@ -381,9 +384,9 @@ def prepare_dataset( model.compile(optimizer=optimizer, loss=loss) history = model.fit( ds_train, - steps_per_epoch=infos.splits["train"].num_examples // BATCH_SIZE, + steps_per_epoch=steps_per_epoch, validation_data=ds_test, - validation_steps=10, + validation_steps=validation_steps, epochs=TOTAL_NUMBER_OF_EPOCHS, initial_epoch=ALL_FROZEN_EPOCH_NUMBER + BACKBONE_FROZEN_EPOCH_NUMBER, callbacks=[ From a7ba2421b3da40e166a5f888b292113d5998c883 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Fri, 3 Jul 2020 11:07:19 +0200 Subject: [PATCH 16/20] Fix training loop. Improve test scripts. --- scripts/test.py | 102 +++++++++++++++++--------------- scripts/train.py | 13 ++-- tf2_yolov4/heads/yolov3_head.py | 4 +- 3 files changed, 62 insertions(+), 57 deletions(-) diff --git a/scripts/test.py b/scripts/test.py index 91efe95..01d166d 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -5,6 +5,35 @@ from tf2_yolov4.model import YOLOv4 HEIGHT, WIDTH = (608, 608) +INPUT_SHAPE = (HEIGHT, WIDTH, 3) + +PASCAL_VOC_CLASSES = [ + "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", + "car", "cat", "chair", "cow", "diningtable", "dog", "horse", + "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor", +] + +COCO_CLASSES = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', + 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', + 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', + 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', + 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', + 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', + 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', + 'chair', 'couch', 'potted plant', 'bed', 'dining table', + 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', + 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', + 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', + 'toothbrush' +] + +# Switch this variable between PASCAL_VOC_CLASSES and COCO_CLASSES depending +# on your training, or define your own set of classes. 
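# For instance, a hypothetical two-class detector would simply use
#   CLASSES = ["vehicle", "pedestrian"]
# and the num_classes passed to YOLOv4 below is derived as len(CLASSES).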
+CLASSES = PASCAL_VOC_CLASSES + image = tf.io.read_file("../notebooks/images/cars.jpg") image = tf.image.decode_image(image) @@ -14,45 +43,17 @@ model = YOLOv4( input_shape=(HEIGHT, WIDTH, 3), anchors=YOLOV4_ANCHORS, - num_classes=20, + num_classes=len(CLASSES), training=False, yolo_max_boxes=100, yolo_iou_threshold=0.5, yolo_score_threshold=0.15, ) -for layer in ( - model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers -): - layer.trainable = False - -model.load_weights("../yolov4_all_frozen.h5") +model.load_weights("../yolov4_full.h5") model.summary() boxes, scores, classes, valid_detections = model.predict(images) -CLASSES = [ - "aeroplane", - "bicycle", - "bird", - "boat", - "bottle", - "bus", - "car", - "cat", - "chair", - "cow", - "diningtable", - "dog", - "horse", - "motorbike", - "person", - "pottedplant", - "sheep", - "sofa", - "train", - "tvmonitor", -] - # colors for visualization COLORS = [ [0.000, 0.447, 0.741], @@ -69,24 +70,31 @@ def plot_results(pil_img, boxes, scores, classes): plt.imshow(pil_img) ax = plt.gca() - for (xmin, ymin, xmax, ymax), score, cl in zip( - boxes.tolist(), scores.tolist(), classes.tolist() - ): - if score > 0: - ax.add_patch( - plt.Rectangle( - (xmin, ymin), - xmax - xmin, - ymax - ymin, - fill=False, - color=COLORS[cl % 6], - linewidth=3, - ) - ) - text = f"{CLASSES[cl]}: {score:0.2f}" - ax.text( - xmin, ymin, text, fontsize=15, bbox=dict(facecolor="yellow", alpha=0.5) + predictions_with_positive_score = [ + (box, score, box_class) + for box, score, box_class in zip( + boxes.tolist(), scores.tolist(), classes.tolist() + ) + if score > 0 + ] + for (xmin, ymin, xmax, ymax), score, cl in predictions_with_positive_score: + color = COLORS[cl % 6] + ax.add_patch( + plt.Rectangle( + (xmin, ymin), + xmax - xmin, + ymax - ymin, + fill=False, + color=color, + linewidth=3, ) + ) + text = f"{CLASSES[cl]}: {score:0.2f}" + ax.text( + xmin, ymin, text, color="white", + fontsize=15, fontweight="bold", + bbox=dict(facecolor=color, alpha=0.7), + ) plt.axis("off") plt.show() diff --git a/scripts/train.py b/scripts/train.py index cab7c39..084ea33 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -12,7 +12,7 @@ from tf2_yolov4.heads.yolov3_head import yolov3_boxes_regression from tf2_yolov4.model import YOLOv4 -INPUT_SHAPE = (416, 416, 3) +INPUT_SHAPE = (608, 608, 3) BATCH_SIZE = 8 BOUNDING_BOXES_FIXED_NUMBER = 60 PASCAL_VOC_NUM_CLASSES = 20 @@ -185,7 +185,7 @@ def transform_targets(y_train, anchors, anchor_masks, size): y_outs.append(transform_targets_for_output(y_train, grid_size, anchor_idxs)) grid_size *= 2 - return tuple(reversed(y_outs)) + return tuple(y_outs) def pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_number): @@ -322,20 +322,17 @@ def prepare_dataset( num_classes=PASCAL_VOC_NUM_CLASSES, training=True, ) - darknet_weights = Path("./yolov4.h5") + darknet_weights = Path(__file__).parent / "yolov4.h5" if darknet_weights.exists(): model.load_weights(str(darknet_weights), by_name=True, skip_mismatch=True) print("Darknet weights loaded.") optimizer = tf.keras.optimizers.Adam(1e-4) loss = [ - YoloLoss( - np.concatenate(list(YOLOV4_ANCHORS_NORMALIZED), axis=0)[mask] - ) + YoloLoss(np.concatenate(list(YOLOV4_ANCHORS_NORMALIZED), axis=0)[mask]) for mask in YOLOV4_ANCHORS_MASKS ] - model.summary() # Start training: 5 epochs with backbone + neck frozen for layer in ( model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers @@ -378,7 +375,7 @@ def prepare_dataset( ), ], ) - # Final 
training: 35 epochs with all weights unfrozen + # Final training for layer in model.get_layer("CSPDarknet53").layers: layer.trainable = True model.compile(optimizer=optimizer, loss=loss) diff --git a/tf2_yolov4/heads/yolov3_head.py b/tf2_yolov4/heads/yolov3_head.py index ff28a6b..e2bcaeb 100644 --- a/tf2_yolov4/heads/yolov3_head.py +++ b/tf2_yolov4/heads/yolov3_head.py @@ -95,7 +95,7 @@ def yolov3_head( if training: return tf.keras.Model( [input_1, input_2, input_3], - [output_1, output_2, output_3], + [output_3, output_2, output_1], name="YOLOv3_head", ) @@ -120,7 +120,7 @@ def yolov3_head( yolo_score_threshold=yolo_score_threshold, ), name="yolov4_nms", - )([predictions_1, predictions_2, predictions_3]) + )([predictions_3, predictions_2, predictions_1]) return tf.keras.Model([input_1, input_2, input_3], output, name="YOLOv3_head") From 1239ea11fc85bc2ecfbb8d3c712f265db63f0e59 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Fri, 3 Jul 2020 14:52:40 +0200 Subject: [PATCH 17/20] Improve it --- scripts/train.py | 351 ++++++----------------------------------- tf2_yolov4/anchors.py | 2 + tf2_yolov4/datasets.py | 188 ++++++++++++++++++++++ tf2_yolov4/losses.py | 101 ++++++++++++ 4 files changed, 342 insertions(+), 300 deletions(-) create mode 100644 tf2_yolov4/datasets.py create mode 100644 tf2_yolov4/losses.py diff --git a/scripts/train.py b/scripts/train.py index 084ea33..29d5445 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -4,332 +4,64 @@ from datetime import datetime from pathlib import Path +import click import numpy as np import tensorflow as tf import tensorflow_datasets as tfds -from tf2_yolov4.anchors import YOLOV4_ANCHORS, compute_normalized_anchors -from tf2_yolov4.heads.yolov3_head import yolov3_boxes_regression +from tf2_yolov4.anchors import ( + YOLOV4_ANCHORS, + YOLOV4_ANCHORS_MASKS, + compute_normalized_anchors, +) +from tf2_yolov4.datasets import prepare_dataset +from tf2_yolov4.losses import YoloV3Loss from tf2_yolov4.model import YOLOv4 INPUT_SHAPE = (608, 608, 3) -BATCH_SIZE = 8 -BOUNDING_BOXES_FIXED_NUMBER = 60 -PASCAL_VOC_NUM_CLASSES = 20 -YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] -YOLOV4_ANCHORS_NORMALIZED = compute_normalized_anchors(YOLOV4_ANCHORS, INPUT_SHAPE) -LOG_DIR = Path("./logs") / datetime.now().strftime("%m-%d-%Y %H:%M:%S") +def launch_training(batch_size, weights_path, all_frozen_epoch_number, backbone_frozen_epoch_number, num_epochs, dataset_name="voc"): + LOG_DIR = Path("./logs") / dataset_name / datetime.now().strftime("%m-%d-%Y %H:%M:%S") -ALL_FROZEN_EPOCH_NUMBER = 15 -BACKBONE_FROZEN_EPOCH_NUMBER = 10 -TOTAL_NUMBER_OF_EPOCHS = 50 + voc_dataset, infos = tfds.load(dataset_name, with_info=True, shuffle_files=True) - -def broadcast_iou(box_1, box_2): - # box_1: (..., (x1, y1, x2, y2)) - # box_2: (N, (x1, y1, x2, y2)) - - # broadcast boxes - box_1 = tf.expand_dims(box_1, -2) - box_2 = tf.expand_dims(box_2, 0) - # new_shape: (..., N, (x1, y1, x2, y2)) - new_shape = tf.broadcast_dynamic_shape(tf.shape(box_1), tf.shape(box_2)) - box_1 = tf.broadcast_to(box_1, new_shape) - box_2 = tf.broadcast_to(box_2, new_shape) - - int_w = tf.maximum( - tf.minimum(box_1[..., 2], box_2[..., 2]) - - tf.maximum(box_1[..., 0], box_2[..., 0]), - 0, - ) - int_h = tf.maximum( - tf.minimum(box_1[..., 3], box_2[..., 3]) - - tf.maximum(box_1[..., 1], box_2[..., 1]), - 0, - ) - int_area = int_w * int_h - box_1_area = (box_1[..., 2] - box_1[..., 0]) * (box_1[..., 3] - box_1[..., 1]) - box_2_area = (box_2[..., 2] - box_2[..., 0]) * (box_2[..., 3] - box_2[..., 
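# Note (illustration added for clarity, not part of the patch): with the head
# outputs reordered to [output_3, output_2, output_1] above and transform_targets
# no longer reversed, predictions, anchor masks and targets all run from the
# coarsest stage to the finest. For a 608x608 input:
for stride, mask in zip((32, 16, 8), ((6, 7, 8), (3, 4, 5), (0, 1, 2))):
    print(f"stride {stride}: grid {608 // stride}x{608 // stride}, anchor indices {mask}")
# stride 32: grid 19x19, anchor indices (6, 7, 8)
# stride 16: grid 38x38, anchor indices (3, 4, 5)
# stride 8: grid 76x76, anchor indices (0, 1, 2)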
1]) - return int_area / (box_1_area + box_2_area - int_area) - - -def YoloLoss(anchors, ignore_thresh=0.5): - def yolo_loss(y_true, y_pred): - # 1. transform all pred outputs - # y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls)) - pred_box, pred_obj, pred_class, pred_xywh = yolov3_boxes_regression( - y_pred, anchors - ) - pred_xy = pred_xywh[..., 0:2] - pred_wh = pred_xywh[..., 2:4] - - # 2. transform all true outputs - # y_true: (batch_size, grid, grid, anchors, (x1, y1, x2, y2, obj, cls)) - true_box, true_obj, true_class_idx = tf.split(y_true, (4, 1, 1), axis=-1) - true_xy = (true_box[..., 0:2] + true_box[..., 2:4]) / 2 - true_wh = true_box[..., 2:4] - true_box[..., 0:2] - - # give higher weights to small boxes - box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1] - - # 3. inverting the pred box equations - grid_size = tf.shape(y_true)[1] - grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size)) - grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2) - true_xy = true_xy * tf.cast(grid_size, tf.float32) - tf.cast(grid, tf.float32) - true_wh = tf.math.log(true_wh / anchors) - true_wh = tf.where(tf.math.is_inf(true_wh), tf.zeros_like(true_wh), true_wh) - - # 4. calculate all masks - obj_mask = tf.squeeze(true_obj, -1) - # ignore false positive when iou is over threshold - best_iou = tf.map_fn( - lambda x: tf.reduce_max( - broadcast_iou(x[0], tf.boolean_mask(x[1], tf.cast(x[2], tf.bool))), - axis=-1, - ), - (pred_box, true_box, obj_mask), - tf.float32, - ) - ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32) - - # 5. calculate all losses - xy_loss = ( - obj_mask - * box_loss_scale - * tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1) - ) - wh_loss = ( - obj_mask - * box_loss_scale - * tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1) - ) - obj_loss = tf.keras.losses.binary_crossentropy(true_obj, pred_obj) - obj_loss = obj_mask * obj_loss + (1 - obj_mask) * ignore_mask * obj_loss - # TODO: use binary_crossentropy instead - class_loss = obj_mask * tf.keras.losses.sparse_categorical_crossentropy( - true_class_idx, pred_class - ) - - # 6. 
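# Quick numeric check of box_loss_scale above (added for clarity, not part of the
# patch) -- with widths and heights normalized to the image, smaller boxes get a
# larger weight:
#     box_loss_scale = 2 - w * h
#     full-image box: w = h = 1.0 -> 2 - 1.00 = 1.0
#     small box:      w = h = 0.1 -> 2 - 0.01 = 1.99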
sum over (batch, gridx, gridy, anchors) => (batch, 1) - xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3)) - wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3)) - obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3)) - class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3)) - - return xy_loss + wh_loss + obj_loss + class_loss - - return yolo_loss - - -@tf.function -def transform_targets_for_output(y_true, grid_size, anchor_idxs): - # y_true: (N, boxes, (x1, y1, x2, y2, class, best_anchor)) - N = tf.shape(y_true)[0] - - # y_true_out: (N, grid, grid, anchors, [x, y, w, h, obj, class]) - y_true_out = tf.zeros((N, grid_size, grid_size, tf.shape(anchor_idxs)[0], 6)) - - anchor_idxs = tf.cast(anchor_idxs, tf.int32) - - indexes = tf.TensorArray(tf.int32, 1, dynamic_size=True) - updates = tf.TensorArray(tf.float32, 1, dynamic_size=True) - idx = 0 - for i in tf.range(N): - for j in tf.range(tf.shape(y_true)[1]): - if tf.equal(y_true[i][j][2], 0): - continue - anchor_eq = tf.equal(anchor_idxs, tf.cast(y_true[i][j][5], tf.int32)) - - if tf.reduce_any(anchor_eq): - box = y_true[i][j][0:4] - box_xy = (y_true[i][j][0:2] + y_true[i][j][2:4]) / 2 - - anchor_idx = tf.cast(tf.where(anchor_eq), tf.int32) - grid_xy = tf.cast(box_xy // (1 / grid_size), tf.int32) - - # grid[y][x][anchor] = (tx, ty, bw, bh, obj, class) - indexes = indexes.write( - idx, [i, grid_xy[1], grid_xy[0], anchor_idx[0][0]] - ) - updates = updates.write( - idx, [box[0], box[1], box[2], box[3], 1, y_true[i][j][4]] - ) - idx += 1 - - return tf.tensor_scatter_nd_update(y_true_out, indexes.stack(), updates.stack()) - - -def transform_targets(y_train, anchors, anchor_masks, size): - y_outs = [] - grid_size = size // 32 - - # calculate anchor index for true boxes - anchors = tf.cast(anchors, tf.float32) - anchor_area = anchors[..., 0] * anchors[..., 1] - box_wh = y_train[..., 2:4] - y_train[..., 0:2] - box_wh = tf.tile(tf.expand_dims(box_wh, -2), (1, 1, tf.shape(anchors)[0], 1)) - box_area = box_wh[..., 0] * box_wh[..., 1] - intersection = tf.minimum(box_wh[..., 0], anchors[..., 0]) * tf.minimum( - box_wh[..., 1], anchors[..., 1] - ) - iou = intersection / (box_area + anchor_area - intersection) - anchor_idx = tf.cast(tf.argmax(iou, axis=-1), tf.float32) - anchor_idx = tf.expand_dims(anchor_idx, axis=-1) - - y_train = tf.concat([y_train, anchor_idx], axis=-1) - - for anchor_idxs in anchor_masks: - y_outs.append(transform_targets_for_output(y_train, grid_size, anchor_idxs)) - grid_size *= 2 - - return tuple(y_outs) - - -def pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_number): - box_number = tf.shape(bounding_boxes)[0] - paddings = [[0, pad_number - box_number], [0, 0]] - - return tf.pad(bounding_boxes, paddings, constant_values=0.0) - - -def random_flip_right_with_bounding_boxes(images, bounding_boxes): - apply_flip = tf.random.uniform(shape=[]) > 0.5 - if apply_flip: - images = tf.image.flip_left_right(images) - bounding_boxes = tf.stack( - [ - 1.0 - bounding_boxes[..., 2], - bounding_boxes[..., 1], - 1.0 - bounding_boxes[..., 0], - bounding_boxes[..., 3], - bounding_boxes[..., 4], - ], - axis=-1, - ) - - return images, bounding_boxes - - -def augment_images(images, bounding_boxes): - # Image transformations that do not affect bounding boxes - images = tf.image.random_hue(images, 0.15) - images = tf.image.random_brightness(images, 0.15) - - # Transformations that affect bounding boxes - images, bounding_boxes = random_flip_right_with_bounding_boxes( - images, bounding_boxes - ) - - return images, bounding_boxes - - -def 
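# Worked example for random_flip_right_with_bounding_boxes above (illustration,
# not part of the patch), with boxes stored as normalized [x1, y1, x2, y2, class]:
#     original box:          [0.2, 0.3, 0.5, 0.8, 7.0]
#     after flip_left_right: x1' = 1 - x2 = 0.5, x2' = 1 - x1 = 0.8
#     flipped box:           [0.5, 0.3, 0.8, 0.8, 7.0]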
prepare_dataset( - dataset, - shuffle=True, - apply_data_augmentation=False, - transform_to_bbox_by_stage=True, -): - dataset = dataset.map(lambda el: (el["image"], el["objects"])) - dataset = dataset.map( - lambda image, object: ( - image, - tf.concat( - [ - tf.stack( - [ - object["bbox"][:, 1], - object["bbox"][:, 0], - object["bbox"][:, 3], - object["bbox"][:, 2], - ], - axis=-1, - ), - tf.expand_dims(tf.cast(object["label"], tf.float32), axis=-1), - ], - axis=-1, - ), - ), - num_parallel_calls=tf.data.experimental.AUTOTUNE, - ) - dataset = dataset.map( - lambda image, bounding_box: ( - tf.image.resize(image, INPUT_SHAPE[:2]) / 255.0, - bounding_box, - ), - num_parallel_calls=tf.data.experimental.AUTOTUNE, - ) - if apply_data_augmentation: - dataset = dataset.map( - augment_images, num_parallel_calls=tf.data.experimental.AUTOTUNE - ) - if shuffle: - dataset = dataset.shuffle(buffer_size=1000) - dataset = dataset.map( - lambda image, bounding_boxes: ( - image, - pad_bounding_boxes_to_fixed_number_of_bounding_boxes( - bounding_boxes, pad_number=BOUNDING_BOXES_FIXED_NUMBER - ), - ), - num_parallel_calls=tf.data.experimental.AUTOTUNE, - ) - dataset = dataset.batch(BATCH_SIZE) - - if transform_to_bbox_by_stage: - dataset = dataset.map( - lambda image, bounding_box_with_class: ( - image, - transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ - bounding_box_with_class, - np.concatenate( - list(YOLOV4_ANCHORS_NORMALIZED), axis=0 - ), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of anchors - YOLOV4_ANCHORS_MASKS, - INPUT_SHAPE[0], # Assumes square input - ), - ), - num_parallel_calls=tf.data.experimental.AUTOTUNE, - ) - dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) - return dataset.repeat() - - -if __name__ == "__main__": - voc_dataset, infos = tfds.load("voc/2012", with_info=True, shuffle_files=True) ds_train, ds_test = voc_dataset["train"], voc_dataset["validation"] ds_train = prepare_dataset( ds_train, + shape=INPUT_SHAPE, + batch_size=batch_size, shuffle=True, apply_data_augmentation=True, transform_to_bbox_by_stage=True, ) ds_test = prepare_dataset( ds_test, + shape=INPUT_SHAPE, + batch_size=batch_size, shuffle=False, apply_data_augmentation=False, transform_to_bbox_by_stage=True, ) - steps_per_epoch = infos.splits["train"].num_examples // BATCH_SIZE - validation_steps = infos.splits["validation"].num_examples // BATCH_SIZE + steps_per_epoch = infos.splits["train"].num_examples // batch_size + validation_steps = infos.splits["validation"].num_examples // batch_size + num_classes = infos.features["objects"]["label"].num_classes model = YOLOv4( input_shape=INPUT_SHAPE, anchors=YOLOV4_ANCHORS, - num_classes=PASCAL_VOC_NUM_CLASSES, + num_classes=num_classes, training=True, ) - darknet_weights = Path(__file__).parent / "yolov4.h5" - if darknet_weights.exists(): - model.load_weights(str(darknet_weights), by_name=True, skip_mismatch=True) + if weights_path is not None: + model.load_weights(str(weights_path), by_name=True, skip_mismatch=True) print("Darknet weights loaded.") optimizer = tf.keras.optimizers.Adam(1e-4) + normalized_anchors = compute_normalized_anchors(YOLOV4_ANCHORS, INPUT_SHAPE) loss = [ - YoloLoss(np.concatenate(list(YOLOV4_ANCHORS_NORMALIZED), axis=0)[mask]) + YoloV3Loss(np.concatenate(list(normalized_anchors), axis=0)[mask]) for mask in YOLOV4_ANCHORS_MASKS ] @@ -339,59 +71,63 @@ def prepare_dataset( ): layer.trainable = False model.compile(optimizer=optimizer, loss=loss) - history = model.fit( + model.fit( 
ds_train, steps_per_epoch=steps_per_epoch, validation_data=ds_test, validation_steps=validation_steps, - epochs=ALL_FROZEN_EPOCH_NUMBER, + epochs=all_frozen_epoch_number, callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( str(LOG_DIR / "yolov4_all_frozen.h5"), save_best_only=True, save_weights_only=True, + monitor="val_loss", ), ], ) + # Keep training: 10 epochs with backbone frozen -- unfreeze neck for layer in model.get_layer("YOLOv4_neck").layers: layer.trainable = True model.compile(optimizer=optimizer, loss=loss) - history = model.fit( + model.fit( ds_train, steps_per_epoch=steps_per_epoch, validation_data=ds_test, validation_steps=validation_steps, - epochs=BACKBONE_FROZEN_EPOCH_NUMBER + ALL_FROZEN_EPOCH_NUMBER, - initial_epoch=ALL_FROZEN_EPOCH_NUMBER, + epochs=backbone_frozen_epoch_number + all_frozen_epoch_number, + initial_epoch=all_frozen_epoch_number, callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( str(LOG_DIR / "yolov4_backbone_frozen.h5"), save_best_only=True, save_weights_only=True, - verbose=True, + monitor="val_loss", ), ], ) + # Final training for layer in model.get_layer("CSPDarknet53").layers: layer.trainable = True model.compile(optimizer=optimizer, loss=loss) - history = model.fit( + model.fit( ds_train, steps_per_epoch=steps_per_epoch, validation_data=ds_test, validation_steps=validation_steps, - epochs=TOTAL_NUMBER_OF_EPOCHS, - initial_epoch=ALL_FROZEN_EPOCH_NUMBER + BACKBONE_FROZEN_EPOCH_NUMBER, + epochs=num_epochs, + initial_epoch=all_frozen_epoch_number + backbone_frozen_epoch_number, callbacks=[ tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR), tf.keras.callbacks.ModelCheckpoint( str(LOG_DIR / "yolov4_full.h5"), save_best_only=True, save_weights_only=True, + monitor="val_loss", ), tf.keras.callbacks.ModelCheckpoint( str(LOG_DIR / "yolov4_train_loss.h5"), @@ -401,3 +137,18 @@ def prepare_dataset( ), ], ) + + +@click.command() +@click.option("--batch_size", type=int, default=16, help="Size of mini-batch") +@click.option("--weights_path", type=click.Path(exists=True), default=None, help="Path to pretrained weights") +@click.option("--all_frozen_epoch_number", type=int, default=20, help="Number of epochs to perform with backbone and neck frozen") +@click.option("--backbone_frozen_epoch_number", type=int, default=10, help="Number of epochs to perform with backbone frozen") +@click.option("--num_epochs", type=int, default=50, help="Total number of epochs to perform") +@click.option("--dataset_name", type=str, default="voc", help="Dataset used during training. 
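# Example invocation of the click command defined here (illustrative; the weights
# path and dataset name are placeholders to adapt to your setup):
#     python scripts/train.py --batch_size 8 --weights_path ./yolov4.h5 \
#         --all_frozen_epoch_number 20 --backbone_frozen_epoch_number 10 \
#         --num_epochs 50 --dataset_name voc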
Refer to TensorFlow Datasets documentation for dataset names.") +def launch_training_command(batch_size, weights_path, all_frozen_epoch_number, backbone_frozen_epoch_number, num_epochs, dataset_name): + launch_training(batch_size, weights_path, all_frozen_epoch_number, backbone_frozen_epoch_number, num_epochs, dataset_name) + + +if __name__ == "__main__": + launch_training_command() diff --git a/tf2_yolov4/anchors.py b/tf2_yolov4/anchors.py index 8d2dcde..e3f4076 100644 --- a/tf2_yolov4/anchors.py +++ b/tf2_yolov4/anchors.py @@ -9,6 +9,8 @@ np.array([(142, 110), (192, 243), (459, 401)], np.float32), ] +YOLOV4_ANCHORS_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + YOLOV3_ANCHORS = [ np.array([(10, 13), (16, 30), (33, 23)], np.float32), np.array([(30, 61), (62, 45), (59, 119)], np.float32), diff --git a/tf2_yolov4/datasets.py b/tf2_yolov4/datasets.py new file mode 100644 index 0000000..a973ea1 --- /dev/null +++ b/tf2_yolov4/datasets.py @@ -0,0 +1,188 @@ +import numpy as np +import tensorflow as tf + +from tf2_yolov4.anchors import ( + YOLOV4_ANCHORS, + YOLOV4_ANCHORS_MASKS, + compute_normalized_anchors, +) + +BOUNDING_BOXES_FIXED_NUMBER = 60 + + +@tf.function +def transform_targets_for_output(y_true, grid_size, anchor_idxs): + # y_true: (N, boxes, (x1, y1, x2, y2, class, best_anchor)) + N = tf.shape(y_true)[0] + + # y_true_out: (N, grid, grid, anchors, [x, y, w, h, obj, class]) + y_true_out = tf.zeros((N, grid_size, grid_size, tf.shape(anchor_idxs)[0], 6)) + + anchor_idxs = tf.cast(anchor_idxs, tf.int32) + + indexes = tf.TensorArray(tf.int32, 1, dynamic_size=True) + updates = tf.TensorArray(tf.float32, 1, dynamic_size=True) + idx = 0 + for i in tf.range(N): + for j in tf.range(tf.shape(y_true)[1]): + if tf.equal(y_true[i][j][2], 0): + continue + anchor_eq = tf.equal(anchor_idxs, tf.cast(y_true[i][j][5], tf.int32)) + + if tf.reduce_any(anchor_eq): + box = y_true[i][j][0:4] + box_xy = (y_true[i][j][0:2] + y_true[i][j][2:4]) / 2 + + anchor_idx = tf.cast(tf.where(anchor_eq), tf.int32) + grid_xy = tf.cast(box_xy // (1 / grid_size), tf.int32) + + # grid[y][x][anchor] = (tx, ty, bw, bh, obj, class) + indexes = indexes.write( + idx, [i, grid_xy[1], grid_xy[0], anchor_idx[0][0]] + ) + updates = updates.write( + idx, [box[0], box[1], box[2], box[3], 1, y_true[i][j][4]] + ) + idx += 1 + + return tf.tensor_scatter_nd_update(y_true_out, indexes.stack(), updates.stack()) + + +def transform_targets(y_train, anchors, anchor_masks, size): + y_outs = [] + grid_size = size // 32 + + # calculate anchor index for true boxes + anchors = tf.cast(anchors, tf.float32) + anchor_area = anchors[..., 0] * anchors[..., 1] + box_wh = y_train[..., 2:4] - y_train[..., 0:2] + box_wh = tf.tile(tf.expand_dims(box_wh, -2), (1, 1, tf.shape(anchors)[0], 1)) + box_area = box_wh[..., 0] * box_wh[..., 1] + intersection = tf.minimum(box_wh[..., 0], anchors[..., 0]) * tf.minimum( + box_wh[..., 1], anchors[..., 1] + ) + iou = intersection / (box_area + anchor_area - intersection) + anchor_idx = tf.cast(tf.argmax(iou, axis=-1), tf.float32) + anchor_idx = tf.expand_dims(anchor_idx, axis=-1) + + y_train = tf.concat([y_train, anchor_idx], axis=-1) + + for anchor_idxs in anchor_masks: + y_outs.append(transform_targets_for_output(y_train, grid_size, anchor_idxs)) + grid_size *= 2 + + return tuple(y_outs) + + +def pad_bounding_boxes_to_fixed_number_of_bounding_boxes(bounding_boxes, pad_number): + box_number = tf.shape(bounding_boxes)[0] + paddings = [[0, pad_number - box_number], [0, 0]] + + return tf.pad(bounding_boxes, paddings, 
constant_values=0.0) + + +def random_flip_right_with_bounding_boxes(images, bounding_boxes): + apply_flip = tf.random.uniform(shape=[]) > 0.5 + if apply_flip: + images = tf.image.flip_left_right(images) + bounding_boxes = tf.stack( + [ + 1.0 - bounding_boxes[..., 2], + bounding_boxes[..., 1], + 1.0 - bounding_boxes[..., 0], + bounding_boxes[..., 3], + bounding_boxes[..., 4], + ], + axis=-1, + ) + + return images, bounding_boxes + + +def augment_images(images, bounding_boxes): + # Image transformations that do not affect bounding boxes + images = tf.image.random_hue(images, 0.15) + images = tf.image.random_brightness(images, 0.15) + + # Transformations that affect bounding boxes + images, bounding_boxes = random_flip_right_with_bounding_boxes( + images, bounding_boxes + ) + + return images, bounding_boxes + + +def prepare_dataset( + dataset, + shape, + batch_size, + shuffle=True, + apply_data_augmentation=False, + transform_to_bbox_by_stage=True, + pad_number_of_boxes=BOUNDING_BOXES_FIXED_NUMBER, + anchors=YOLOV4_ANCHORS, +): + normalized_anchors = compute_normalized_anchors(anchors, shape) + dataset = dataset.map(lambda el: (el["image"], el["objects"])) + dataset = dataset.map( + lambda image, object: ( + image, + tf.concat( + [ + tf.stack( + [ + object["bbox"][:, 1], + object["bbox"][:, 0], + object["bbox"][:, 3], + object["bbox"][:, 2], + ], + axis=-1, + ), + tf.expand_dims(tf.cast(object["label"], tf.float32), axis=-1), + ], + axis=-1, + ), + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + dataset = dataset.map( + lambda image, bounding_box: ( + tf.image.resize(image, shape[:2]) / 255.0, + bounding_box, + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + if apply_data_augmentation: + dataset = dataset.map( + augment_images, num_parallel_calls=tf.data.experimental.AUTOTUNE + ) + if shuffle: + dataset = dataset.shuffle(buffer_size=1000) + dataset = dataset.map( + lambda image, bounding_boxes: ( + image, + pad_bounding_boxes_to_fixed_number_of_bounding_boxes( + bounding_boxes, pad_number=pad_number_of_boxes + ), + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + dataset = dataset.batch(batch_size) + + if transform_to_bbox_by_stage: + dataset = dataset.map( + lambda image, bounding_box_with_class: ( + image, + transform_targets( # Comes straight from https://github.com/zzh8829/yolov3-tf2/ + bounding_box_with_class, + np.concatenate( + list(normalized_anchors), axis=0 + ), # Must concatenate because in zzh8829/yolov3-tf2, it's a list of anchors + YOLOV4_ANCHORS_MASKS, + shape[0], # Assumes square input + ), + ), + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ) + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + return dataset.repeat() diff --git a/tf2_yolov4/losses.py b/tf2_yolov4/losses.py new file mode 100644 index 0000000..7ca43a0 --- /dev/null +++ b/tf2_yolov4/losses.py @@ -0,0 +1,101 @@ +# TODO: Cite zzh/yolov3-tf2 +import tensorflow as tf + +from tf2_yolov4.heads.yolov3_head import yolov3_boxes_regression + + +def broadcast_iou(box_1, box_2): + # box_1: (..., (x1, y1, x2, y2)) + # box_2: (N, (x1, y1, x2, y2)) + + # broadcast boxes + box_1 = tf.expand_dims(box_1, -2) + box_2 = tf.expand_dims(box_2, 0) + # new_shape: (..., N, (x1, y1, x2, y2)) + new_shape = tf.broadcast_dynamic_shape(tf.shape(box_1), tf.shape(box_2)) + box_1 = tf.broadcast_to(box_1, new_shape) + box_2 = tf.broadcast_to(box_2, new_shape) + + int_w = tf.maximum( + tf.minimum(box_1[..., 2], box_2[..., 2]) + - tf.maximum(box_1[..., 0], box_2[..., 0]), + 0, + ) 
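# Minimal usage sketch for tf2_yolov4.datasets.prepare_dataset (illustration only;
# assumes the "voc" dataset is available through tensorflow_datasets):
import tensorflow_datasets as tfds

from tf2_yolov4.datasets import prepare_dataset

voc_dataset, infos = tfds.load("voc", with_info=True, shuffle_files=True)
ds_train = prepare_dataset(
    voc_dataset["train"],
    shape=(608, 608, 3),
    batch_size=8,
    shuffle=True,
    apply_data_augmentation=True,
)
# Each element is (images, (y_coarse, y_medium, y_fine)); per-stage targets have
# shape (batch, grid, grid, 3, 6): (8, 19, 19, 3, 6), (8, 38, 38, 3, 6), (8, 76, 76, 3, 6).
images, targets = next(iter(ds_train))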
+ int_h = tf.maximum( + tf.minimum(box_1[..., 3], box_2[..., 3]) + - tf.maximum(box_1[..., 1], box_2[..., 1]), + 0, + ) + int_area = int_w * int_h + box_1_area = (box_1[..., 2] - box_1[..., 0]) * (box_1[..., 3] - box_1[..., 1]) + box_2_area = (box_2[..., 2] - box_2[..., 0]) * (box_2[..., 3] - box_2[..., 1]) + return int_area / (box_1_area + box_2_area - int_area) + + +def YoloV3Loss(anchors, ignore_thresh=0.5): + def yolo_loss(y_true, y_pred): + # 1. transform all pred outputs + # y_pred: (batch_size, grid, grid, anchors, (x, y, w, h, obj, ...cls)) + pred_box, pred_obj, pred_class, pred_xywh = yolov3_boxes_regression( + y_pred, anchors + ) + pred_xy = pred_xywh[..., 0:2] + pred_wh = pred_xywh[..., 2:4] + + # 2. transform all true outputs + # y_true: (batch_size, grid, grid, anchors, (x1, y1, x2, y2, obj, cls)) + true_box, true_obj, true_class_idx = tf.split(y_true, (4, 1, 1), axis=-1) + true_xy = (true_box[..., 0:2] + true_box[..., 2:4]) / 2 + true_wh = true_box[..., 2:4] - true_box[..., 0:2] + + # give higher weights to small boxes + box_loss_scale = 2 - true_wh[..., 0] * true_wh[..., 1] + + # 3. inverting the pred box equations + grid_size = tf.shape(y_true)[1] + grid = tf.meshgrid(tf.range(grid_size), tf.range(grid_size)) + grid = tf.expand_dims(tf.stack(grid, axis=-1), axis=2) + true_xy = true_xy * tf.cast(grid_size, tf.float32) - tf.cast(grid, tf.float32) + true_wh = tf.math.log(true_wh / anchors) + true_wh = tf.where(tf.math.is_inf(true_wh), tf.zeros_like(true_wh), true_wh) + + # 4. calculate all masks + obj_mask = tf.squeeze(true_obj, -1) + # ignore false positive when iou is over threshold + best_iou = tf.map_fn( + lambda x: tf.reduce_max( + broadcast_iou(x[0], tf.boolean_mask(x[1], tf.cast(x[2], tf.bool))), + axis=-1, + ), + (pred_box, true_box, obj_mask), + tf.float32, + ) + ignore_mask = tf.cast(best_iou < ignore_thresh, tf.float32) + + # 5. calculate all losses + xy_loss = ( + obj_mask + * box_loss_scale + * tf.reduce_sum(tf.square(true_xy - pred_xy), axis=-1) + ) + wh_loss = ( + obj_mask + * box_loss_scale + * tf.reduce_sum(tf.square(true_wh - pred_wh), axis=-1) + ) + obj_loss = tf.keras.losses.binary_crossentropy(true_obj, pred_obj) + obj_loss = obj_mask * obj_loss + (1 - obj_mask) * ignore_mask * obj_loss + # TODO: use binary_crossentropy instead + class_loss = obj_mask * tf.keras.losses.sparse_categorical_crossentropy( + true_class_idx, pred_class + ) + + # 6. 
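# Small worked example for broadcast_iou and the ignore threshold above (added for
# clarity, not part of the patch):
#     box_1 = [0, 0, 2, 2], box_2 = [1, 0, 3, 2]
#     intersection = 1 * 2 = 2, union = 4 + 4 - 2 = 6, IoU = 2 / 6 ~ 0.33
# With ignore_thresh=0.5, an unmatched prediction whose best IoU against the
# ground-truth boxes is >= 0.5 is excluded from the no-object part of obj_loss.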
sum over (batch, gridx, gridy, anchors) => (batch, 1) + xy_loss = tf.reduce_sum(xy_loss, axis=(1, 2, 3)) + wh_loss = tf.reduce_sum(wh_loss, axis=(1, 2, 3)) + obj_loss = tf.reduce_sum(obj_loss, axis=(1, 2, 3)) + class_loss = tf.reduce_sum(class_loss, axis=(1, 2, 3)) + + return xy_loss + wh_loss + obj_loss + class_loss + + return yolo_loss From b639169704b6934c26d5231451af7c67891c4feb Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Fri, 24 Jul 2020 15:30:26 +0200 Subject: [PATCH 18/20] Reduce threshold --- scripts/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/test.py b/scripts/test.py index 01d166d..b68ac2c 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -46,7 +46,7 @@ num_classes=len(CLASSES), training=False, yolo_max_boxes=100, - yolo_iou_threshold=0.5, + yolo_iou_threshold=0.3, yolo_score_threshold=0.15, ) model.load_weights("../yolov4_full.h5") From 085108ff992775f1e477ce71b8d942630464ceb8 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Fri, 24 Jul 2020 15:35:08 +0200 Subject: [PATCH 19/20] Last modifications --- scripts/train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index 29d5445..f8b059b 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -65,7 +65,6 @@ def launch_training(batch_size, weights_path, all_frozen_epoch_number, backbone_ for mask in YOLOV4_ANCHORS_MASKS ] - # Start training: 5 epochs with backbone + neck frozen for layer in ( model.get_layer("CSPDarknet53").layers + model.get_layer("YOLOv4_neck").layers ): @@ -88,7 +87,6 @@ def launch_training(batch_size, weights_path, all_frozen_epoch_number, backbone_ ], ) - # Keep training: 10 epochs with backbone frozen -- unfreeze neck for layer in model.get_layer("YOLOv4_neck").layers: layer.trainable = True model.compile(optimizer=optimizer, loss=loss) From 4cda74b98ebce74488793deff6d4deba9e87aa67 Mon Sep 17 00:00:00 2001 From: Raphael Meudec Date: Thu, 6 Aug 2020 08:53:24 +0200 Subject: [PATCH 20/20] Black + fix tests --- scripts/test.py | 125 ++++++++++++++++++++++++++++++++------ scripts/train.py | 63 ++++++++++++++++--- tests/conftest.py | 1 + tests/test_model.py | 4 +- tests/test_yolov3_head.py | 4 +- 5 files changed, 165 insertions(+), 32 deletions(-) diff --git a/scripts/test.py b/scripts/test.py index b68ac2c..f0ece6f 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -8,26 +8,109 @@ INPUT_SHAPE = (HEIGHT, WIDTH, 3) PASCAL_VOC_CLASSES = [ - "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", - "car", "cat", "chair", "cow", "diningtable", "dog", "horse", - "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor", + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", ] COCO_CLASSES = [ - 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', - 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', - 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', - 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', - 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', - 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', - 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', - 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', - 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', - 'chair', 'couch', 'potted 
plant', 'bed', 'dining table', - 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', - 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', - 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', - 'toothbrush' + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", ] # Switch this variable between PASCAL_VOC_CLASSES and COCO_CLASSES depending @@ -91,8 +174,12 @@ def plot_results(pil_img, boxes, scores, classes): ) text = f"{CLASSES[cl]}: {score:0.2f}" ax.text( - xmin, ymin, text, color="white", - fontsize=15, fontweight="bold", + xmin, + ymin, + text, + color="white", + fontsize=15, + fontweight="bold", bbox=dict(facecolor=color, alpha=0.7), ) plt.axis("off") diff --git a/scripts/train.py b/scripts/train.py index f8b059b..39f2e63 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -21,8 +21,17 @@ INPUT_SHAPE = (608, 608, 3) -def launch_training(batch_size, weights_path, all_frozen_epoch_number, backbone_frozen_epoch_number, num_epochs, dataset_name="voc"): - LOG_DIR = Path("./logs") / dataset_name / datetime.now().strftime("%m-%d-%Y %H:%M:%S") +def launch_training( + batch_size, + weights_path, + all_frozen_epoch_number, + backbone_frozen_epoch_number, + num_epochs, + dataset_name="voc", +): + LOG_DIR = ( + Path("./logs") / dataset_name / datetime.now().strftime("%m-%d-%Y %H:%M:%S") + ) voc_dataset, infos = tfds.load(dataset_name, with_info=True, shuffle_files=True) @@ -139,13 +148,49 @@ def launch_training(batch_size, weights_path, all_frozen_epoch_number, backbone_ @click.command() @click.option("--batch_size", type=int, default=16, help="Size of mini-batch") -@click.option("--weights_path", type=click.Path(exists=True), default=None, help="Path to pretrained weights") -@click.option("--all_frozen_epoch_number", type=int, default=20, help="Number of epochs to perform with backbone and neck frozen") -@click.option("--backbone_frozen_epoch_number", type=int, default=10, help="Number of epochs to perform with backbone frozen") -@click.option("--num_epochs", type=int, default=50, help="Total number of epochs to perform") -@click.option("--dataset_name", type=str, default="voc", help="Dataset used during training. 
Refer to TensorFlow Datasets documentation for dataset names.") -def launch_training_command(batch_size, weights_path, all_frozen_epoch_number, backbone_frozen_epoch_number, num_epochs, dataset_name): - launch_training(batch_size, weights_path, all_frozen_epoch_number, backbone_frozen_epoch_number, num_epochs, dataset_name) +@click.option( + "--weights_path", + type=click.Path(exists=True), + default=None, + help="Path to pretrained weights", +) +@click.option( + "--all_frozen_epoch_number", + type=int, + default=20, + help="Number of epochs to perform with backbone and neck frozen", +) +@click.option( + "--backbone_frozen_epoch_number", + type=int, + default=10, + help="Number of epochs to perform with backbone frozen", +) +@click.option( + "--num_epochs", type=int, default=50, help="Total number of epochs to perform" +) +@click.option( + "--dataset_name", + type=str, + default="voc", + help="Dataset used during training. Refer to TensorFlow Datasets documentation for dataset names.", +) +def launch_training_command( + batch_size, + weights_path, + all_frozen_epoch_number, + backbone_frozen_epoch_number, + num_epochs, + dataset_name, +): + launch_training( + batch_size, + weights_path, + all_frozen_epoch_number, + backbone_frozen_epoch_number, + num_epochs, + dataset_name, + ) if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index c665729..bdc522e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import pytest +import tensorflow as tf from tf2_yolov4.anchors import YOLOV4_ANCHORS from tf2_yolov4.backbones.csp_darknet53 import csp_darknet53 diff --git a/tests/test_model.py b/tests/test_model.py index 9e6be5c..2c19957 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -15,9 +15,9 @@ def test_model_should_predict_valid_shapes_at_training(yolov4_training, num_clas tf.random.uniform((n_images, 416, 416, 3)) ) - assert output_1.shape == (n_images, 52, 52, 3, expected_head_shape) + assert output_1.shape == (n_images, 13, 13, 3, expected_head_shape) assert output_2.shape == (n_images, 26, 26, 3, expected_head_shape) - assert output_3.shape == (n_images, 13, 13, 3, expected_head_shape) + assert output_3.shape == (n_images, 52, 52, 3, expected_head_shape) def test_model_should_predict_valid_shapes_at_inference( diff --git a/tests/test_yolov3_head.py b/tests/test_yolov3_head.py index 283867d..b24af98 100644 --- a/tests/test_yolov3_head.py +++ b/tests/test_yolov3_head.py @@ -6,9 +6,9 @@ def test_head_should_have_valid_output_shapes_training( expected_head_shape = (num_classes + objectness_score_shape) + bounding_box_shape output_1, output_2, output_3 = yolov3_head_416_training.outputs - assert output_1.shape.as_list() == [None, 52, 52, 3, expected_head_shape] + assert output_1.shape.as_list() == [None, 13, 13, 3, expected_head_shape] assert output_2.shape.as_list() == [None, 26, 26, 3, expected_head_shape] - assert output_3.shape.as_list() == [None, 13, 13, 3, expected_head_shape] + assert output_3.shape.as_list() == [None, 52, 52, 3, expected_head_shape] def test_head_should_have_valid_output_shapes_inference(