From 5446421326214bf0395ce0fec75e3c16530be9da Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Wed, 14 Aug 2024 12:46:29 +0200 Subject: [PATCH 01/12] Add YOLOV8 instance segmentation model using YOLACT --- keras_cv/src/models/__init__.py | 1 + keras_cv/src/models/segmentation/__init__.py | 3 + .../yolo_v8_segmentation/__init__.py | 1 + .../yolo_v8_segmentation.py | 552 ++++++++++++++++++ 4 files changed, 557 insertions(+) create mode 100644 keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py create mode 100644 keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py diff --git a/keras_cv/src/models/__init__.py b/keras_cv/src/models/__init__.py index 77513eb8d8..ed52ed3c97 100644 --- a/keras_cv/src/models/__init__.py +++ b/keras_cv/src/models/__init__.py @@ -219,6 +219,7 @@ from keras_cv.src.models.segmentation import SAMPromptEncoder from keras_cv.src.models.segmentation import SegmentAnythingModel from keras_cv.src.models.segmentation import TwoWayTransformer +from keras_cv.src.models.segmentation import YOLOV8Segmentation from keras_cv.src.models.segmentation.segformer.segformer_aliases import ( SegFormer, ) diff --git a/keras_cv/src/models/segmentation/__init__.py b/keras_cv/src/models/segmentation/__init__.py index cec25eb010..b94278e6cb 100644 --- a/keras_cv/src/models/segmentation/__init__.py +++ b/keras_cv/src/models/segmentation/__init__.py @@ -21,3 +21,6 @@ SegmentAnythingModel, ) from keras_cv.src.models.segmentation.segment_anything import TwoWayTransformer +from keras_cv.src.models.segmentation.yolo_v8_segmentation import ( + YOLOV8Segmentation, +) diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py new file mode 100644 index 0000000000..cf2303e0e7 --- /dev/null +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py @@ -0,0 +1 @@ +from .yolo_v8_segmentation import YOLOV8Segmentation diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py new file mode 100644 index 0000000000..144ec490a8 --- /dev/null +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py @@ -0,0 +1,552 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
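For readers new to YOLACT, the instance-segmentation head added in this file follows its two-branch design: a "protonet" predicts a shared set of mask prototypes per image, and each anchor additionally predicts a small coefficient vector; an instance mask is the sigmoid of the linear combination of the prototypes with those coefficients (equation 1 of the paper). A minimal NumPy sketch of that combination, with purely illustrative shapes and random values that are not taken from the patch:

```python
import numpy as np

# Illustrative shapes: 2 images, 100 anchors, 8 prototypes, 40x40 prototype masks.
batch, num_anchors, num_prototypes, h, w = 2, 100, 8, 40, 40

# Protonet output: a shared basis of prototype masks per image.
prototypes = np.random.rand(batch, h, w, num_prototypes).astype("float32")
# Per-anchor linear coefficients predicted alongside boxes and class scores.
coefficients = np.random.rand(batch, num_anchors, num_prototypes).astype("float32")

# Equation 1 of YOLACT: sigmoid of the linear combination of prototypes,
# mirroring the einsum used by `combine_linearly_prototypes` below.
logits = np.einsum("bnm,bhwm->bnhw", coefficients, prototypes)
masks = 1.0 / (1.0 + np.exp(-logits))
print(masks.shape)  # (2, 100, 40, 40)
```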
+import copy +import warnings + +from keras_cv.src import bounding_box +from keras_cv.src import layers +from keras_cv.src.api_export import keras_cv_export +from keras_cv.src.backend import keras +from keras_cv.src.backend import ops +from keras_cv.src.losses.ciou_loss import CIoULoss +from keras_cv.src.models.backbones.backbone_presets import backbone_presets +from keras_cv.src.models.backbones.backbone_presets import ( + backbone_presets_with_weights, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( + apply_path_aggregation_fpn, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( + apply_yolo_v8_head as build_YOLOV8_detection_head, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( + decode_regression_to_boxes, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( + dist2bbox, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( + get_anchors, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector_presets import ( # noqa: E501 + yolo_v8_detector_presets, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_label_encoder import ( # noqa: E501 + YOLOV8LabelEncoder, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_layers import ( + apply_conv_bn, +) +from keras_cv.src.models.task import Task +from keras_cv.src.utils.python_utils import classproperty +from keras_cv.src.utils.train import get_feature_extractor + + +def build_mask_prototypes(x, dimension, num_prototypes, name="prototypes"): + """Builds protonet module. The outputs of this tensor are linearly combined + with the regressed mask coefficients to produce the predicted masks. + This is an implementation of the module proposed in YOLACT + https://arxiv.org/abs/1904.02689. + + Args: + x: tensor, representing the output of a low backone featuremap i.e. P3. + dimension: integer, inner number of channels used for mask prototypes. + num_prototypes: integer, number of mask prototypes to build predictions. + name: string, a prefix for names of layers used by the prototypes. + + Returns: + Tensor whose resolution is double than the inputted tensor. + This tensor is used as a base to build linear combinations of masks. + """ + x = apply_conv_bn(x, dimension, 3, name=f"{name}_0") + x = apply_conv_bn(x, dimension, 3, name=f"{name}_1") + x = apply_conv_bn(x, dimension, 3, name=f"{name}_2") + upsampling_kwargs = {"interpolation": "bilinear", "name": f"{name}_3"} + x = keras.layers.UpSampling2D((2, 2), **upsampling_kwargs)(x) + x = apply_conv_bn(x, num_prototypes, 1, name=name) + return x + + +def build_branch_mask_coefficients(x, dimension, num_prototypes, branch_arg): + """Builds mask coefficients of a single branch as in Figure 4 of + YOLACT https://arxiv.org/abs/1904.02689. + + Args: + x: tensor, representing the outputs of a single branch of FPN i.e. P3. + dimension: integer, inner number of channels used for mask coefficients. + num_prototypes: integer, number of mask prototypes to build predictions. + branch_arg: integer, representing the branch number. This is used to + build the name of the tensors. + + Returns: + Tensor representing the coefficients used to regress the outputted masks + of a single branch. 
+ """ + name = f"branch_{branch_arg}_mask_coefficients" + x = apply_conv_bn(x, dimension, 3, name=f"{name}_0") + x = apply_conv_bn(x, dimension, 3, name=f"{name}_1") + x = keras.layers.Conv2D(num_prototypes, 1, name=f"{name}_2")(x) + x = keras.layers.Reshape((-1, num_prototypes), name=f"{name}_3")(x) + return x + + +def build_mask_coefficients(branches, num_prototypes, dimension=32): + """Builds all mask coefficients used to combine the prototypes masks. + + Args: + branches: list of tensors, representing the outputs of a backbone model. + num_prototypes: integer, number of mask prototypes to build predictions. + dimension: integer, inner number of channels used for mask coefficients. + + Returns: + Tensor representing the linear coefficients for regressing masks. + """ + coefficients = [] + for branch_arg, branch in enumerate(branches): + branch_coefficients = build_branch_mask_coefficients( + branch, dimension, num_prototypes, branch_arg + ) + coefficients.append(branch_coefficients) + return keras.layers.Concatenate(axis=1, name="coefficients")(coefficients) + + +def combine_linearly_prototypes(coefficients, prototypes): + """Linearly combines prototypes masks using the predicted coefficients. + This applies equation 1 of YOLACT https://arxiv.org/abs/1904.02689. + + Args: + coefficients: tensor representing the linear coefficients of the + prototypes masks. + prototypes: tensor representing a base of masks that can be + linearly combined to produce predicted masks. + + Returns: + Tensor representing all the predicted masks. + """ + masks = ops.sigmoid(ops.einsum("bnm,bhwm->bnhw", coefficients, prototypes)) + return masks + + +def build_segmentation_head(branches, dimension, num_prototypes): + """Builds a YOLACT https://arxiv.org/abs/1904.02689 segmentation head + by predicting prototype masks, their linear coefficients, and combining + them to build the predicted masks. + + Args: + branches: list of tensors, representing the outputs of a backbone model. + dimension: integer, inner number of channels used for mask prototypes. + num_prototypes: integer, number of mask prototypes to build predictions. + + Returns: + Tensor representing all the predicted masks. + """ + prototypes = build_mask_prototypes(branches[0], dimension, num_prototypes) + coefficients = build_mask_coefficients(branches, num_prototypes) + masks = combine_linearly_prototypes(coefficients, prototypes) + return masks + + +def split_masks(masks, num_classes): + """Splits single channel segmentation mask into different class channels. + + Args: + masks: tensor representing ground truth masks using a single + channel consisting of integers representing the pixel class. + num_classes: integer, total number of classes in the dataset. + + Returns: + tensor representing each class mask in a different channel. + """ + splitted_masks = [] + for class_arg in range(num_classes): + splitted_masks.append(masks == class_arg) + splitted_masks = ops.concatenate(splitted_masks, axis=-1) + splitted_masks = ops.cast(splitted_masks, float) + return splitted_masks + + +def repeat_masks(masks, class_labels, num_classes): + """Repeats ground truth masks by gathering each ground truth mask + channel using the assigned class label. This is used to build a + tensor with the same shape as the predicted masks in order to + compute the loss. + + Args: + masks: tensor representing ground truth masks using a single + channel consisting of integers representing the pixel class. + class_labels: tensor, with the assigned class labels in each anchor box. 
+ The class labels are in a one-hot encoding vector form. + num_classes: integer, total number of classes in the dataset. + + Returns: + tensor representing each class mask in a different channel. + """ + class_args = ops.argmax(class_labels, axis=-1) + batch_shape = class_args.shape[0] + class_args = ops.reshape(class_args, (batch_shape, 1, 1, -1)) + masks = split_masks(masks, num_classes) + repeated_masks = ops.take_along_axis(masks, class_args, axis=-1) + return repeated_masks + + +def unpack_input(data): + classes = data["bounding_boxes"]["classes"] + boxes = data["bounding_boxes"]["boxes"] + segmentation_masks = data["segmentation_masks"] + y = { + "classes": classes, + "boxes": boxes, + "segmentation_masks": segmentation_masks, + } + return data["images"], y + + +@keras_cv_export( + [ + "keras_cv.models.YOLOV8Segmentation", + "keras_cv.models.segmentation.YOLOV8Segmentation", + ] +) +class YOLOV8Segmentation(Task): + """Implements the YOLOV8 architecture for instance segmentation.""" + + def __init__( + self, + backbone, + num_classes, + bounding_box_format, + fpn_depth=2, + label_encoder=None, + prediction_decoder=None, + prototype_dimension=256, + num_prototypes=32, + **kwargs, + ): + extractor_levels = ["P3", "P4", "P5"] + extractor_layer_names = [ + backbone.pyramid_level_inputs[i] for i in extractor_levels + ] + feature_extractor = get_feature_extractor( + backbone, extractor_layer_names, extractor_levels + ) + + images = keras.layers.Input(feature_extractor.input_shape[1:]) + features = list(feature_extractor(images).values()) + + branches = apply_path_aggregation_fpn( + features, fpn_depth, name="pa_fpn" + ) + + masks = build_segmentation_head( + branches, prototype_dimension, num_prototypes + ) + + detection_head = build_YOLOV8_detection_head(branches, num_classes) + boxes, classes = detection_head["boxes"], detection_head["classes"] + + # TODO remove no-op layer to overwrite metric name for pretty printing. + boxes = keras.layers.Concatenate(axis=1, name="box")([boxes]) + scores = keras.layers.Concatenate(axis=1, name="class")([classes]) + masks = keras.layers.Concatenate(axis=1, name="masks")([masks]) + + outputs = {"boxes": boxes, "classes": scores, "masks": masks} + super().__init__(inputs=images, outputs=outputs, **kwargs) + + self.bounding_box_format = bounding_box_format + self._prediction_decoder = ( + prediction_decoder + or layers.NonMaxSuppression( + bounding_box_format=bounding_box_format, + from_logits=False, + confidence_threshold=0.2, + iou_threshold=0.7, + ) + ) + self.backbone = backbone + self.fpn_depth = fpn_depth + self.num_classes = num_classes + self.label_encoder = label_encoder or YOLOV8LabelEncoder( + num_classes=num_classes + ) + + def compile( + self, + box_loss, + classification_loss, + segmentation_loss, + box_loss_weight=7.5, + classification_loss_weight=0.5, + segmentation_loss_weight=6.125, + metrics=None, + **kwargs, + ): + """Compiles the YOLOV8Detector. + + `compile()` mirrors the standard Keras `compile()` method, but has one + key distinction -- two losses must be provided: `box_loss` and + `classification_loss`. + + Args: + box_loss: a Keras loss to use for box offset regression. A + preconfigured loss is given when the string "ciou" is passed. + classification_loss: a Keras loss to use for box classification. A + preconfigured loss is provided when the string + "binary_crossentropy" is passed. + segmentation_loss:a Keras loss for segmentation. + box_loss_weight: (optional) float, a scaling factor for the box + loss. Defaults to 7.5. 
+ classification_loss_weight: (optional) float, a scaling factor for + the classification loss. Defaults to 0.5. + segmentation_loss_weight: (optional) float, a scaling factor for + the classification loss. Defaults to 6.125. + kwargs: most other `keras.Model.compile()` arguments are supported + and propagated to the `keras.Model` class. + """ + if metrics is not None: + raise ValueError("User metrics not yet supported for YOLOV8") + + if isinstance(box_loss, str): + if box_loss == "ciou": + box_loss = CIoULoss(bounding_box_format="xyxy", reduction="sum") + elif box_loss == "iou": + warnings.warn( + "YOLOV8 recommends using CIoU loss, but was configured to " + "use standard IoU. Consider using `box_loss='ciou'` " + "instead." + ) + else: + raise ValueError( + f"Invalid box loss for YOLOV8Detector: {box_loss}. Box " + "loss should be a keras.Loss or the string 'ciou'." + ) + if isinstance(classification_loss, str): + if classification_loss == "binary_crossentropy": + classification_loss = keras.losses.BinaryCrossentropy( + reduction="sum" + ) + else: + raise ValueError( + "Invalid classification loss for YOLOV8Detector: " + f"{classification_loss}. Classification loss should be a " + "keras.Loss or the string 'binary_crossentropy'." + ) + + if isinstance(segmentation_loss, str): + if segmentation_loss == "binary_crossentropy": + segmentation_loss = keras.losses.BinaryCrossentropy( + reduction="sum" + ) + else: + raise ValueError( + "Invalid segmentation loss for YOLOV8Detector: " + f"{classification_loss}. Classification loss should be a " + "keras.Loss or the string 'binary_crossentropy'." + ) + + self.box_loss = box_loss + self.classification_loss = classification_loss + self.segmentation_loss = segmentation_loss + self.box_loss_weight = box_loss_weight + self.classification_loss_weight = classification_loss_weight + self.segmentation_loss_weight = segmentation_loss_weight + + losses = { + "box": self.box_loss, + "class": self.classification_loss, + "masks": self.segmentation_loss, + } + + super().compile(loss=losses, **kwargs) + + def train_step(self, *args): + data = args[-1] + args = args[:-1] + x, y = unpack_input(data) + return super().train_step(*args, (x, y)) + + def test_step(self, *args): + data = args[-1] + args = args[:-1] + x, y = unpack_input(data) + return super().test_step(*args, (x, y)) + + def compute_loss(self, x, y, y_pred, sample_weight=None, **kwargs): + box_pred, cls_pred = y_pred["boxes"], y_pred["classes"] + + pred_boxes = decode_regression_to_boxes(box_pred) + pred_scores = cls_pred + + anchor_points, stride_tensor = get_anchors(image_shape=x.shape[1:]) + stride_tensor = ops.expand_dims(stride_tensor, axis=-1) + + gt_labels = y["classes"] + + mask_gt = ops.all(y["boxes"] > -1.0, axis=-1, keepdims=True) + gt_bboxes = bounding_box.convert_format( + y["boxes"], + source=self.bounding_box_format, + target="xyxy", + images=x, + ) + + pred_bboxes = dist2bbox(pred_boxes, anchor_points) + + target_bboxes, target_scores, fg_mask = self.label_encoder( + pred_scores, + ops.cast(pred_bboxes * stride_tensor, gt_bboxes.dtype), + anchor_points * stride_tensor, + gt_labels, + gt_bboxes, + mask_gt, + ) + + target_bboxes /= stride_tensor + target_scores_sum = ops.maximum(ops.sum(target_scores), 1) + box_weight = ops.expand_dims( + ops.sum(target_scores, axis=-1) * fg_mask, + axis=-1, + ) + + target_masks = y["segmentation_masks"] + _, num_priors, H_mask, W_mask = y_pred["masks"].shape + target_masks = ops.image.resize(target_masks, (H_mask, W_mask)) + target_masks = repeat_masks( + 
target_masks, target_scores, self.num_classes + ) + batch_size, H_mask, W_mask, num_anchors = target_masks.shape + target_masks = ops.reshape( + target_masks, (batch_size, num_anchors, H_mask, W_mask) + ) + + y_true = { + "box": target_bboxes * fg_mask[..., None], + "class": target_scores, + "masks": target_masks * fg_mask[..., None, None], + } + y_pred = { + "box": pred_bboxes * fg_mask[..., None], + "class": pred_scores, + "masks": y_pred["masks"] * fg_mask[..., None, None], + } + sample_weights = { + "box": self.box_loss_weight * box_weight / target_scores_sum, + "class": self.classification_loss_weight / target_scores_sum, + "masks": self.segmentation_loss_weight / target_scores_sum, + } + + return super().compute_loss( + x=x, y=y_true, y_pred=y_pred, sample_weight=sample_weights, **kwargs + ) + + def decode_predictions( + self, + pred, + images, + ): + boxes = pred["boxes"] + scores = pred["classes"] + + boxes = decode_regression_to_boxes(boxes) + + anchor_points, stride_tensor = get_anchors(image_shape=images.shape[1:]) + stride_tensor = ops.expand_dims(stride_tensor, axis=-1) + + box_preds = dist2bbox(boxes, anchor_points) * stride_tensor + box_preds = bounding_box.convert_format( + box_preds, + source="xyxy", + target=self.bounding_box_format, + images=images, + ) + + return self.prediction_decoder(box_preds, scores) + + def predict_step(self, *args): + outputs = super().predict_step(*args) + if isinstance(outputs, tuple): + return self.decode_predictions(outputs[0], args[-1]), outputs[1] + else: + return self.decode_predictions(outputs, args[-1]) + + @property + def prediction_decoder(self): + return self._prediction_decoder + + @prediction_decoder.setter + def prediction_decoder(self, prediction_decoder): + if prediction_decoder.bounding_box_format != self.bounding_box_format: + raise ValueError( + "Expected `prediction_decoder` and YOLOV8Detector to " + "use the same `bounding_box_format`, but got " + "`prediction_decoder.bounding_box_format=" + f"{prediction_decoder.bounding_box_format}`, and " + "`self.bounding_box_format=" + f"{self.bounding_box_format}`." 
+ ) + self._prediction_decoder = prediction_decoder + self.make_predict_function(force=True) + self.make_train_function(force=True) + self.make_test_function(force=True) + + def get_config(self): + return { + "num_classes": self.num_classes, + "bounding_box_format": self.bounding_box_format, + "fpn_depth": self.fpn_depth, + "backbone": keras.saving.serialize_keras_object(self.backbone), + "label_encoder": keras.saving.serialize_keras_object( + self.label_encoder + ), + "prediction_decoder": keras.saving.serialize_keras_object( + self._prediction_decoder + ), + } + + @classmethod + def from_config(cls, config): + config["backbone"] = keras.saving.deserialize_keras_object( + config["backbone"] + ) + label_encoder = config.get("label_encoder") + if label_encoder is not None and isinstance(label_encoder, dict): + config["label_encoder"] = keras.saving.deserialize_keras_object( + label_encoder + ) + prediction_decoder = config.get("prediction_decoder") + if prediction_decoder is not None and isinstance( + prediction_decoder, dict + ): + config["prediction_decoder"] = ( + keras.saving.deserialize_keras_object(prediction_decoder) + ) + return cls(**config) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy({**backbone_presets, **yolo_v8_detector_presets}) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy( + {**backbone_presets_with_weights, **yolo_v8_detector_presets} + ) + + @classproperty + def backbone_presets(cls): + """Dictionary of preset names and configurations of compatible + backbones.""" + return copy.deepcopy(backbone_presets) From 3d2e62c9219bee0770c138623ed97f288e640476 Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Fri, 6 Sep 2024 08:33:08 +0200 Subject: [PATCH 02/12] Add indices to output for mask post processing selection --- keras_cv/src/layers/object_detection/non_max_suppression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_cv/src/layers/object_detection/non_max_suppression.py b/keras_cv/src/layers/object_detection/non_max_suppression.py index 45993258e4..fc657e09f5 100644 --- a/keras_cv/src/layers/object_detection/non_max_suppression.py +++ b/keras_cv/src/layers/object_detection/non_max_suppression.py @@ -159,6 +159,7 @@ def call( image_shape=image_shape, ) bounding_boxes = { + "idx": idx, "boxes": box_prediction, "confidence": confidence_prediction, "classes": ops.argmax(class_prediction, axis=-1), From d79112c38b2543b7e3927bb4d9b8efab9ec41834 Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Fri, 6 Sep 2024 12:58:43 +0200 Subject: [PATCH 03/12] Add YOLOV8 instance segmentation encoder --- .../yolo_v8_label_encoder.py | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py new file mode 100644 index 0000000000..9cee3bf146 --- /dev/null +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py @@ -0,0 +1,275 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Label encoder for YOLOV8. This uses the TOOD Task Aligned Assigner approach. +See https://arxiv.org/abs/2108.07755 for more info, as well as a reference +implementation at https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py +""" # noqa: E501 + +import tensorflow as tf + +from keras_cv.src import bounding_box +from keras_cv.src.api_export import keras_cv_export +from keras_cv.src.backend import keras +from keras_cv.src.backend import ops +from keras_cv.src.bounding_box.iou import compute_ciou + + +def is_anchor_center_within_box(anchors, gt_bboxes): + return ops.all( + ops.logical_and( + gt_bboxes[:, :, None, :2] < anchors, + gt_bboxes[:, :, None, 2:] > anchors, + ), + axis=-1, + ) + + +@keras_cv_export("keras_cv.models.yolov8.LabelEncoder") +class YOLOV8LabelEncoder(keras.layers.Layer): + """ + Encodes ground truth boxes to target boxes and class labels for training a + YOLOV8 model. This is an implementation of the Task-aligned sample + assignment scheme proposed in https://arxiv.org/abs/2108.07755. + + Args: + num_classes: integer, the number of classes in the training dataset + max_anchor_matches: optional integer, the maximum number of anchors to + match with any given ground truth box. For example, when the default + 10 is used, the 10 candidate anchor points with the highest + alignment score are matched with a ground truth box. If less than 10 + candidate anchors exist, all candidates will be matched to the box. + alpha: float, a parameter to control the influence of class predictions + on the alignment score of an anchor box. This is the alpha parameter + in equation 9 of https://arxiv.org/pdf/2108.07755.pdf. + beta: float, a parameter to control the influence of box IOUs on the + alignment score of an anchor box. This is the beta parameter in + equation 9 of https://arxiv.org/pdf/2108.07755.pdf. + epsilon: float, a small number used for numerical stability in division + (to avoid diving by zero), and used as a threshold to eliminate very + small matches based on alignment scores of approximately zero. + """ + + def __init__( + self, + num_classes, + max_anchor_matches=10, + alpha=0.5, + beta=6.0, + epsilon=1e-9, + **kwargs, + ): + super().__init__(**kwargs) + self.max_anchor_matches = max_anchor_matches + self.num_classes = num_classes + self.alpha = alpha + self.beta = beta + self.epsilon = epsilon + + def assign( + self, scores, decode_bboxes, anchors, gt_labels, gt_bboxes, gt_mask + ): + """Assigns ground-truth boxes to anchors. + + Uses the task-aligned assignment strategy for matching ground truth + and anchor boxes based on prediction scores and IoU. + """ + num_anchors = anchors.shape[0] + + # Box scores are the predicted scores for each anchor, ground truth box + # pair. 
Only the predicted score for the class of the GT box is included + # Shape: (B, num_gt_boxes, num_anchors) (after transpose) + bbox_scores = ops.take_along_axis( + scores, + ops.cast(ops.maximum(gt_labels[:, None, :], 0), "int32"), + axis=-1, + ) + bbox_scores = ops.transpose(bbox_scores, (0, 2, 1)) + + # Overlaps are the IoUs of each predicted box and each GT box. + # Shape: (B, num_gt_boxes, num_anchors) + overlaps = compute_ciou( + ops.expand_dims(gt_bboxes, axis=2), + ops.expand_dims(decode_bboxes, axis=1), + bounding_box_format="xyxy", + ) + + # Alignment metrics are a combination of box scores and overlaps, per + # the task-aligned-assignment formula. + # Metrics are forced to 0 for boxes which have been masked in the GT + # input (e.g. due to padding) + alignment_metrics = ops.power(bbox_scores, self.alpha) * ops.power( + overlaps, self.beta + ) + alignment_metrics = ops.where(gt_mask, alignment_metrics, 0) + + # Only anchors which are inside of relevant GT boxes are considered + # for assignment. + # This is a boolean tensor of shape (B, num_gt_boxes, num_anchors) + matching_anchors_in_gt_boxes = is_anchor_center_within_box( + anchors, gt_bboxes + ) + alignment_metrics = ops.where( + matching_anchors_in_gt_boxes, alignment_metrics, 0 + ) + + # The top-k highest alignment metrics are used to select K candidate + # anchors for each GT box. + candidate_metrics, candidate_idxs = ops.top_k( + alignment_metrics, self.max_anchor_matches + ) + candidate_idxs = ops.where(candidate_metrics > 0, candidate_idxs, -1) + + # We now compute a dense grid of anchors and GT boxes. This is useful + # for picking a GT box when an anchor matches to 2, as well as returning + # to a dense format for a mask of which anchors have been matched. + anchors_matched_gt_box = ops.zeros_like(overlaps) + for k in range(self.max_anchor_matches): + anchors_matched_gt_box += ops.one_hot( + candidate_idxs[:, :, k], num_anchors + ) + + # We zero-out the overlap for anchor, GT box pairs which don't match. + overlaps *= anchors_matched_gt_box + # In cases where one anchor matches to 2 GT boxes, we pick the GT box + # with the highest overlap as a max. + gt_box_matches_per_anchor = ops.argmax(overlaps, axis=1) + gt_box_matches_per_anchor_mask = ops.max(overlaps, axis=1) > 0 + # TODO(ianstenbit): Once ops.take_along_axis supports -1 in Torch, + # replace gt_box_matches_per_anchor with + # ops.where( + # ops.max(overlaps, axis=1) > 0, ops.argmax(overlaps, axis=1), -1 + # ) + # and get rid of the manual masking + gt_box_matches_per_anchor = ops.cast(gt_box_matches_per_anchor, "int32") + + # We select the GT boxes and labels that correspond to anchor matches. + bbox_labels = ops.take_along_axis( + gt_bboxes, gt_box_matches_per_anchor[:, :, None], axis=1 + ) + bbox_labels = ops.where( + gt_box_matches_per_anchor_mask[:, :, None], bbox_labels, -1 + ) + class_labels = ops.take_along_axis( + gt_labels, gt_box_matches_per_anchor, axis=1 + ) + class_labels = ops.where( + gt_box_matches_per_anchor_mask, class_labels, -1 + ) + + class_labels = ops.one_hot( + ops.cast(class_labels, "int32"), self.num_classes + ) + + # Finally, we normalize an anchor's class labels based on the relative + # strength of the anchors match with the corresponding GT box. 
+ alignment_metrics *= anchors_matched_gt_box + max_alignment_per_gt_box = ops.max( + alignment_metrics, axis=-1, keepdims=True + ) + max_overlap_per_gt_box = ops.max(overlaps, axis=-1, keepdims=True) + + normalized_alignment_metrics = ops.max( + alignment_metrics + * max_overlap_per_gt_box + / (max_alignment_per_gt_box + self.epsilon), + axis=-2, + ) + class_labels *= normalized_alignment_metrics[:, :, None] + + # On TF backend, the final "4" becomes a dynamic shape so we include + # this to force it to a static shape of 4. This does not actually + # reshape the Tensor. + bbox_labels = ops.reshape(bbox_labels, (-1, num_anchors, 4)) + return ( + ops.stop_gradient(bbox_labels), + ops.stop_gradient(class_labels), + ops.stop_gradient( + # ops.cast(gt_box_matches_per_anchor > -1, "float32") + ops.cast(gt_box_matches_per_anchor > 0, "float32") + ), + ) + + def call( + self, scores, decode_bboxes, anchors, gt_labels, gt_bboxes, gt_mask + ): + """Computes target boxes and classes for anchors. + + Args: + scores: a Float Tensor of shape (batch_size, num_anchors, + num_classes) representing predicted class scores for each + anchor. + decode_bboxes: a Float Tensor of shape (batch_size, num_anchors, 4) + representing predicted boxes for each anchor. + anchors: a Float Tensor of shape (batch_size, num_anchors, 2) + representing the xy coordinates of the center of each anchor. + gt_labels: a Float Tensor of shape (batch_size, num_gt_boxes) + representing the classes of ground truth boxes. + gt_bboxes: a Float Tensor of shape (batch_size, num_gt_boxes, 4) + representing the ground truth bounding boxes in xyxy format. + gt_mask: A Boolean Tensor of shape (batch_size, num_gt_boxes) + representing whether a box in `gt_bboxes` is a real box or a + non-box that exists due to padding. + + Returns: + A tuple of the following: + - A Float Tensor of shape (batch_size, num_anchors, 4) + representing box targets for the model. + - A Float Tensor of shape (batch_size, num_anchors, num_classes) + representing class targets for the model. + - A Boolean Tensor of shape (batch_size, num_anchors) + representing whether each anchor was a match with a ground + truth box. Anchors that didn't match with a ground truth + box should be excluded from both class and box losses. + """ + if isinstance(gt_bboxes, tf.RaggedTensor): + dense_bounding_boxes = bounding_box.to_dense( + {"boxes": gt_bboxes, "classes": gt_labels}, + ) + gt_bboxes = dense_bounding_boxes["boxes"] + gt_labels = dense_bounding_boxes["classes"] + + if isinstance(gt_mask, tf.RaggedTensor): + gt_mask = gt_mask.to_tensor() + + max_num_boxes = ops.shape(gt_bboxes)[1] + + # If there are no GT boxes in the batch, we short-circuit and return + # empty targets to avoid NaNs. + return ops.cond( + ops.array(max_num_boxes > 0), + lambda: self.assign( + scores, decode_bboxes, anchors, gt_labels, gt_bboxes, gt_mask + ), + lambda: ( + ops.zeros_like(decode_bboxes), + ops.zeros_like(scores), + ops.zeros_like(scores[..., 0]), + ), + ) + + def count_params(self): + # The label encoder has no weights, so we short-circuit the weight + # counting to avoid having to `build` this layer unnecessarily. 
+ return 0 + + def get_config(self): + config = { + "max_anchor_matches": self.max_anchor_matches, + "num_classes": self.num_classes, + "alpha": self.alpha, + "beta": self.beta, + "epsilon": self.epsilon, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) From dfba5b6aadedf17e877ad970817f21c77d2c987b Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Fri, 6 Sep 2024 12:59:17 +0200 Subject: [PATCH 04/12] Add YOLOV8 backbone features for instance segmentation model --- .../yolo_v8_segmentation/yolo_v8_backbone.py | 380 ++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py new file mode 100644 index 0000000000..4719096d3c --- /dev/null +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py @@ -0,0 +1,380 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from keras_cv.src.backend import keras +from keras_cv.src.backend import ops + +BATCH_NORM_EPSILON = 1e-3 +BATCH_NORM_MOMENTUM = 0.97 +BOX_REGRESSION_CHANNELS = 64 + + +# TODO(ianstenbit): Remove this method once we're using CSPDarkNet backbone +# (Calls to it should be inlined in the detector head) +def apply_conv_bn( + inputs, + output_channel, + kernel_size=1, + strides=1, + activation="swish", + name="conv_bn", +): + if kernel_size > 1: + inputs = keras.layers.ZeroPadding2D( + padding=kernel_size // 2, name=f"{name}_pad" + )(inputs) + + x = keras.layers.Conv2D( + filters=output_channel, + kernel_size=kernel_size, + strides=strides, + padding="valid", + use_bias=False, + name=f"{name}_conv", + )(inputs) + x = keras.layers.BatchNormalization( + momentum=BATCH_NORM_MOMENTUM, + epsilon=BATCH_NORM_EPSILON, + name=f"{name}_bn", + )(x) + x = keras.layers.Activation(activation, name=name)(x) + return x + + +# TODO(ianstenbit): Remove this method once we're using CSPDarkNet backbone +# Calls to it should instead call the CSP block from the DarkNet implementation. 
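As a quick illustration of the `apply_conv_bn` helper defined above (explicit padding, convolution, batch normalization, then activation), here is a hedged sketch that builds one block as a standalone Keras model; the import path assumes this patch is applied, and the shapes are arbitrary:

```python
from keras_cv.src.backend import keras
from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import (  # noqa: E501
    apply_conv_bn,
)

inputs = keras.layers.Input((64, 64, 16))
# kernel_size > 1 triggers an explicit ZeroPadding2D; strides=2 halves the
# spatial resolution, as used by the FPN downsampling paths further below.
outputs = apply_conv_bn(inputs, 32, kernel_size=3, strides=2, name="demo")
model = keras.Model(inputs, outputs)
model.summary()  # ZeroPadding2D -> Conv2D -> BatchNormalization -> Activation
```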
+def apply_csp_block( + inputs, + channels=-1, + depth=2, + shortcut=True, + expansion=0.5, + activation="swish", + name="csp_block", +): + channel_axis = -1 + channels = channels if channels > 0 else inputs.shape[channel_axis] + hidden_channels = int(channels * expansion) + + pre = apply_conv_bn( + inputs, + hidden_channels * 2, + kernel_size=1, + activation=activation, + name=f"{name}_pre", + ) + short, deep = ops.split(pre, 2, axis=channel_axis) + + out = [short, deep] + for id in range(depth): + deep = apply_conv_bn( + deep, + hidden_channels, + kernel_size=3, + activation=activation, + name=f"{name}_pre_{id}_1", + ) + deep = apply_conv_bn( + deep, + hidden_channels, + kernel_size=3, + activation=activation, + name=f"{name}_pre_{id}_2", + ) + deep = (out[-1] + deep) if shortcut else deep + out.append(deep) + out = ops.concatenate(out, axis=channel_axis) + out = apply_conv_bn( + out, + channels, + kernel_size=1, + activation=activation, + name=f"{name}_output", + ) + return out + + +def get_anchors( + image_shape, + strides=[8, 16, 32], + base_anchors=[0.5, 0.5], +): + """Gets anchor points for YOLOV8. + + YOLOV8 uses anchor points representing the center of proposed boxes, and + matches ground truth boxes to anchors based on center points. + + Args: + image_shape: tuple or list of two integers representing the height and + width of input images, respectively. + strides: tuple of list of integers, the size of the strides across the + image size that should be used to create anchors. + base_anchors: tuple or list of two integers representing the offset from + (0,0) to start creating the center of anchor boxes, relative to the + stride. For example, using the default (0.5, 0.5) creates the first + anchor box for each stride such that its center is half of a stride + from the edge of the image. + + Returns: + A tuple of anchor centerpoints and anchor strides. Multiplying the + two together will yield the centerpoints in absolute x,y format. + + """ + base_anchors = ops.array(base_anchors, dtype="float32") + + all_anchors = [] + all_strides = [] + for stride in strides: + hh_centers = ops.arange(0, image_shape[0], stride) + ww_centers = ops.arange(0, image_shape[1], stride) + ww_grid, hh_grid = ops.meshgrid(ww_centers, hh_centers) + grid = ops.cast( + ops.reshape(ops.stack([hh_grid, ww_grid], 2), [-1, 1, 2]), + "float32", + ) + anchors = ( + ops.expand_dims( + base_anchors * ops.array([stride, stride], "float32"), 0 + ) + + grid + ) + anchors = ops.reshape(anchors, [-1, 2]) + all_anchors.append(anchors) + all_strides.append(ops.repeat(stride, anchors.shape[0])) + + all_anchors = ops.cast(ops.concatenate(all_anchors, axis=0), "float32") + all_strides = ops.cast(ops.concatenate(all_strides, axis=0), "float32") + + all_anchors = all_anchors / all_strides[:, None] + + # Swap the x and y coordinates of the anchors. + all_anchors = ops.concatenate( + [all_anchors[:, 1, None], all_anchors[:, 0, None]], axis=-1 + ) + return all_anchors, all_strides + + +def apply_path_aggregation_fpn(features, depth=3, name="fpn"): + """Applies the Feature Pyramid Network (FPN) to the outputs of a backbone. + + Args: + features: list of tensors representing the P3, P4, and P5 outputs of the + backbone. + depth: integer, the depth of the CSP blocks used in the FPN. + name: string, a prefix for names of layers used by the FPN. 
+ + Returns: + A list of three tensors whose shapes are the same as the three inputs, + but which are dependent on each of the three inputs to combine the high + resolution of the P3 inputs with the strong feature representations of + the P5 inputs. + + """ + p3, p4, p5 = features + + # Upsample P5 and concatenate with P4, then apply a CSPBlock. + p5_upsampled = ops.repeat(ops.repeat(p5, 2, axis=1), 2, axis=2) + p4p5 = ops.concatenate([p5_upsampled, p4], axis=-1) + p4p5 = apply_csp_block( + p4p5, + channels=p4.shape[-1], + depth=depth, + shortcut=False, + activation="swish", + name=f"{name}_p4p5", + ) + + # Upsample P4P5 and concatenate with P3, then apply a CSPBlock. + p4p5_upsampled = ops.repeat(ops.repeat(p4p5, 2, axis=1), 2, axis=2) + p3p4p5 = ops.concatenate([p4p5_upsampled, p3], axis=-1) + p3p4p5 = apply_csp_block( + p3p4p5, + channels=p3.shape[-1], + depth=depth, + shortcut=False, + activation="swish", + name=f"{name}_p3p4p5", + ) + + # Downsample P3P4P5, concatenate with P4P5, and apply a CSP Block. + p3p4p5_d1 = apply_conv_bn( + p3p4p5, + p3p4p5.shape[-1], + kernel_size=3, + strides=2, + activation="swish", + name=f"{name}_p3p4p5_downsample1", + ) + p3p4p5_d1 = ops.concatenate([p3p4p5_d1, p4p5], axis=-1) + p3p4p5_d1 = apply_csp_block( + p3p4p5_d1, + channels=p4p5.shape[-1], + shortcut=False, + activation="swish", + name=f"{name}_p3p4p5_downsample1_block", + ) + + # Downsample the resulting P3P4P5 again, concatenate with P5, and apply + # another CSP Block. + p3p4p5_d2 = apply_conv_bn( + p3p4p5_d1, + p3p4p5_d1.shape[-1], + kernel_size=3, + strides=2, + activation="swish", + name=f"{name}_p3p4p5_downsample2", + ) + p3p4p5_d2 = ops.concatenate([p3p4p5_d2, p5], axis=-1) + p3p4p5_d2 = apply_csp_block( + p3p4p5_d2, + channels=p5.shape[-1], + shortcut=False, + activation="swish", + name=f"{name}_p3p4p5_downsample2_block", + ) + + return [p3p4p5, p3p4p5_d1, p3p4p5_d2] + + +def decode_regression_to_boxes(preds): + """Decodes the results of the YOLOV8Detector forward-pass into boxes. + + Returns left / top / right / bottom predictions with respect to anchor + points. + + Each coordinate is encoded with 16 predicted values. Those predictions are + softmaxed and multiplied by [0..15] to make predictions. The resulting + predictions are relative to the stride of an anchor box (and correspondingly + relative to the scale of the feature map from which the predictions came). + """ + preds_bbox = keras.layers.Reshape((-1, 4, BOX_REGRESSION_CHANNELS // 4))( + preds + ) + preds_bbox = ops.nn.softmax(preds_bbox, axis=-1) * ops.arange( + BOX_REGRESSION_CHANNELS // 4, dtype="float32" + ) + return ops.sum(preds_bbox, axis=-1) + + +def dist2bbox(distance, anchor_points): + """Decodes distance predictions into xyxy boxes. + + Input left / top / right / bottom predictions are transformed into xyxy box + predictions based on anchor points. + + The resulting xyxy predictions must be scaled by the stride of their + corresponding anchor points to yield an absolute xyxy box. + """ + left_top, right_bottom = ops.split(distance, 2, axis=-1) + x1y1 = anchor_points - left_top + x2y2 = anchor_points + right_bottom + return ops.concatenate((x1y1, x2y2), axis=-1) # xyxy bbox + + +def apply_yolo_v8_head( + inputs, + num_classes, + name="yolo_v8_head", +): + """Applies a YOLOV8 head. + + Makes box and class predictions based on the output of a feature pyramid + network. + + Args: + inputs: list of tensors output by the Feature Pyramid Network, should + have the same shape as the P3, P4, and P5 outputs of the backbone. 
+ num_classes: integer, the number of classes that a bounding box could + possibly be assigned to. + name: string, a prefix for names of layers used by the head. + + Returns: A dictionary with two entries. The "boxes" entry contains box + regression predictions, while the "classes" entry contains class + predictions. + """ + # 64 is the default number of channels, as 16 components are used to predict + # each of the 4 offsets for corner points of a bounding box with respect + # to the center point. In cases where the input has much higher resolution + # (e.g. the P3 input has >256 channels), we use additional channels for + # the intermediate conv layers. This is only true for very large backbones. + box_channels = max(BOX_REGRESSION_CHANNELS, inputs[0].shape[-1] // 4) + + # We use at least num_classes channels for intermediate conv layer for class + # predictions. In most cases, the P3 input has many more channels than the + # number of classes, so we preserve those channels until the final layer. + class_channels = max(num_classes, inputs[0].shape[-1]) + + # We compute box and class predictions for each of the feature maps from + # the FPN and then combine them. + outputs = [] + for id, feature in enumerate(inputs): + cur_name = f"{name}_{id+1}" + + box_predictions = apply_conv_bn( + feature, + box_channels, + kernel_size=3, + activation="swish", + name=f"{cur_name}_box_1", + ) + box_predictions = apply_conv_bn( + box_predictions, + box_channels, + kernel_size=3, + activation="swish", + name=f"{cur_name}_box_2", + ) + box_predictions = keras.layers.Conv2D( + filters=BOX_REGRESSION_CHANNELS, + kernel_size=1, + name=f"{cur_name}_box_3_conv", + )(box_predictions) + + class_predictions = apply_conv_bn( + feature, + class_channels, + kernel_size=3, + activation="swish", + name=f"{cur_name}_class_1", + ) + class_predictions = apply_conv_bn( + class_predictions, + class_channels, + kernel_size=3, + activation="swish", + name=f"{cur_name}_class_2", + ) + class_predictions = keras.layers.Conv2D( + filters=num_classes, + kernel_size=1, + name=f"{cur_name}_class_3_conv", + )(class_predictions) + class_predictions = keras.layers.Activation( + "sigmoid", name=f"{cur_name}_classifier" + )(class_predictions) + + out = ops.concatenate([box_predictions, class_predictions], axis=-1) + out = keras.layers.Reshape( + [-1, out.shape[-1]], name=f"{cur_name}_output_reshape" + )(out) + outputs.append(out) + + outputs = ops.concatenate(outputs, axis=1) + outputs = keras.layers.Activation( + "linear", dtype="float32", name="box_outputs" + )(outputs) + + return { + "boxes": outputs[:, :, :BOX_REGRESSION_CHANNELS], + "classes": outputs[:, :, BOX_REGRESSION_CHANNELS:], + } From 6bcef934302e46e05abdc881ff79a3678dcf521f Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Fri, 6 Sep 2024 13:00:24 +0200 Subject: [PATCH 05/12] Add Keras copyright --- .../segmentation/yolo_v8_segmentation/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py index cf2303e0e7..4659dd7878 100644 --- a/keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/__init__.py @@ -1 +1,14 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .yolo_v8_segmentation import YOLOV8Segmentation From 7ab9ae0bcbe0270822b6a7f927237f76da1dc8cc Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Fri, 6 Sep 2024 13:02:33 +0200 Subject: [PATCH 06/12] Update YOLOV8 instance segmentation model and loss --- .../yolo_v8_segmentation.py | 488 ++++++++++++++---- 1 file changed, 384 insertions(+), 104 deletions(-) diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py index 144ec490a8..869bae1594 100644 --- a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py @@ -14,50 +14,59 @@ import copy import warnings +from keras.layers import Activation +from keras.layers import Concatenate +from keras.layers import Conv2D +from keras.layers import Input +from keras.layers import Reshape +from keras.layers import UpSampling2D +from keras.losses import BinaryCrossentropy + from keras_cv.src import bounding_box -from keras_cv.src import layers from keras_cv.src.api_export import keras_cv_export from keras_cv.src.backend import keras from keras_cv.src.backend import ops +from keras_cv.src.layers import NonMaxSuppression from keras_cv.src.losses.ciou_loss import CIoULoss from keras_cv.src.models.backbones.backbone_presets import backbone_presets from keras_cv.src.models.backbones.backbone_presets import ( backbone_presets_with_weights, ) -from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector_presets import ( # noqa: E501 + yolo_v8_detector_presets, +) +from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_layers import ( + apply_conv_bn, +) +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( # noqa: E501 apply_path_aggregation_fpn, ) -from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( - apply_yolo_v8_head as build_YOLOV8_detection_head, +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( + apply_yolo_v8_head, ) -from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( decode_regression_to_boxes, ) -from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( dist2bbox, ) -from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector import ( +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( get_anchors, ) -from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_detector_presets import ( # noqa: E501 - yolo_v8_detector_presets, -) -from keras_cv.src.models.object_detection.yolo_v8.yolo_v8_label_encoder import ( # noqa: E501 +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_label_encoder import ( # noqa: E501 YOLOV8LabelEncoder, ) -from 
keras_cv.src.models.object_detection.yolo_v8.yolo_v8_layers import ( - apply_conv_bn, -) from keras_cv.src.models.task import Task from keras_cv.src.utils.python_utils import classproperty from keras_cv.src.utils.train import get_feature_extractor def build_mask_prototypes(x, dimension, num_prototypes, name="prototypes"): - """Builds protonet module. The outputs of this tensor are linearly combined - with the regressed mask coefficients to produce the predicted masks. - This is an implementation of the module proposed in YOLACT - https://arxiv.org/abs/1904.02689. + """Builds mask prototype network. + + The outputs of this module are linearly combined with the regressed mask + coefficients to produce the predicted masks. This is an implementation of + the module proposed in YOLACT https://arxiv.org/abs/1904.02689. Args: x: tensor, representing the output of a low backone featuremap i.e. P3. @@ -72,15 +81,15 @@ def build_mask_prototypes(x, dimension, num_prototypes, name="prototypes"): x = apply_conv_bn(x, dimension, 3, name=f"{name}_0") x = apply_conv_bn(x, dimension, 3, name=f"{name}_1") x = apply_conv_bn(x, dimension, 3, name=f"{name}_2") - upsampling_kwargs = {"interpolation": "bilinear", "name": f"{name}_3"} - x = keras.layers.UpSampling2D((2, 2), **upsampling_kwargs)(x) - x = apply_conv_bn(x, num_prototypes, 1, name=name) + x = UpSampling2D((2, 2), "channels_last", "bilinear", name=f"{name}_3")(x) + x = apply_conv_bn(x, dimension, 3, name=f"{name}_4") + x = Conv2D(num_prototypes, 1, padding="same", name=f"{name}_5")(x) + x = Activation("relu", name=name)(x) return x def build_branch_mask_coefficients(x, dimension, num_prototypes, branch_arg): - """Builds mask coefficients of a single branch as in Figure 4 of - YOLACT https://arxiv.org/abs/1904.02689. + """Builds mask coefficients of a single branch. Args: x: tensor, representing the outputs of a single branch of FPN i.e. P3. @@ -96,13 +105,16 @@ def build_branch_mask_coefficients(x, dimension, num_prototypes, branch_arg): name = f"branch_{branch_arg}_mask_coefficients" x = apply_conv_bn(x, dimension, 3, name=f"{name}_0") x = apply_conv_bn(x, dimension, 3, name=f"{name}_1") - x = keras.layers.Conv2D(num_prototypes, 1, name=f"{name}_2")(x) - x = keras.layers.Reshape((-1, num_prototypes), name=f"{name}_3")(x) + x = Conv2D(num_prototypes, 1, name=f"{name}_2")(x) + x = Activation("tanh", name=f"{name}_3")(x) + x = Reshape((-1, num_prototypes), name=f"{name}_4")(x) return x -def build_mask_coefficients(branches, num_prototypes, dimension=32): - """Builds all mask coefficients used to combine the prototypes masks. +def build_mask_coefficients(branches, num_prototypes, dimension): + """Builds all mask coefficients. + + This coefficients represent the linear terms used to combine the masks. Args: branches: list of tensors, representing the outputs of a backbone model. @@ -118,11 +130,12 @@ def build_mask_coefficients(branches, num_prototypes, dimension=32): branch, dimension, num_prototypes, branch_arg ) coefficients.append(branch_coefficients) - return keras.layers.Concatenate(axis=1, name="coefficients")(coefficients) + return Concatenate(axis=1, name="coefficients")(coefficients) def combine_linearly_prototypes(coefficients, prototypes): """Linearly combines prototypes masks using the predicted coefficients. + This applies equation 1 of YOLACT https://arxiv.org/abs/1904.02689. 
Args: @@ -138,21 +151,32 @@ def combine_linearly_prototypes(coefficients, prototypes): return masks -def build_segmentation_head(branches, dimension, num_prototypes): - """Builds a YOLACT https://arxiv.org/abs/1904.02689 segmentation head - by predicting prototype masks, their linear coefficients, and combining - them to build the predicted masks. +def build_segmentation_head( + branches, prototype_dimension, num_prototypes, coefficient_dimension +): + """Builds a YOLACT https://arxiv.org/abs/1904.02689 segmentation head. + + The proposed segmentation head of YOLACT https://arxiv.org/abs/1904.02689 + predicts prototype masks, their linear coefficients, and combines them to + build the predicted masks. Args: branches: list of tensors, representing the outputs of a backbone model. - dimension: integer, inner number of channels used for mask prototypes. + prototype_dimension: integer, inner number of channels used for mask + prototypes. num_prototypes: integer, number of mask prototypes to build predictions. + coefficient_dimension: integer, inner number of channels used for + predicting the mask coefficients. Returns: Tensor representing all the predicted masks. """ - prototypes = build_mask_prototypes(branches[0], dimension, num_prototypes) - coefficients = build_mask_coefficients(branches, num_prototypes) + prototypes = build_mask_prototypes( + branches[0], prototype_dimension, num_prototypes + ) + coefficients = build_mask_coefficients( + branches, num_prototypes, coefficient_dimension + ) masks = combine_linearly_prototypes(coefficients, prototypes) return masks @@ -169,7 +193,7 @@ def split_masks(masks, num_classes): tensor representing each class mask in a different channel. """ splitted_masks = [] - for class_arg in range(num_classes): + for class_arg in range(1, num_classes + 1): splitted_masks.append(masks == class_arg) splitted_masks = ops.concatenate(splitted_masks, axis=-1) splitted_masks = ops.cast(splitted_masks, float) @@ -177,10 +201,11 @@ def split_masks(masks, num_classes): def repeat_masks(masks, class_labels, num_classes): - """Repeats ground truth masks by gathering each ground truth mask - channel using the assigned class label. This is used to build a - tensor with the same shape as the predicted masks in order to - compute the loss. + """Repeats ground truth masks. + + Each ground truth mask channel is gathered using the assigned class label. + This is used to build a tensor with the same shape as the predicted masks + in order to compute the mask loss. Args: masks: tensor representing ground truth masks using a single @@ -195,12 +220,148 @@ def repeat_masks(masks, class_labels, num_classes): class_args = ops.argmax(class_labels, axis=-1) batch_shape = class_args.shape[0] class_args = ops.reshape(class_args, (batch_shape, 1, 1, -1)) - masks = split_masks(masks, num_classes) - repeated_masks = ops.take_along_axis(masks, class_args, axis=-1) + splitted_masks = split_masks(masks, num_classes) + repeated_masks = ops.take_along_axis(splitted_masks, class_args, axis=-1) return repeated_masks +def build_target_masks(true_masks, true_scores, H_mask, W_mask, num_classes): + """Build target masks by resizing and repeating ground truth masks. + + Resizes ground truth masks to the predicted tensor mask shape, and repeats + masks using the largest true score value. + + Args: + true_masks: tensor representing the ground truth masks. + true_scores: tensor with the class scores assigned by the label encoder. + num_classes: integer indicating the total number of classes. 
+ + Returns: + Tensor with resized and repeated target masks. + """ + true_masks = ops.image.resize(true_masks, (H_mask, W_mask), "nearest") + true_masks = repeat_masks(true_masks, true_scores, num_classes) + true_masks = ops.moveaxis(true_masks, 3, 1) + return true_masks + + +def compute_box_areas(boxes): + """Computes area for bounding boxes + + Args: + boxes: (N, 4) or (batch_size, N, 4) float tensor, either batched + or unbatched boxes. + + Returns: + a float Tensor of [N] or [batch_size, N] + """ + y_min, x_min, y_max, x_max = ops.split(boxes[..., :4], 4, axis=-1) + box_areas = ops.squeeze((y_max - y_min) * (x_max - x_min), axis=-1) + return box_areas + + +def normalize_box_areas(box_areas, H, W): + """Normalizes box areas by dividing by the total image area. + + Args: + boxes: tensor of shape (B, N, 4) with bounding boxes in xyxy format. + H: integer indicating the mask height. + W: integer indicating the mask width. + + Returns: + Tensor of shape (B, N, 4). + """ + return box_areas / (H * W) + + +def get_backbone_pyramid_layer_names(backbone, level_names): + """Gets actual layer names from the provided pyramid levels inside backbone. + + Args: + backbone: Keras backbone model with the field "pyramid_level_inputs". + level_names: list of strings indicating the level names. + + Returns: + List of layer strings indicating the layer names of each level. + """ + layer_names = [] + for level_name in level_names: + layer_names.append(backbone.pyramid_level_inputs[level_name]) + return layer_names + + +def build_feature_extractor(backbone, level_names): + """Builds feature extractor directly from the level names + + Args: + backbone: Keras backbone model with the field "pyramid_level_inputs". + level_names: list of strings indicating the level names. + + Returns: + Keras Model with level names as outputs. + """ + layer_names = get_backbone_pyramid_layer_names(backbone, level_names) + extractor = get_feature_extractor(backbone, layer_names, level_names) + return extractor + + +def extend_branches(inputs, extractor, FPN_depth): + """Extends extractor model with a feature pyramid network. + + Args: + inputs: tensor, with image input. + extractor: Keras Model with level names as outputs. + FPN_depth: integer representing the feature pyramid depth. + + Returns: + List of extended branch tensors. + """ + features = list(extractor(inputs).values()) + branches = apply_path_aggregation_fpn(features, FPN_depth, name="pa_fpn") + return branches + + +def extend_backbone(backbone, level_names, trainable, FPN_depth): + """Extends backbone levels with a feature pyramid network. + + Args: + backbone: Keras backbone model with the field "pyramid_level_inputs". + level_names: list of strings indicating the level names. + trainable: boolean indicating if backbone should be optimized. + FPN_depth: integer representing the feature pyramid depth. + + Return: + Tuple with input image tensor, and list of extended branch tensors. + """ + feature_extractor = build_feature_extractor(backbone, level_names) + feature_extractor.trainable = trainable + inputs = Input(feature_extractor.input_shape[1:]) + branches = extend_branches(inputs, feature_extractor, FPN_depth) + return inputs, branches + + +def add_no_op_for_pretty_print(x, name): + """Wrap tensor with dummy operation to change tensor name. + + # Args: + x: tensor. + name: string name given to the tensor. + + Return: + Tensor with new wrapped name. 
+    """
+    return Concatenate(axis=1, name=name)([x])
+
+
 def unpack_input(data):
+    """Unpacks a standard keras-cv data dictionary into inputs and outputs.
+
+    Args:
+        data: Dictionary with the standard keras-cv key-value pairs.
+
+    Returns:
+        Tuple containing inputs and outputs.
+    """
     classes = data["bounding_boxes"]["classes"]
     boxes = data["bounding_boxes"]["boxes"]
     segmentation_masks = data["segmentation_masks"]
@@ -212,6 +373,83 @@ def unpack_input(data):
     return data["images"], y
 
 
+def boxes_to_masks(boxes, H_mask, W_mask):
+    """Builds masks with True values inside each bounding box, False elsewhere.
+
+    Args:
+        boxes: tensor of shape (N, 4) with bounding boxes in xyxy format.
+        H_mask: integer indicating the height of the mask.
+        W_mask: integer indicating the width of the mask.
+
+    Returns:
+        Boolean masks of shape (N, H_mask, W_mask) with True values inside
+        each bounding box.
+    """
+    x_min, y_min, x_max, y_max = ops.split(boxes, 4, 1)
+
+    y_range = ops.arange(H_mask)
+    x_range = ops.arange(W_mask)
+    y_indices, x_indices = ops.meshgrid(y_range, x_range, indexing="ij")
+
+    y_indices = ops.expand_dims(y_indices, 0)
+    x_indices = ops.expand_dims(x_indices, 0)
+
+    x_min = ops.expand_dims(x_min, axis=1)
+    y_min = ops.expand_dims(y_min, axis=1)
+    x_max = ops.expand_dims(x_max, axis=1)
+    y_max = ops.expand_dims(y_max, axis=1)
+
+    in_x_min_to_x_max = ops.logical_and(x_indices >= x_min, x_indices < x_max)
+    in_y_min_to_y_max = ops.logical_and(y_indices >= y_min, y_indices < y_max)
+    masks = ops.logical_and(in_x_min_to_x_max, in_y_min_to_y_max)
+    return masks
+
+
+def batch_boxes_to_masks(boxes, H_mask, W_mask):
+    """Converts boxes to masks over the batch dimension.
+
+    Args:
+        boxes: tensor of shape (B, N, 4) with bounding boxes in xyxy format.
+        H_mask: integer indicating the height of the mask.
+        W_mask: integer indicating the width of the mask.
+
+    Returns:
+        Batch of masks with True values inside each bounding box.
+    """
+    batch_size = boxes.shape[0]
+    crop_masks = []
+    for batch_arg in range(batch_size):
+        boxes_sample = ops.cast(boxes[batch_arg], "int32")
+        crop_mask = boxes_to_masks(boxes_sample, H_mask, W_mask)
+        crop_masks.append(crop_mask[None])
+    crop_masks = ops.concatenate(crop_masks)
+    crop_masks = ops.cast(crop_masks, "float32")
+    return crop_masks
+
+
+def build_mask_weights(weight, boxes, H_mask, W_mask):
+    """Builds mask sample weights used to scale the loss at every batch.
+
+    To balance the loss of masks with different shapes, YOLACT assigns a
+    weight to each mask that is inversely proportional to its area.
+
+    Args:
+        weight: float, weight applied to the mask loss.
+        boxes: tensor of shape (B, N, 4) with bounding boxes in xyxy format.
+        H_mask: integer indicating the predicted mask height.
+        W_mask: integer indicating the predicted mask width.
+
+    Returns:
+        Tensor of shape [B, num_anchors, 1, 1] containing the mask weights.
+    """
+    box_areas = compute_box_areas(boxes)
+    box_areas = normalize_box_areas(box_areas, H_mask, W_mask)
+    weights = ops.divide_no_nan(weight, box_areas)
+    weights = weights / (H_mask * W_mask)
+    return weights[..., None, None]
+
+
 @keras_cv_export(
     [
         "keras_cv.models.YOLOV8Segmentation",
@@ -219,7 +457,59 @@ def unpack_input(data):
     ]
 )
 class YOLOV8Segmentation(Task):
-    """Implements the YOLOV8 architecture for instance segmentation."""
+    """Implements the YOLOV8 instance segmentation model.
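+
+    The model reuses the YOLOV8 detector components: backbone features from
+    levels "P3", "P4", and "P5" are fused by a path-aggregation FPN and fed
+    to the standard YOLOV8 box and class heads. Instance masks are produced
+    by a YOLACT-style head that predicts mask prototypes from the highest
+    resolution FPN branch and per-anchor mask coefficients from every branch,
+    combining them linearly into one mask per predicted box.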
+
+    Args:
+        backbone: `keras.Model`, must implement the `pyramid_level_inputs`
+            property with keys "P3", "P4", and "P5" and layer names as values.
+            A sensible backbone to use is the `keras_cv.models.YOLOV8Backbone`.
+        num_classes: integer, the number of classes in your dataset excluding
+            the background class. Classes should be represented by integers in
+            the range [0, num_classes).
+        bounding_box_format: string, the format of bounding boxes of input
+            dataset.
+        fpn_depth: integer, a specification of the depth of the CSP blocks in
+            the Feature Pyramid Network. This is usually 1, 2, or 3, depending
+            on the size of your YOLOV8Segmentation model. We recommend using 3
+            for "yolo_v8_l_backbone" and "yolo_v8_xl_backbone". Defaults to 2.
+        label_encoder: (Optional) A `YOLOV8LabelEncoder` that is
+            responsible for transforming input boxes into trainable labels for
+            YOLOV8Segmentation. If not provided, a default is provided.
+        prediction_decoder: (Optional) A `keras.layers.Layer` that is
+            responsible for transforming YOLOV8 predictions into usable
+            bounding boxes. If not provided, a default is provided. The
+            default `prediction_decoder` layer is a
+            `keras_cv.layers.NonMaxSuppression` layer, which uses
+            Non-Max Suppression for box pruning.
+        prototype_dimension: integer, inner number of channels used for mask
+            prototypes. Defaults to 256.
+        num_prototypes: integer, number of mask prototypes to build
+            predictions. Defaults to 32.
+        coefficient_dimension: integer, inner number of channels used for
+            predicting the mask coefficients. Defaults to 32.
+        trainable_backbone: boolean indicating if the provided backbone should
+            be trained as well. Defaults to False.
+
+    Example:
+    ```python
+    images = tf.ones(shape=(1, 512, 512, 3))
+
+    model = keras_cv.models.YOLOV8Segmentation(
+        num_classes=20,
+        bounding_box_format="xywh",
+        backbone=keras_cv.models.YOLOV8Backbone.from_preset(
+            "yolo_v8_m_backbone_coco"
+        ),
+        fpn_depth=2
+    )
+
+    # Evaluate model without box decoding and NMS
+    model(images)
+
+    # Prediction with box decoding and NMS
+    model.predict(images)
+    ```
+    """

     def __init__(
         self,
@@ -231,47 +521,31 @@ def __init__(
         prediction_decoder=None,
         prototype_dimension=256,
         num_prototypes=32,
+        coefficient_dimension=32,
+        trainable_backbone=False,
         **kwargs,
     ):
-        extractor_levels = ["P3", "P4", "P5"]
-        extractor_layer_names = [
-            backbone.pyramid_level_inputs[i] for i in extractor_levels
-        ]
-        feature_extractor = get_feature_extractor(
-            backbone, extractor_layer_names, extractor_levels
+        level_names = ["P3", "P4", "P5"]
+        images, branches = extend_backbone(
+            backbone, level_names, trainable_backbone, fpn_depth
         )
-
-        images = keras.layers.Input(feature_extractor.input_shape[1:])
-        features = list(feature_extractor(images).values())
-
-        branches = apply_path_aggregation_fpn(
-            features, fpn_depth, name="pa_fpn"
-        )
-
         masks = build_segmentation_head(
-            branches, prototype_dimension, num_prototypes
+            branches, prototype_dimension, num_prototypes, coefficient_dimension
        )
-
-        detection_head = build_YOLOV8_detection_head(branches, num_classes)
+        detection_head = apply_yolo_v8_head(branches, num_classes)
         boxes, classes = detection_head["boxes"], detection_head["classes"]
-
-        # TODO remove no-op layer to overwrite metric name for pretty printing.
- boxes = keras.layers.Concatenate(axis=1, name="box")([boxes]) - scores = keras.layers.Concatenate(axis=1, name="class")([classes]) - masks = keras.layers.Concatenate(axis=1, name="masks")([masks]) - - outputs = {"boxes": boxes, "classes": scores, "masks": masks} + boxes = add_no_op_for_pretty_print(boxes, "box") + masks = add_no_op_for_pretty_print(masks, "masks") + classes = add_no_op_for_pretty_print(classes, "class") + outputs = {"boxes": boxes, "classes": classes, "masks": masks} super().__init__(inputs=images, outputs=outputs, **kwargs) self.bounding_box_format = bounding_box_format - self._prediction_decoder = ( - prediction_decoder - or layers.NonMaxSuppression( - bounding_box_format=bounding_box_format, - from_logits=False, - confidence_threshold=0.2, - iou_threshold=0.7, - ) + self._prediction_decoder = prediction_decoder or NonMaxSuppression( + bounding_box_format=bounding_box_format, + from_logits=False, + confidence_threshold=0.2, + iou_threshold=0.7, ) self.backbone = backbone self.fpn_depth = fpn_depth @@ -279,6 +553,10 @@ def __init__( self.label_encoder = label_encoder or YOLOV8LabelEncoder( num_classes=num_classes ) + self.prototype_dimension = prototype_dimension + self.num_prototypes = num_prototypes + self.coefficient_dimension = coefficient_dimension + self.trainable_backbone = trainable_backbone def compile( self, @@ -291,7 +569,7 @@ def compile( metrics=None, **kwargs, ): - """Compiles the YOLOV8Detector. + """Compiles the YOLOV8Segmentation. `compile()` mirrors the standard Keras `compile()` method, but has one key distinction -- two losses must be provided: `box_loss` and @@ -332,9 +610,7 @@ def compile( ) if isinstance(classification_loss, str): if classification_loss == "binary_crossentropy": - classification_loss = keras.losses.BinaryCrossentropy( - reduction="sum" - ) + classification_loss = BinaryCrossentropy(reduction="sum") else: raise ValueError( "Invalid classification loss for YOLOV8Detector: " @@ -344,9 +620,7 @@ def compile( if isinstance(segmentation_loss, str): if segmentation_loss == "binary_crossentropy": - segmentation_loss = keras.losses.BinaryCrossentropy( - reduction="sum" - ) + segmentation_loss = BinaryCrossentropy(reduction="sum") else: raise ValueError( "Invalid segmentation loss for YOLOV8Detector: " @@ -413,50 +687,48 @@ def compute_loss(self, x, y, y_pred, sample_weight=None, **kwargs): target_bboxes /= stride_tensor target_scores_sum = ops.maximum(ops.sum(target_scores), 1) + box_weight = ops.expand_dims( ops.sum(target_scores, axis=-1) * fg_mask, axis=-1, ) - target_masks = y["segmentation_masks"] - _, num_priors, H_mask, W_mask = y_pred["masks"].shape - target_masks = ops.image.resize(target_masks, (H_mask, W_mask)) - target_masks = repeat_masks( - target_masks, target_scores, self.num_classes + true_masks = y["segmentation_masks"] + pred_masks = y_pred["masks"] + batch_size, _, H_mask, W_mask = pred_masks.shape + true_masks = build_target_masks( + true_masks, target_scores, H_mask, W_mask, self.num_classes ) - batch_size, H_mask, W_mask, num_anchors = target_masks.shape - target_masks = ops.reshape( - target_masks, (batch_size, num_anchors, H_mask, W_mask) + + crop_masks = batch_boxes_to_masks(target_bboxes, H_mask, W_mask) + H_image, W_image = x.shape[1:3] + mask_weights = build_mask_weights( + self.segmentation_loss_weight, target_bboxes, H_mask, W_mask ) y_true = { "box": target_bboxes * fg_mask[..., None], "class": target_scores, - "masks": target_masks * fg_mask[..., None, None], + "masks": true_masks * crop_masks * 
fg_mask[..., None, None], } y_pred = { "box": pred_bboxes * fg_mask[..., None], "class": pred_scores, - "masks": y_pred["masks"] * fg_mask[..., None, None], + "masks": pred_masks * crop_masks * fg_mask[..., None, None], } sample_weights = { "box": self.box_loss_weight * box_weight / target_scores_sum, "class": self.classification_loss_weight / target_scores_sum, - "masks": self.segmentation_loss_weight / target_scores_sum, + "masks": mask_weights, } return super().compute_loss( x=x, y=y_true, y_pred=y_pred, sample_weight=sample_weights, **kwargs ) - def decode_predictions( - self, - pred, - images, - ): + def decode_predictions(self, pred, images): boxes = pred["boxes"] scores = pred["classes"] - boxes = decode_regression_to_boxes(boxes) anchor_points, stride_tensor = get_anchors(image_shape=images.shape[1:]) @@ -474,10 +746,14 @@ def decode_predictions( def predict_step(self, *args): outputs = super().predict_step(*args) - if isinstance(outputs, tuple): - return self.decode_predictions(outputs[0], args[-1]), outputs[1] - else: - return self.decode_predictions(outputs, args[-1]) + decoded_outputs = self.decode_predictions(outputs, args[-1]) + selected_args = decoded_outputs["idx"][..., None, None] + masks = outputs["masks"] + masks = ops.take_along_axis(masks, selected_args, axis=1) + is_valid_output = decoded_outputs["confidence"] > -1 + masks = ops.where(is_valid_output[..., None, None], masks, -1) + decoded_outputs["masks"] = masks + return decoded_outputs @property def prediction_decoder(self): @@ -501,16 +777,20 @@ def prediction_decoder(self, prediction_decoder): def get_config(self): return { + "backbone": keras.saving.serialize_keras_object(self.backbone), "num_classes": self.num_classes, "bounding_box_format": self.bounding_box_format, "fpn_depth": self.fpn_depth, - "backbone": keras.saving.serialize_keras_object(self.backbone), "label_encoder": keras.saving.serialize_keras_object( self.label_encoder ), "prediction_decoder": keras.saving.serialize_keras_object( self._prediction_decoder ), + "prototype_dimension": self.prototype_dimension, + "num_prototypes": self.num_prototypes, + "coefficient_dimension": self.coefficient_dimension, + "trainable_backbone": self.trainable_backbone, } @classmethod From 6a5e0bd9d12b4450a83f315498966e3b95d0ed8f Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Wed, 11 Sep 2024 07:41:43 +0200 Subject: [PATCH 07/12] Add yolo v8 segmentation to GPU tests --- .kokoro/github/ubuntu/gpu/build.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.kokoro/github/ubuntu/gpu/build.sh b/.kokoro/github/ubuntu/gpu/build.sh index a19b109f82..6855b8f897 100644 --- a/.kokoro/github/ubuntu/gpu/build.sh +++ b/.kokoro/github/ubuntu/gpu/build.sh @@ -70,7 +70,8 @@ then keras_cv/src/models/object_detection_3d \ keras_cv/src/models/segmentation \ keras_cv/src/models/feature_extractor/clip \ - keras_cv/src/models/stable_diffusion + keras_cv/src/models/stable_diffusion \ + keras_cv/src/models/segmentation/yolo_v8_segmentation else pytest --cache-clear --check_gpu --run_large --durations 0 \ keras_cv/src/bounding_box \ @@ -85,5 +86,6 @@ else keras_cv/src/models/object_detection_3d \ keras_cv/src/models/segmentation \ keras_cv/src/models/feature_extractor/clip \ - keras_cv/src/models/stable_diffusion -fi \ No newline at end of file + keras_cv/src/models/stable_diffusion \ + keras_cv/src/models/segmentation/yolo_v8_segmentation +fi From 0b5cbdef6312fd475b70bb8e1d9810c589d2d744 Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Wed, 11 Sep 2024 
07:50:32 +0200 Subject: [PATCH 08/12] Fix overwrite of API for label encoder --- .../segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py index 9cee3bf146..0c48b75e4e 100644 --- a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py @@ -19,7 +19,6 @@ import tensorflow as tf from keras_cv.src import bounding_box -from keras_cv.src.api_export import keras_cv_export from keras_cv.src.backend import keras from keras_cv.src.backend import ops from keras_cv.src.bounding_box.iou import compute_ciou @@ -35,7 +34,6 @@ def is_anchor_center_within_box(anchors, gt_bboxes): ) -@keras_cv_export("keras_cv.models.yolov8.LabelEncoder") class YOLOV8LabelEncoder(keras.layers.Layer): """ Encodes ground truth boxes to target boxes and class labels for training a From a337643232daf28c754988d51efe2c5f3feab6f7 Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Wed, 11 Sep 2024 07:51:26 +0200 Subject: [PATCH 09/12] Add to API YOLOV8Segmentation model --- keras_cv/api/models/__init__.py | 3 +++ keras_cv/api/models/segmentation/__init__.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/keras_cv/api/models/__init__.py b/keras_cv/api/models/__init__.py index 97f9bc577b..ec56460feb 100644 --- a/keras_cv/api/models/__init__.py +++ b/keras_cv/api/models/__init__.py @@ -253,6 +253,9 @@ from keras_cv.src.models.segmentation.segment_anything.sam_transformer import ( TwoWayTransformer, ) +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_segmentation import ( + YOLOV8Segmentation, +) from keras_cv.src.models.stable_diffusion.stable_diffusion import ( StableDiffusion, ) diff --git a/keras_cv/api/models/segmentation/__init__.py b/keras_cv/api/models/segmentation/__init__.py index 9f5276304b..f111e7c9c0 100644 --- a/keras_cv/api/models/segmentation/__init__.py +++ b/keras_cv/api/models/segmentation/__init__.py @@ -12,3 +12,6 @@ from keras_cv.src.models.segmentation.segment_anything.sam import ( SegmentAnythingModel, ) +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_segmentation import ( + YOLOV8Segmentation, +) From 9b5a635406c59054c5b174db7715bbf863725248 Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Wed, 11 Sep 2024 07:55:37 +0200 Subject: [PATCH 10/12] Fix linter E501 warnings with imports --- .../yolo_v8_segmentation/yolo_v8_segmentation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py index 869bae1594..dcf625df0a 100644 --- a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_segmentation.py @@ -41,16 +41,16 @@ from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( # noqa: E501 apply_path_aggregation_fpn, ) -from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( # noqa: E501 apply_yolo_v8_head, ) -from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( +from 
keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( # noqa: E501 decode_regression_to_boxes, ) -from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( # noqa: E501 dist2bbox, ) -from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( +from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_backbone import ( # noqa: E501 get_anchors, ) from keras_cv.src.models.segmentation.yolo_v8_segmentation.yolo_v8_label_encoder import ( # noqa: E501 From 33d9014102cc0c37b9c42c5637477bddf90e9280 Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Tue, 17 Sep 2024 20:19:02 +0200 Subject: [PATCH 11/12] Remove TODO --- .../segmentation/yolo_v8_segmentation/yolo_v8_backbone.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py index 4719096d3c..5a6af54566 100644 --- a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_backbone.py @@ -19,8 +19,6 @@ BOX_REGRESSION_CHANNELS = 64 -# TODO(ianstenbit): Remove this method once we're using CSPDarkNet backbone -# (Calls to it should be inlined in the detector head) def apply_conv_bn( inputs, output_channel, From 5f551a3a4b1ee2184681716cbe2fc87c62c3af60 Mon Sep 17 00:00:00 2001 From: Octavio Arriaga Date: Tue, 17 Sep 2024 20:20:33 +0200 Subject: [PATCH 12/12] Move docstring to class description --- .../yolo_v8_segmentation/yolo_v8_label_encoder.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py index 0c48b75e4e..4eebd27535 100644 --- a/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py +++ b/keras_cv/src/models/segmentation/yolo_v8_segmentation/yolo_v8_label_encoder.py @@ -11,10 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Label encoder for YOLOV8. This uses the TOOD Task Aligned Assigner approach. -See https://arxiv.org/abs/2108.07755 for more info, as well as a reference -implementation at https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py -""" # noqa: E501 import tensorflow as tf @@ -37,8 +33,10 @@ def is_anchor_center_within_box(anchors, gt_bboxes): class YOLOV8LabelEncoder(keras.layers.Layer): """ Encodes ground truth boxes to target boxes and class labels for training a - YOLOV8 model. This is an implementation of the Task-aligned sample - assignment scheme proposed in https://arxiv.org/abs/2108.07755. + YOLOV8 model. This uses the TOOD Task Aligned Assigner approach. + See https://arxiv.org/abs/2108.07755 for more info, as well as a reference + implementation at https://github.com/fcjian/TOOD/blob/master/ + mmdet/core/bbox/assigners/task_aligned_assigner.py Args: num_classes: integer, the number of classes in the training dataset