From d5d756221f269c3fde8267ded682e5f8ec577b5d Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Sun, 26 Jul 2020 01:03:51 -0700 Subject: [PATCH 1/6] add nms layer refactor --- src/ml/neural_net/model_spec.cpp | 43 +++- src/ml/neural_net/model_spec.hpp | 24 ++ .../neural_net_models_exporter.cpp | 230 ++++++++---------- .../neural_net_models_exporter.hpp | 18 +- .../object_detection/object_detector.cpp | 60 ++--- .../od_darknet_yolo_model_trainer.cpp | 41 ++-- .../od_darknet_yolo_model_trainer.hpp | 18 +- .../object_detection/od_model_trainer.hpp | 55 ++--- .../object_detection/od_serialization.cpp | 76 ++++-- .../object_detection/od_serialization.hpp | 11 +- src/toolkits/object_detection/od_yolo.cpp | 92 ++++++- src/toolkits/object_detection/od_yolo.hpp | 38 ++- 12 files changed, 434 insertions(+), 272 deletions(-) diff --git a/src/ml/neural_net/model_spec.cpp b/src/ml/neural_net/model_spec.cpp index f95e60bea6..96512dbbbd 100644 --- a/src/ml/neural_net/model_spec.cpp +++ b/src/ml/neural_net/model_spec.cpp @@ -38,6 +38,7 @@ using CoreML::Specification::NeuralNetwork; using CoreML::Specification::NeuralNetworkImageScaler; using CoreML::Specification::NeuralNetworkLayer; using CoreML::Specification::NeuralNetworkPreprocessing; +using CoreML::Specification::NonMaximumSuppressionLayerParams; using CoreML::Specification::PaddingLayerParams; using CoreML::Specification::PaddingLayerParams_PaddingConstant; using CoreML::Specification::Pipeline; @@ -45,6 +46,7 @@ using CoreML::Specification::PoolingLayerParams; using CoreML::Specification::ReshapeDynamicLayerParams; using CoreML::Specification::ReshapeStaticLayerParams; using CoreML::Specification::SamePadding; +using CoreML::Specification::SliceDynamicLayerParams; using CoreML::Specification::SplitNDLayerParams; using CoreML::Specification::SqueezeLayerParams; using CoreML::Specification::TransposeLayerParams; @@ -105,7 +107,7 @@ void update_weight_params(const std::string& name, const float_array& value, Wei } Span 
out(value.data(), value.size()); - + weights->mutable_floatvalue()->begin(); #ifdef TURI_USE_FLOAT16 if (use_quantization && is_convertible_to_fp16(out)) { @@ -1113,13 +1115,13 @@ void model_spec::add_expand_dims(const std::string& name, layer->set_name(name); layer->add_input(input); auto* inputTensor = layer->add_inputtensor(); - inputTensor->set_rank(static_cast(inputVector.size())); + inputTensor->set_rank(inputVector.size()); for (size_t i = 0; i < inputVector.size(); ++i) { inputTensor->add_dimvalue(inputVector[i]); } layer->add_output(name); auto* outputTensor = layer->add_outputtensor(); - outputTensor->set_rank(static_cast(outputVector.size())); + outputTensor->set_rank(outputVector.size()); for (size_t i = 0; i < outputVector.size(); ++i) { outputTensor->add_dimvalue(outputVector[i]); } @@ -1137,13 +1139,13 @@ void model_spec::add_squeeze(const std::string& name, const std::string& input, layer->set_name(name); layer->add_input(input); auto* inputTensor = layer->add_inputtensor(); - inputTensor->set_rank(static_cast(inputVector.size())); + inputTensor->set_rank(inputVector.size()); for (size_t i = 0; i < inputVector.size(); ++i) { inputTensor->add_dimvalue(inputVector[i]); } layer->add_output(name); auto* outputTensor = layer->add_outputtensor(); - outputTensor->set_rank(static_cast(outputVector.size())); + outputTensor->set_rank(outputVector.size()); for (size_t i = 0; i < outputVector.size(); ++i) { outputTensor->add_dimvalue(outputVector[i]); } @@ -1200,6 +1202,37 @@ void model_spec::add_get_shape(const std::string& name, layer->mutable_getshape(); } +void model_spec::add_nms_layer(const std::string& name, const std::vector& inputs, + const std::vector& outputs, float iou_threshold, + float confidence_threshold, size_t max_boxes, + bool per_class_supression) +{ + NeuralNetworkLayer* layer = impl_->add_layers(); + layer->set_name(name); + for (const std::string& input : inputs) { + layer->add_input(input); + } + for (const std::string& output : outputs) 
{ + layer->add_output(output); + } + NonMaximumSuppressionLayerParams* nms_params = layer->mutable_nonmaximumsuppression(); + nms_params->set_iouthreshold(iou_threshold); + nms_params->set_scorethreshold(confidence_threshold); + nms_params->set_maxboxes(static_cast<::_tc_google::protobuf::uint64>(max_boxes)); + nms_params->set_perclasssuppression(per_class_supression); +} + +void model_spec::add_slice_dynamic(const std::string& name, const std::vector& inputs) +{ + NeuralNetworkLayer* layer = impl_->add_layers(); + layer->set_name(name); + for (const std::string& input : inputs) { + layer->add_input(input); + } + layer->add_output(name); + layer->mutable_slicedynamic(); +} + pipeline_spec::pipeline_spec(std::unique_ptr impl) : impl_(std::move(impl)) {} diff --git a/src/ml/neural_net/model_spec.hpp b/src/ml/neural_net/model_spec.hpp index aff6d21f29..fc58daf190 100644 --- a/src/ml/neural_net/model_spec.hpp +++ b/src/ml/neural_net/model_spec.hpp @@ -530,6 +530,30 @@ class model_spec { */ void add_get_shape(const std::string& name, const std::string& input); + /** + * Appends dynamic slicing. + * + * \param name The name of the layer and its output + * \param inputs The names of the layer's inputs + */ + void add_slice_dynamic(const std::string& name, const std::vector& inputs); + + /** + * Appends a non maximum suppression layer. + * + * \param name The name of the layer and its output + * \param inputs The names of the layer's inputs + * \param outputs The outputs of the layer + * \param iou_threshold The default value for the iou threshold + * \param confidence_threshold The default value for the confidence threshold + * \param max_boxes The maximum number of boxes you want NMS to run + * \param per_class_suppression When false, suppression happens for all + * classes. 
+ */ + void add_nms_layer(const std::string& name, const std::vector& inputs, + const std::vector& outputs, float iou_threshold, + float confidence_threshold, size_t max_boxes, bool per_class_supression); + private: std::unique_ptr impl_; }; diff --git a/src/toolkits/coreml_export/neural_net_models_exporter.cpp b/src/toolkits/coreml_export/neural_net_models_exporter.cpp index 9a8ba6a302..90631e565c 100644 --- a/src/toolkits/coreml_export/neural_net_models_exporter.cpp +++ b/src/toolkits/coreml_export/neural_net_models_exporter.cpp @@ -25,30 +25,13 @@ using CoreML::Specification::NonMaximumSuppressionLayerParams; using CoreML::Specification::SizeRange; using turi::coreml::MLModelWrapper; - namespace turi { namespace { -constexpr size_t MAX_NUM_BOXES_FOR_NMS_LAYER = 64; - constexpr char CONFIDENCE_STR[] = "Boxes × Class confidence (see user-defined metadata \"classes\")"; constexpr char COORDINATES_STR[] = "Boxes × [x, y, width, height] (relative to image size)"; -std::string iou_threshold_description(float default_value) { - std::stringstream ss; - ss << "The maximum allowed overlap (as intersection-over-union ratio) for any" - << " pair of output bounding boxes (default: " << default_value << ")"; - return ss.str(); -} - -std::string confidence_threshold_description(float default_value) { - std::stringstream ss; - ss << "The minimum confidence score for an output bounding box" - << " (default: " << default_value << ")"; - return ss.str(); -} - void set_string_feature(FeatureDescription* feature_desc, std::string name, std::string short_description) { @@ -65,25 +48,6 @@ void set_int64_feature(FeatureDescription* feature_desc, std::string name, feature_desc->mutable_type()->mutable_int64type(); } -void set_array_feature(FeatureDescription* feature_desc, std::string name, - std::string short_description, - const std::vector& shape) -{ - // Set string values. 
- feature_desc->set_name(std::move(name)); - feature_desc->set_shortdescription(std::move(short_description)); - - // Set shape. - ArrayFeatureType* array = - feature_desc->mutable_type()->mutable_multiarraytype(); - for (size_t s : shape) { - array->add_shape(s); - } - - // Set data type. - array->set_datatype(ArrayFeatureType::DOUBLE); -} - void set_dictionary_string_feature(FeatureDescription* feature_desc, std::string name, std::string short_description) @@ -101,8 +65,9 @@ void set_feature_optional(FeatureDescription* feature_desc) { } void set_predictions_feature(FeatureDescription* feature_desc, std::string feature_name, - size_t num_predictions, size_t num_classes, bool include_shape, bool use_flexible_shape, - std::string short_desc) { + size_t num_predictions, size_t num_classes, bool include_shape, + bool use_flexible_shape, std::string short_desc, bool is_double = true) +{ feature_desc->set_name(feature_name); if (!short_desc.empty()) @@ -113,7 +78,11 @@ void set_predictions_feature(FeatureDescription* feature_desc, std::string featu feature_desc_feature->add_shape(num_predictions); feature_desc_feature->add_shape(num_classes); } - feature_desc_feature->set_datatype(ArrayFeatureType::DOUBLE); + if (is_double) { + feature_desc_feature->set_datatype(ArrayFeatureType::DOUBLE); + } else { + feature_desc_feature->set_datatype(ArrayFeatureType::FLOAT32); + } if (use_flexible_shape) { auto *shape1 = feature_desc_feature->mutable_shaperange() @@ -168,23 +137,22 @@ ImageFeatureType* set_image_feature( return image_feature; } -void set_non_maximum_suppression_model(Model* model_nms, - ModelDescription* pipeline_desc, +void set_non_maximum_suppression_model(Model* model_nms, ModelDescription* pipeline_desc, float num_classes, float num_predictions, - const flex_list& class_labels, - float confidence_threshold, - float iou_threshold) { + const flex_list& class_labels, float confidence_threshold, + float iou_threshold, bool use_most_confident_class) +{ 
model_nms->set_specificationversion(CoreML::MLMODEL_SPECIFICATION_VERSION); ModelDescription* nms_desc = model_nms->mutable_description(); // Write FeatureDescription for the Raw Confidence input. set_predictions_feature(nms_desc->add_input(), "raw_confidence", num_predictions, num_classes, - true, true, ""); + true, true, ""); // Write FeatureDescription for the Raw Coordinates input. - set_predictions_feature(nms_desc->add_input(), "raw_coordinates", num_predictions, 4, - true, true, ""); + set_predictions_feature(nms_desc->add_input(), "raw_coordinates", num_predictions, 4, true, true, + ""); // Write FeatureDescription for the IOU Threshold input. set_threshold_feature(nms_desc->add_input(), "iouThreshold", ""); @@ -193,12 +161,12 @@ void set_non_maximum_suppression_model(Model* model_nms, set_threshold_feature(nms_desc->add_input(), "confidenceThreshold", ""); // Write FeatureDescription for the Confidence output. - set_predictions_feature(nms_desc->add_output(), "confidence", num_predictions, num_classes, - false, true, CONFIDENCE_STR); + set_predictions_feature(nms_desc->add_output(), "confidence", num_predictions, num_classes, false, + true, CONFIDENCE_STR); // Write FeatureDescription for the Coordinates input. - set_predictions_feature(nms_desc->add_output(), "coordinates", num_predictions, 4, - false, true, COORDINATES_STR); + set_predictions_feature(nms_desc->add_output(), "coordinates", num_predictions, 4, false, true, + COORDINATES_STR); CoreML::Specification::NonMaximumSuppression* first_layer_nms = model_nms->mutable_nonmaximumsuppression(); @@ -219,6 +187,8 @@ void set_non_maximum_suppression_model(Model* model_nms, first_layer_nms->set_confidenceoutputfeaturename("confidence"); first_layer_nms->set_coordinatesoutputfeaturename("coordinates"); + first_layer_nms->mutable_picktop()->set_perclass(use_most_confident_class); + // Write FeatureDescription for the IOU Threshold input. 
FeatureDescription* iou_threshold_desc = pipeline_desc->add_input(); set_threshold_feature(iou_threshold_desc, "iouThreshold", @@ -233,57 +203,23 @@ void set_non_maximum_suppression_model(Model* model_nms, // Write FeatureDescription for the Confidence output. set_predictions_feature(pipeline_desc->add_output(), "confidence", num_predictions, num_classes, - false, true, CONFIDENCE_STR); + false, true, CONFIDENCE_STR); // Write FeatureDescription for the Coordinates output. - set_predictions_feature(pipeline_desc->add_output(), "coordinates", num_predictions, 4, - false, true, COORDINATES_STR); + set_predictions_feature(pipeline_desc->add_output(), "coordinates", num_predictions, 4, false, + true, COORDINATES_STR); } -void add_non_maximum_suppression_layer(Model* model_nn, - ModelDescription* pipeline_desc, - size_t num_classes, size_t max_boxes, - float confidence_threshold, - float iou_threshold) { - // The model we're modifying must be a NeuralNetwork. - ASSERT_TRUE(model_nn->has_neuralnetwork()); - - // Append the actual NMS layer. - NeuralNetworkLayer* nms_layer = - model_nn->mutable_neuralnetwork()->add_layers(); - nms_layer->set_name("nonmaximumsuppression"); - - // Name the inputs and outputs. - nms_layer->add_input("raw_coordinates"); - nms_layer->add_input("raw_confidence"); - nms_layer->add_input("iouThreshold"); - nms_layer->add_input("confidenceThreshold"); - nms_layer->add_output("coordinates"); - nms_layer->add_output("confidence"); - nms_layer->add_output("indicesOfBoxes"); - nms_layer->add_output("numberOfBoxes"); - - // Write the parameters of the NMS layer. 
- NonMaximumSuppressionLayerParams* nms_params = - nms_layer->mutable_nonmaximumsuppression(); - nms_params->set_iouthreshold(iou_threshold); - nms_params->set_scorethreshold(confidence_threshold); - nms_params->set_maxboxes( - static_cast<::_tc_google::protobuf::uint64>(max_boxes)); - nms_params->set_perclasssuppression(false); - - // Add the necessary feature descriptions to both the NN model and to the - // overall pipeline. - - // Adjust the model description to reflect the new inputs and outputs. - ModelDescription* model_desc = model_nn->mutable_description(); - +void set_non_maximum_suppression_layer_description(ModelDescription* pipeline_desc, + float num_classes, float num_predictions, + const flex_list& class_labels, + float confidence_threshold, float iou_threshold) +{ // Write FeatureDescription for the IOU Threshold input. FeatureDescription* iou_threshold_desc = pipeline_desc->add_input(); set_array_feature(iou_threshold_desc, "iouThreshold", iou_threshold_description(iou_threshold), {1}); set_feature_optional(iou_threshold_desc); - model_desc->add_input()->CopyFrom(*iou_threshold_desc); // Write FeatureDescription for the Confidence Threshold input. FeatureDescription* confidence_threshold_desc = pipeline_desc->add_input(); @@ -291,42 +227,77 @@ void add_non_maximum_suppression_layer(Model* model_nn, confidence_threshold_description(confidence_threshold), {1}); set_feature_optional(confidence_threshold_desc); - model_desc->add_input()->CopyFrom(*confidence_threshold_desc); // Write FeatureDescription for the Confidence output. - FeatureDescription* confidence_desc = pipeline_desc->add_output(); - set_predictions_feature(confidence_desc, "confidence", max_boxes, num_classes, - false, true, CONFIDENCE_STR); - model_desc->add_output()->CopyFrom(*confidence_desc); + set_predictions_feature(pipeline_desc->add_output(), "confidence", num_predictions, num_classes, + false, true, CONFIDENCE_STR, false); // Write FeatureDescription for the Coordinates output. 
- FeatureDescription* coordinates_desc = pipeline_desc->add_output(); - set_predictions_feature(coordinates_desc, "coordinates", max_boxes, 4, false, - true, COORDINATES_STR); - model_desc->add_output()->CopyFrom(*coordinates_desc); - - // Write FeatureDescription for the numberOfBoxes output. - FeatureDescription* number_of_boxes_desc = pipeline_desc->add_output(); - set_array_feature(number_of_boxes_desc, "numberOfBoxes", - "The number of valid output bounding boxes", {1}); - model_desc->add_output()->CopyFrom(*number_of_boxes_desc); - - // Write FeatureDescription for the indicesOfBoxes output. - FeatureDescription* indices_of_boxes_desc = pipeline_desc->add_output(); - set_array_feature(indices_of_boxes_desc, "indicesOfBoxes", - "For each output bounding box, the index of the " - "corresponding input bounding box", - {max_boxes}); - model_desc->add_output()->CopyFrom(*indices_of_boxes_desc); + set_predictions_feature(pipeline_desc->add_output(), "coordinates", num_predictions, 4, false, + true, COORDINATES_STR, false); } } // namespace +std::string iou_threshold_description(float default_value) +{ + std::stringstream ss; + ss << "The maximum allowed overlap (as intersection-over-union ratio) for any" + << " pair of output bounding boxes (default: " << default_value << ")"; + return ss.str(); +} + +std::string confidence_threshold_description(float default_value) +{ + std::stringstream ss; + ss << "The minimum confidence score for an output bounding box" + << " (default: " << default_value << ")"; + return ss.str(); +} + +void set_threshold_array_feature(FeatureDescription* feature_desc, std::string name, + std::string short_description, const std::vector& shape, + float value) +{ + // Set string values. + feature_desc->set_name(std::move(name)); + feature_desc->set_shortdescription(std::move(short_description)); + + // Set shape. 
+ ArrayFeatureType* array = feature_desc->mutable_type()->mutable_multiarraytype(); + feature_desc->mutable_type()->set_isoptional(true); + // Set data type. + array->set_datatype(ArrayFeatureType::DOUBLE); + + for (size_t s : shape) { + array->add_shape(s); + } + // array->set_doubledefaultvalue(value); +} + +void set_array_feature(FeatureDescription* feature_desc, std::string name, + std::string short_description, const std::vector& shape) +{ + // Set string values. + feature_desc->set_name(std::move(name)); + feature_desc->set_shortdescription(std::move(short_description)); + + // Set shape. + ArrayFeatureType* array = feature_desc->mutable_type()->mutable_multiarraytype(); + + // Set data type. + array->set_datatype(ArrayFeatureType::DOUBLE); + + for (size_t s : shape) { + array->add_shape(s); + } +} + std::shared_ptr export_object_detector_model( - neural_net::pipeline_spec raw_pipeline, size_t num_classes, - size_t num_predictions, flex_list class_labels, float confidence_threshold, - float iou_threshold, bool include_non_maximum_suppression, - bool use_nms_layer) { + neural_net::pipeline_spec raw_pipeline, size_t num_classes, size_t num_predictions, + flex_list class_labels, float confidence_threshold, float iou_threshold, + bool include_non_maximum_suppression, bool use_nms_layer, bool use_most_confident_class) +{ // Set up Pipeline CoreML::Specification::Model model_pipeline; model_pipeline.set_specificationversion( @@ -358,15 +329,18 @@ std::shared_ptr export_object_detector_model( ASSERT_GT(num_models, 0); Model* model_nn = model_pipeline.mutable_pipeline()->mutable_models(num_models - 1); - add_non_maximum_suppression_layer(model_nn, pipeline_desc, num_classes, - MAX_NUM_BOXES_FOR_NMS_LAYER, - confidence_threshold, iou_threshold); + ASSERT_TRUE(model_nn->has_neuralnetwork()); + + set_non_maximum_suppression_layer_description(pipeline_desc, num_classes, num_predictions, + class_labels, confidence_threshold, + iou_threshold); + } else { // Add Non Maximum 
Suppression model to pipeline auto* model_nms = model_pipeline.mutable_pipeline()->add_models(); - set_non_maximum_suppression_model(model_nms, pipeline_desc, num_classes, - num_predictions, class_labels, - confidence_threshold, iou_threshold); + set_non_maximum_suppression_model(model_nms, pipeline_desc, num_classes, num_predictions, + class_labels, confidence_threshold, iou_threshold, + use_most_confident_class); } // Wrap the pipeline @@ -383,7 +357,7 @@ std::shared_ptr export_activity_classifier_model( const flex_list& class_labels, const flex_string& target) { CoreML::Specification::Model model; - model.set_specificationversion(CoreML::MLMODEL_SPECIFICATION_VERSION); + model.set_specificationversion(1); // Write the model description. ModelDescription* model_desc = model.mutable_description(); @@ -503,7 +477,7 @@ std::shared_ptr export_drawing_classifier_model( const flex_list& class_labels, const flex_string& target) { CoreML::Specification::Model model; - model.set_specificationversion(CoreML::MLMODEL_SPECIFICATION_VERSION); + model.set_specificationversion(1); // Write the model description. 
ModelDescription* model_desc = model.mutable_description(); diff --git a/src/toolkits/coreml_export/neural_net_models_exporter.hpp b/src/toolkits/coreml_export/neural_net_models_exporter.hpp index 11d37aca27..71d2981623 100644 --- a/src/toolkits/coreml_export/neural_net_models_exporter.hpp +++ b/src/toolkits/coreml_export/neural_net_models_exporter.hpp @@ -13,10 +13,21 @@ #include #include +#include #include namespace turi { +std::string iou_threshold_description(float default_value); +std::string confidence_threshold_description(float default_value); + +void set_threshold_array_feature(CoreML::Specification::FeatureDescription* feature_desc, + std::string name, std::string short_description, + const std::vector& shape, float value); + +void set_array_feature(CoreML::Specification::FeatureDescription* feature_desc, std::string name, + std::string short_description, const std::vector& shape); + /** * Wraps a trained object detector model_spec as a complete MLModel. * @@ -28,10 +39,9 @@ namespace turi { * is responsible for populating the inputs and outputs? */ std::shared_ptr export_object_detector_model( - neural_net::pipeline_spec pipeline, size_t num_classes, - size_t num_predictions, flex_list class_labels, float confidence_threshold, - float iou_threshold, bool include_non_maximum_suppression, - bool use_nms_layer); + neural_net::pipeline_spec pipeline, size_t num_classes, size_t num_predictions, + flex_list class_labels, float confidence_threshold, float iou_threshold, + bool include_non_maximum_suppression, bool use_nms_layer, bool use_most_confident_class); /** Wraps a trained activity classifier model_spec as a complete MLModel. 
*/ std::shared_ptr export_activity_classifier_model( diff --git a/src/toolkits/object_detection/object_detector.cpp b/src/toolkits/object_detection/object_detector.cpp index 4ec4875bbe..186e55799d 100644 --- a/src/toolkits/object_detection/object_detector.cpp +++ b/src/toolkits/object_detection/object_detector.cpp @@ -408,17 +408,18 @@ variant_type object_detector::evaluate(gl_sframe data, std::string metric, } // If called during training, synchronize the model first. const Checkpoint& checkpoint = read_checkpoint(); + CheckpointMetadata checkpoint_info = checkpoint.GetCheckpointMetadata(); //parse input opts float confidence_threshold, iou_threshold; auto it_confidence = opts.find("confidence_threshold"); if (it_confidence == opts.end()){ - confidence_threshold = checkpoint.GetEvaluateConfidence(); + confidence_threshold = checkpoint_info.evaluate_confidence; } else { confidence_threshold = opts["confidence_threshold"]; } auto it_iou = opts.find("iou_threshold"); if (it_iou == opts.end()){ - iou_threshold = checkpoint.GetNonMaximumSuppressionThreshold(); + iou_threshold = checkpoint_info.nms_threshold; } else { iou_threshold = opts["iou_threshold"]; } @@ -519,6 +520,8 @@ variant_type object_detector::predict( variant_type data, std::map opts) { // If called during training, synchronize the model first. 
const Checkpoint& checkpoint = read_checkpoint(); + CheckpointMetadata checkpoint_info = checkpoint.GetCheckpointMetadata(); + gl_sarray_writer result(flex_type_enum::LIST, 1); auto consumer = [&](const std::vector& predicted_row, @@ -553,13 +556,13 @@ variant_type object_detector::predict( float confidence_threshold, iou_threshold; auto it_confidence = opts.find("confidence_threshold"); if (it_confidence == opts.end()){ - confidence_threshold = checkpoint.GetPredictConfidence(); + confidence_threshold = checkpoint_info.predict_confidence; } else { confidence_threshold = opts["confidence_threshold"]; } auto it_iou = opts.find("iou_threshold"); if (it_iou == opts.end()){ - iou_threshold = checkpoint.GetNonMaximumSuppressionThreshold(); + iou_threshold = checkpoint_info.nms_threshold; } else { iou_threshold = opts["iou_threshold"]; } @@ -711,24 +714,20 @@ std::shared_ptr object_detector::export_to_coreml( { // If called during training, synchronize the model first. const Checkpoint& checkpoint = read_checkpoint(); + CheckpointMetadata checkpoint_info = checkpoint.GetCheckpointMetadata(); std::string input_str = read_state("feature"); - std::string coordinates_str = "coordinates"; - std::string confidence_str = "confidence"; // No options provided defaults to include Non Maximum Suppression. bool include_non_maximum_suppression = true; bool use_nms_layer = false; - float iou_threshold = checkpoint.GetNonMaximumSuppressionThreshold(); - float confidence_threshold = checkpoint.GetPredictConfidence(); + float iou_threshold = checkpoint_info.nms_threshold; + float confidence_threshold = checkpoint_info.predict_confidence; auto opts_it = opts.find("include_non_maximum_suppression"); if (opts_it != opts.end()) { include_non_maximum_suppression = opts_it->second.to(); } if (include_non_maximum_suppression) { - coordinates_str = "raw_coordinates"; - confidence_str = "raw_confidence"; - // Read user-provided options. 
opts_it = opts.find("iou_threshold"); if (opts_it != opts.end()) { @@ -744,6 +743,14 @@ std::shared_ptr object_detector::export_to_coreml( } } + bool use_nms_model = include_non_maximum_suppression && !(use_nms_layer); + const std::string coordinates_name = use_nms_model ? "raw_coordinates" : "coordinates"; + const std::string confidence_name = use_nms_model ? "raw_confidence" : "confidence"; + + neural_net::pipeline_spec spec = + checkpoint.ExportToCoreML(input_str, coordinates_name, confidence_name, use_nms_layer, + iou_threshold, confidence_threshold); + // Compute the string representation of the list of class labels. flex_string class_labels_str; flex_list class_labels = read_state("classes"); @@ -759,7 +766,7 @@ std::shared_ptr object_detector::export_to_coreml( if (show_metadata) { // Generate "user-defined" metadata. user_defined_metadata = { - {"model", checkpoint.GetModelType()}, + {"model", checkpoint_info.model_type}, {"max_iterations", read_state("max_iterations")}, {"training_iterations", read_state("training_iterations")}, {"include_non_maximum_suppression", "False"}, @@ -784,19 +791,13 @@ std::shared_ptr object_detector::export_to_coreml( user_defined_metadata = {{"iterations", read_state("training_iterations")}}; } - neural_net::pipeline_spec spec = - checkpoint.ExportToCoreML(input_str, coordinates_str, confidence_str); - std::shared_ptr model_wrapper = export_object_detector_model( - std::move(spec), class_labels.size(), - checkpoint.GetNumberOfPredictions(), std::move(class_labels), - confidence_threshold, iou_threshold, include_non_maximum_suppression, - use_nms_layer); - - model_wrapper->add_metadata({ - {"user_defined", std::move(user_defined_metadata)}, - {"short_description", short_desc} - }); + std::move(spec), class_labels.size(), checkpoint_info.num_predictions, + std::move(class_labels), confidence_threshold, iou_threshold, include_non_maximum_suppression, + use_nms_layer, checkpoint_info.use_most_confident_class); + + 
model_wrapper->add_metadata( + {{"user_defined", std::move(user_defined_metadata)}, {"short_description", short_desc}}); if (!filename.empty()) { model_wrapper->save(filename); @@ -1129,11 +1130,10 @@ void object_detector::update_model_metrics(gl_sframe data, std::map metrics; // If called during training, synchronize the model first. const Checkpoint& checkpoint = read_checkpoint(); - + CheckpointMetadata checkpoint_info = checkpoint.GetCheckpointMetadata(); // Compute training metrics. - variant_type training_metrics_raw = - perform_evaluation(data, "all", "dict", checkpoint.GetEvaluateConfidence(), - checkpoint.GetNonMaximumSuppressionThreshold()); + variant_type training_metrics_raw = perform_evaluation( + data, "all", "dict", checkpoint_info.evaluate_confidence, checkpoint_info.nms_threshold); variant_map_type training_metrics = variant_get_value(training_metrics_raw); for (const auto& kv : training_metrics) { @@ -1143,8 +1143,8 @@ void object_detector::update_model_metrics(gl_sframe data, // Compute validation metrics if necessary. 
if (!validation_data.empty()) { variant_type validation_metrics_raw = - perform_evaluation(validation_data, "all", "dict", checkpoint.GetEvaluateConfidence(), - checkpoint.GetNonMaximumSuppressionThreshold()); + perform_evaluation(validation_data, "all", "dict", checkpoint_info.evaluate_confidence, + checkpoint_info.nms_threshold); variant_map_type validation_metrics = variant_get_value(validation_metrics_raw); for (const auto& kv : validation_metrics) { diff --git a/src/toolkits/object_detection/od_darknet_yolo_model_trainer.cpp b/src/toolkits/object_detection/od_darknet_yolo_model_trainer.cpp index ac22763a45..9026ed27d8 100644 --- a/src/toolkits/object_detection/od_darknet_yolo_model_trainer.cpp +++ b/src/toolkits/object_detection/od_darknet_yolo_model_trainer.cpp @@ -407,34 +407,27 @@ std::unique_ptr DarknetYOLOCheckpoint::CreateModelTrainer( return result; } -pipeline_spec DarknetYOLOCheckpoint::ExportToCoreML( - const std::string& input_name, const std::string& coordinates_output_name, - const std::string& confidence_output_name) const { - return export_darknet_yolo(weights_, input_name, coordinates_output_name, - confidence_output_name, GetAnchorBoxes(), - config_.num_classes, config_.output_height, - config_.output_width, SPATIAL_REDUCTION); -} - -size_t DarknetYOLOCheckpoint::GetNumberOfPredictions() const { - return config_.output_width * config_.output_height * GetAnchorBoxes().size(); -} - -std::string DarknetYOLOCheckpoint::GetModelType() const { return "YOLOv2"; } - -float DarknetYOLOCheckpoint::GetEvaluateConfidence() const -{ - return DEFAULT_CONFIDENCE_THRESHOLD_EVALUATE; -} - -float DarknetYOLOCheckpoint::GetPredictConfidence() const +pipeline_spec DarknetYOLOCheckpoint::ExportToCoreML(const std::string& input_name, + const std::string& coordinates_name, + const std::string& confidence_name, + bool use_nms_layer, float iou_threshold, + float confidence_threshold) const { - return DEFAULT_CONFIDENCE_THRESHOLD_PREDICT; + return 
export_darknet_yolo(weights_, input_name, coordinates_name, confidence_name, + GetAnchorBoxes(), config_.num_classes, use_nms_layer, + config_.output_height, config_.output_width, iou_threshold, + confidence_threshold, SPATIAL_REDUCTION); } -float DarknetYOLOCheckpoint::GetNonMaximumSuppressionThreshold() const +CheckpointMetadata DarknetYOLOCheckpoint::GetCheckpointMetadata() const { - return DEFAULT_NON_MAXIMUM_SUPPRESSION_THRESHOLD; + CheckpointMetadata metadata; + metadata.num_predictions = config_.output_width * config_.output_height * GetAnchorBoxes().size(); + metadata.model_type = "YOLOv2"; + metadata.evaluate_confidence = DEFAULT_CONFIDENCE_THRESHOLD_EVALUATE; + metadata.predict_confidence = DEFAULT_CONFIDENCE_THRESHOLD_PREDICT; + metadata.nms_threshold = DEFAULT_NON_MAXIMUM_SUPPRESSION_THRESHOLD; + return metadata; } float_array_map DarknetYOLOCheckpoint::internal_config() const { diff --git a/src/toolkits/object_detection/od_darknet_yolo_model_trainer.hpp b/src/toolkits/object_detection/od_darknet_yolo_model_trainer.hpp index 5268a2cae6..620047cf6f 100644 --- a/src/toolkits/object_detection/od_darknet_yolo_model_trainer.hpp +++ b/src/toolkits/object_detection/od_darknet_yolo_model_trainer.hpp @@ -141,19 +141,13 @@ class DarknetYOLOCheckpoint : public Checkpoint { std::unique_ptr CreateModelTrainer( neural_net::compute_context* context) const override; - neural_net::pipeline_spec ExportToCoreML( - const std::string& input_name, const std::string& coordinates_output_name, - const std::string& confidence_output_name) const override; + neural_net::pipeline_spec ExportToCoreML(const std::string& input_name, + const std::string& coordinates_name, + const std::string& confidence_name, bool use_nms_layer, + float iou_threshold, + float confidence_threshold) const override; - size_t GetNumberOfPredictions() const override; - - std::string GetModelType() const override; - - float GetEvaluateConfidence() const override; - - float GetPredictConfidence() const 
override; - - float GetNonMaximumSuppressionThreshold() const override; + CheckpointMetadata GetCheckpointMetadata() const override; /** Returns the config dictionary used to initialize darknet-yolo backends. */ neural_net::float_array_map internal_config() const; diff --git a/src/toolkits/object_detection/od_model_trainer.hpp b/src/toolkits/object_detection/od_model_trainer.hpp index cb25805f7d..4bc7aec6e2 100644 --- a/src/toolkits/object_detection/od_model_trainer.hpp +++ b/src/toolkits/object_detection/od_model_trainer.hpp @@ -130,6 +130,27 @@ struct Config { int num_classes = -1; }; +/** Stores additional data for specific model backend for a checkpoint. */ +struct CheckpointMetadata { + /** The number of predictions for the loaded model. */ + size_t num_predictions = 0; + + /** The model type name for use in exported models. */ + std::string model_type = ""; + + /** The confidence threshold for evaluation */ + float evaluate_confidence = 0.f; + + /** The confidence threshold for prediction */ + float predict_confidence = 0.f; + + /** The Non Maximal Suppression threshold for evaluation */ + float nms_threshold = 0.f; + + /** When true, use NMS only on the most confident class otherwise across all classes. */ + bool use_most_confident_class = false; +}; + /** * A representation of all the parameters needed to reconstruct a model. * @@ -153,36 +174,14 @@ class Checkpoint { * least two outputs, all with the given names. The outputs must be suitable * for passing directly into a NonMaximumSuppression model. */ - virtual neural_net::pipeline_spec ExportToCoreML( - const std::string& input_name, const std::string& coordinates_output_name, - const std::string& confidence_output_name) const = 0; - - /** - * Returns the number of predictions for the loaded model. - */ - virtual size_t GetNumberOfPredictions() const = 0; - - /** - * Returns the model type name for use in exported models. 
- */ - virtual std::string GetModelType() const = 0; - - /** - * Returns the confidence threshold for evaluation - */ - virtual float GetEvaluateConfidence() const = 0; + virtual neural_net::pipeline_spec ExportToCoreML(const std::string& input_name, + const std::string& coordinates_name, + const std::string& confidence_name, + bool use_nms_layer, float iou_threshold, + float confidence_threshold) const = 0; - /** - * Returns the confidence threshold for prediction - */ - virtual float GetPredictConfidence() const = 0; - - /** - * Returns the Non Maximal Suppression threshold for evaluation - */ - virtual float GetNonMaximumSuppressionThreshold() const = 0; + virtual CheckpointMetadata GetCheckpointMetadata() const = 0; }; - /** * Wrapper adapting object_detection::data_iterator to the Iterator interface. */ diff --git a/src/toolkits/object_detection/od_serialization.cpp b/src/toolkits/object_detection/od_serialization.cpp index fcd4895e1c..6de6c8508b 100644 --- a/src/toolkits/object_detection/od_serialization.cpp +++ b/src/toolkits/object_detection/od_serialization.cpp @@ -14,6 +14,7 @@ #include #include +#include namespace turi { namespace object_detection { @@ -28,6 +29,9 @@ using CoreML::Specification::ModelDescription; using CoreML::Specification::NeuralNetwork; using CoreML::Specification::Pipeline; +using turi::confidence_threshold_description; +using turi::iou_threshold_description; +using turi::set_array_feature; using turi::neural_net::float_array_map; using turi::neural_net::model_spec; using turi::neural_net::pipeline_spec; @@ -35,10 +39,9 @@ using turi::neural_net::zero_weight_initializer; using padding_type = model_spec::padding_type; -constexpr char CONFIDENCE_STR[] = +constexpr char kConfidenceDesc[] = "Boxes × Class confidence (see user-defined metadata \"classes\")"; -constexpr char COORDINATES_STR[] = - "Boxes × [x, y, width, height] (relative to image size)"; +constexpr char kCoordinatesDesc[] = "Boxes × [x, y, width, height] (relative to image 
size)"; } // namespace @@ -146,24 +149,31 @@ void init_darknet_yolo(model_spec& nn_spec, size_t num_classes, nn_spec.add_preprocessing(input_name, 1.0); } -pipeline_spec export_darknet_yolo( - const float_array_map& weights, const std::string& input_name, - const std::string& coordinates_name, const std::string& confidence_name, - const std::vector>& anchor_boxes, - size_t num_classes, size_t output_grid_height, size_t output_grid_width, - size_t spatial_reduction) { +pipeline_spec export_darknet_yolo(const float_array_map& weights, const std::string& input_name, + const std::string& coordinates_name, + const std::string& confidence_name, + const std::vector>& anchor_boxes, + size_t num_classes, bool use_nms_layer, size_t output_grid_height, + size_t output_grid_width, float iou_threshold, + float confidence_threshold, size_t spatial_reduction) +{ // Initialize the result with the learned layers from the model_backend. std::unique_ptr nn_spec(new model_spec); init_darknet_yolo(*nn_spec, num_classes, anchor_boxes.size(), input_name); nn_spec->update_params(weights, /* use_quantization */ true); // Add the layers that convert to intelligible predictions. - add_yolo(nn_spec.get(), coordinates_name, confidence_name, "conv8_fwd", - anchor_boxes, num_classes, output_grid_height, output_grid_width); + add_yolo(nn_spec.get(), coordinates_name, confidence_name, "conv8_fwd", anchor_boxes, num_classes, + use_nms_layer, iou_threshold, confidence_threshold, output_grid_height, + output_grid_width); // Extract the underlying Core ML spec and move it into a new Pipeline. 
std::unique_ptr network = std::move(*nn_spec).move_coreml_spec(); + if (use_nms_layer) { + network->set_arrayinputshapemapping( + CoreML::Specification::NeuralNetworkMultiArrayShapeMapping::EXACT_ARRAY_MAPPING); + } std::unique_ptr pipeline(new Pipeline); Model* model = pipeline->add_models(); model->mutable_neuralnetwork()->Swap(network.get()); @@ -181,33 +191,59 @@ pipeline_spec export_darknet_yolo( image_feature->set_height(output_grid_height * spatial_reduction); image_feature->set_colorspace(ImageFeatureType::RGB); + if (use_nms_layer) { + // Set CoreML spec version. + FeatureDescription* iou_threshold_desc = model_desc->add_input(); + set_array_feature(iou_threshold_desc, "iouThreshold", iou_threshold_description(iou_threshold), + {1}); + iou_threshold_desc->mutable_type()->set_isoptional(true); + + FeatureDescription* confidence_threshold_desc = model_desc->add_input(); + set_array_feature(confidence_threshold_desc, "confidenceThreshold", + confidence_threshold_description(confidence_threshold), {1}); + confidence_threshold_desc->mutable_type()->set_isoptional(true); + + model->set_specificationversion(CoreML::MLMODEL_SPECIFICATION_VERSION); + + } else { + // Set CoreML spec version. + model->set_specificationversion(1); + } + // Create a helper function for writing the shapes of the confidence and // coordinates outputs. 
size_t num_predictions = output_grid_width * output_grid_height * anchor_boxes.size(); - auto set_shape = [num_predictions](FeatureDescription* feature_desc, - size_t num_features_per_prediction) { + auto set_shape = [num_predictions, use_nms_layer](FeatureDescription* feature_desc, + size_t num_features_per_prediction) { ArrayFeatureType* array_feature = feature_desc->mutable_type()->mutable_multiarraytype(); - array_feature->set_datatype(ArrayFeatureType::DOUBLE); - array_feature->add_shape(num_predictions); - array_feature->add_shape(num_features_per_prediction); + if (use_nms_layer) { + array_feature->set_datatype(ArrayFeatureType::FLOAT32); + auto* shape1 = array_feature->mutable_shaperange()->add_sizeranges(); + shape1->set_upperbound(-1); + auto* shape2 = array_feature->mutable_shaperange()->add_sizeranges(); + shape2->set_lowerbound(num_features_per_prediction); + shape2->set_upperbound(num_features_per_prediction); + } else { + array_feature->set_datatype(ArrayFeatureType::DOUBLE); + array_feature->add_shape(num_predictions); + array_feature->add_shape(num_features_per_prediction); + } }; // Write FeatureDescription for the confidence output. FeatureDescription* confidence_desc = model_desc->add_output(); confidence_desc->set_name(confidence_name); - confidence_desc->set_shortdescription(CONFIDENCE_STR); + confidence_desc->set_shortdescription(kConfidenceDesc); set_shape(confidence_desc, num_classes); // Write FeatureDescription for the coordinates output. FeatureDescription* coordinates_desc = model_desc->add_output(); coordinates_desc->set_name(coordinates_name); - coordinates_desc->set_shortdescription(COORDINATES_STR); + coordinates_desc->set_shortdescription(kCoordinatesDesc); set_shape(coordinates_desc, 4); - // Set CoreML spec version. 
- model->set_specificationversion(1); return pipeline_spec(std::move(pipeline)); } diff --git a/src/toolkits/object_detection/od_serialization.hpp b/src/toolkits/object_detection/od_serialization.hpp index 62ef2f2723..8b2586f2fd 100644 --- a/src/toolkits/object_detection/od_serialization.hpp +++ b/src/toolkits/object_detection/od_serialization.hpp @@ -11,10 +11,15 @@ #include #include #include +#include namespace turi { namespace object_detection { +void set_array_feature(CoreML::Specification::FeatureDescription* feature_desc, std::string name, + std::string short_description, const std::vector& shape, + float value); + void _save_impl(oarchive& oarc, const std::map& state, const neural_net::float_array_map& weights); @@ -30,9 +35,9 @@ void init_darknet_yolo(neural_net::model_spec& nn_spec, neural_net::pipeline_spec export_darknet_yolo( const neural_net::float_array_map& weights, const std::string& input_name, const std::string& coordinates_name, const std::string& confidence_name, - const std::vector>& anchor_boxes, - size_t num_classes, size_t output_grid_height, size_t output_grid_width, - size_t spatial_reduction); + const std::vector>& anchor_boxes, size_t num_classes, + bool use_nms_layer, size_t output_grid_height, size_t output_grid_width, float iou_threshold, + float confidence_threshold, size_t spatial_reduction); } // namespace object_detection } // namespace turi diff --git a/src/toolkits/object_detection/od_yolo.cpp b/src/toolkits/object_detection/od_yolo.cpp index 036dfc8f98..e9af6be278 100644 --- a/src/toolkits/object_detection/od_yolo.cpp +++ b/src/toolkits/object_detection/od_yolo.cpp @@ -18,9 +18,12 @@ using neural_net::float_array; using neural_net::image_annotation; using neural_net::image_box; using neural_net::model_spec; +using neural_net::scalar_weight_initializer; namespace { +constexpr size_t kMaxBoxesAfterNMS = 64; + float sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); } @@ -193,19 +196,70 @@ std::vector 
convert_yolo_to_annotations( return result; } +void apply_nms_layer(model_spec* nn_spec, const std::string& coordinates_name, + const std::string& confidence_name, const std::string& prefix, + size_t num_bounding_boxes, size_t num_classes, float confidence_threshold, + float iou_threshold, size_t nms_boxes, bool use_most_confident_class) +{ + nn_spec->add_nms_layer( + "non_maximum_supression", + {"raw_coordinates", "raw_confidence", "iouThreshold", "confidenceThreshold"}, + {"nms_coordinates", "nms_confidence", "indices", "num_boxes"}, iou_threshold, + confidence_threshold, nms_boxes, use_most_confident_class); + + nn_spec->add_squeeze("nms_coord_squeezed", "nms_coordinates", {0}, {1, num_bounding_boxes, 4}, + {num_bounding_boxes, 4}); + + nn_spec->add_squeeze("nms_class_squeezed", "nms_confidence", {0}, + {1, num_bounding_boxes, num_classes}, {num_bounding_boxes, num_classes}); + + nn_spec->add_constant_nd( + /* name */ "index", + /* shape */ {1}, + /* data */ scalar_weight_initializer(0)); + nn_spec->add_gather( + /* name */ "num_of_boxes", + /* inputs */ {"num_boxes", "index"}); + nn_spec->add_constant_nd( + /* name */ "postfix1", + /* shape */ {1}, + /* data */ scalar_weight_initializer(num_classes)); + + nn_spec->add_constant_nd( + /* name */ "postfix2", + /* shape */ {1}, + /* data */ scalar_weight_initializer(4)); + nn_spec->add_constant_nd( + /* name */ "beginId", + /* shape */ {2}, + /* data */ scalar_weight_initializer(0)); + nn_spec->add_concat_nd( + /* name */ "endIdConfidence", + /* inputs */ {"num_of_boxes", "postfix1"}, + /* axis */ 0); + nn_spec->add_concat_nd( + /* name */ "endIdCoordinates", + /* inputs */ {"num_of_boxes", "postfix2"}, + /* axis */ 0); + + nn_spec->add_slice_dynamic(coordinates_name, + {"nms_coord_squeezed", "beginId", "endIdCoordinates"}); + nn_spec->add_slice_dynamic(confidence_name, {"nms_class_squeezed", "beginId", "endIdConfidence"}); +} + void add_yolo(model_spec* nn_spec, const std::string& coordinates_name, const 
std::string& confidence_name, const std::string& input, - const std::vector>& anchor_boxes, - size_t num_classes, size_t output_grid_height, - size_t output_grid_width, std::string prefix) { - + const std::vector>& anchor_boxes, size_t num_classes, + bool use_nms_layer, float iou_threshold, float confidence_threshold, + size_t output_grid_height, size_t output_grid_width) +{ // For darknet-yolo, input should be the (B*(5+C), H, W) conv8_fwd output, // where B is the number of anchor boxes, C is the number of classes, and H // is the output grid height, and W is the output grid width. // Note that the shapes below conform to the CoreML layout // (Seq_length, C, H, W), although sequence length is always 1 here. - + const std::string prefix = "__tc_internal__"; const size_t num_spatial = output_grid_height * output_grid_width; const size_t num_bounding_boxes = num_spatial * anchor_boxes.size(); @@ -335,8 +389,6 @@ void add_yolo(model_spec* nn_spec, const std::string& coordinates_name, } ASSERT_EQ(out, last); }; - nn_spec->add_scale(coordinates_name, prefix + "boxes_out", - {{ num_bounding_boxes, 4, 1 }}, boxes_out_init); // CLASS PROBABILITIES AND OBJECT CONFIDENCE @@ -378,9 +430,29 @@ void add_yolo(model_spec* nn_spec, const std::string& coordinates_name, nn_spec->add_reshape(prefix + "confprobs_transposed", prefix + "confprobs_sp", {{ 1, num_classes, num_bounding_boxes, 1 }}); - // (1, B*H*W, C, 1) - nn_spec->add_permute(confidence_name, prefix + "confprobs_transposed", - {{ 0, 2, 1, 3 }}); + if (use_nms_layer) { + nn_spec->add_scale(prefix + "scaled_boxes_out", prefix + "boxes_out", + {{num_bounding_boxes, 4, 1}}, boxes_out_init); + // (1, B*H*W, C, 1) + nn_spec->add_permute(prefix + "confprobs", prefix + "confprobs_transposed", {{0, 2, 1, 3}}); + + nn_spec->add_squeeze("raw_" + confidence_name, prefix + "confprobs", {3}, + {1, num_bounding_boxes, num_classes, 1}, + {1, num_bounding_boxes, num_classes}); + + nn_spec->add_squeeze("raw_" + coordinates_name, prefix + 
"scaled_boxes_out", {3}, + {1, num_bounding_boxes, 4, 1}, {1, num_bounding_boxes, 4}); + + apply_nms_layer(std::move(nn_spec), coordinates_name, confidence_name, prefix, + num_bounding_boxes, num_classes, confidence_threshold, iou_threshold, + kMaxBoxesAfterNMS, false); + + } else { + nn_spec->add_scale(coordinates_name, prefix + "boxes_out", {{num_bounding_boxes, 4, 1}}, + boxes_out_init); + // (1, B*H*W, C, 1) + nn_spec->add_permute(confidence_name, prefix + "confprobs_transposed", {{0, 2, 1, 3}}); + } } } // object_detection diff --git a/src/toolkits/object_detection/od_yolo.hpp b/src/toolkits/object_detection/od_yolo.hpp index 74b1eb83ec..3952cacba7 100644 --- a/src/toolkits/object_detection/od_yolo.hpp +++ b/src/toolkits/object_detection/od_yolo.hpp @@ -76,16 +76,38 @@ std::vector convert_yolo_to_annotations( * YOLO model. * \param output_grid_width The width W of the output grid used to train the * YOLO model. - * \param prefix The prefix to apply to intermediate layers added in service of - * output layers named by `coordinates_name` and `confidence_name`. 
*/ -void add_yolo(neural_net::model_spec* nn_spec, - const std::string& coordinates_name, +void add_yolo(neural_net::model_spec* nn_spec, const std::string& coordinates_name, const std::string& confidence_name, const std::string& input, - const std::vector>& anchor_boxes, - size_t num_classes, size_t output_grid_height, - size_t output_grid_width, - std::string prefix = "__tc__internal__"); + const std::vector>& anchor_boxes, size_t num_classes, + bool use_nms_layer, float iou_threshold, float confidence_threshold, + size_t output_grid_height, size_t output_grid_width); + +/** + * Appends a non maximum suppression layer, then squeezes and slices its outputs using the box count that layer emits. + * + * \param nn_spec Model spec for the trained model + * \param coordinates_name The name to give to the CoreML layer which will + * output the predicted bounding boxes (B*H*W, 4, 1) for each of the + * B anchor boxes and each of the H*W output grid cells, in + * (x,y,width,height) order, normalized to the interval [0,1]. + * \param confidence_name The name to give to the CoreML layer which will output + * the predicted class label confidences (B*H*W, C, 1) for each of + * the B anchor boxes, each of the H*W output grid cells, and each of + * the C class labels. + * \param num_bounding_boxes The total number of predicted boxes, B*H*W (anchor boxes times output grid cells). + * \param num_classes The number of class labels C used to train the YOLO model. + * \param confidence_threshold The confidence threshold to be applied in the NMS layer. + * \param iou_threshold The IoU threshold to be applied in the NMS layer. + * \param prefix Prefix string attached to layer names. + * \param nms_boxes The maximum number of boxes we want after NMS. + * \param use_most_confident_class Suppression can be done only across the most + * confident class.
+ */ +void apply_nms_layer(neural_net::model_spec* nn_spec, const std::string& coordinates_name, + const std::string& confidence_name, const std::string& prefix, + size_t num_bounding_boxes, size_t num_classes, float confidence_threshold, + float iou_threshold, size_t nms_boxes, bool use_most_confident_class); } // object_detection } // turi From 93923bd541ccf2a703797846d607259f5757b9fe Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Mon, 27 Jul 2020 00:27:03 -0700 Subject: [PATCH 2/6] add the change --- src/ml/neural_net/model_spec.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ml/neural_net/model_spec.cpp b/src/ml/neural_net/model_spec.cpp index 96512dbbbd..979681a197 100644 --- a/src/ml/neural_net/model_spec.cpp +++ b/src/ml/neural_net/model_spec.cpp @@ -1115,13 +1115,13 @@ void model_spec::add_expand_dims(const std::string& name, layer->set_name(name); layer->add_input(input); auto* inputTensor = layer->add_inputtensor(); - inputTensor->set_rank(inputVector.size()); + inputTensor->set_rank(static_cast(inputVector.size())); for (size_t i = 0; i < inputVector.size(); ++i) { inputTensor->add_dimvalue(inputVector[i]); } layer->add_output(name); auto* outputTensor = layer->add_outputtensor(); - outputTensor->set_rank(outputVector.size()); + outputTensor->set_rank(static_cast(outputVector.size())); for (size_t i = 0; i < outputVector.size(); ++i) { outputTensor->add_dimvalue(outputVector[i]); } @@ -1139,13 +1139,13 @@ void model_spec::add_squeeze(const std::string& name, const std::string& input, layer->set_name(name); layer->add_input(input); auto* inputTensor = layer->add_inputtensor(); - inputTensor->set_rank(inputVector.size()); + inputTensor->set_rank(static_cast(inputVector.size())); for (size_t i = 0; i < inputVector.size(); ++i) { inputTensor->add_dimvalue(inputVector[i]); } layer->add_output(name); auto* outputTensor = layer->add_outputtensor(); - outputTensor->set_rank(outputVector.size()); + 
outputTensor->set_rank(static_cast(outputVector.size())); for (size_t i = 0; i < outputVector.size(); ++i) { outputTensor->add_dimvalue(outputVector[i]); } From 769b392f9d555079488dafd94c7612e67cb85612 Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Mon, 27 Jul 2020 13:19:51 -0700 Subject: [PATCH 3/6] add test fixes --- .../test_neural_nets_model_exporter.cxx | 13 ++-- .../object_detection/test_od_yolo.cxx | 78 ++++++++++--------- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/test/unity/toolkits/coreml_export/test_neural_nets_model_exporter.cxx b/test/unity/toolkits/coreml_export/test_neural_nets_model_exporter.cxx index 01a4b70a15..19da6f78fa 100644 --- a/test/unity/toolkits/coreml_export/test_neural_nets_model_exporter.cxx +++ b/test/unity/toolkits/coreml_export/test_neural_nets_model_exporter.cxx @@ -52,13 +52,12 @@ BOOST_AUTO_TEST_CASE(test_object_detector_export_coreml_with_nms) { flex_list t_class_labels = flex_list(test_class_labels.begin(), test_class_labels.end()); - std::shared_ptr model_wrapper = - export_object_detector_model( - neural_net::pipeline_spec(std::move(model_to_export)), - test_class_labels.size(), 13 * 13 * 15, std::move(t_class_labels), - test_confidence_threshold, test_iou_threshold, - /* include_non_maximum_suppression */ true, - /* use_nms_layer */ false); + std::shared_ptr model_wrapper = export_object_detector_model( + neural_net::pipeline_spec(std::move(model_to_export)), test_class_labels.size(), + 13 * 13 * 15, std::move(t_class_labels), test_confidence_threshold, test_iou_threshold, + /* include_non_maximum_suppression */ true, + /* use_nms_layer */ false, + /* use_most_confident_class*/ false); std::shared_ptr c_model = model_wrapper->coreml_model(); auto p_model = c_model->getProto(); diff --git a/test/unity/toolkits/object_detection/test_od_yolo.cxx b/test/unity/toolkits/object_detection/test_od_yolo.cxx index 4e8d5e4497..d3a433e603 100644 --- a/test/unity/toolkits/object_detection/test_od_yolo.cxx +++ 
b/test/unity/toolkits/object_detection/test_od_yolo.cxx @@ -138,7 +138,6 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { static constexpr size_t NUM_CLASSES = 6; static constexpr size_t NUM_PREDS = NUM_CLASSES + 5; // 4 for bbox, 1 conf - const std::string prefix = "__test__"; const std::vector> anchor_boxes = { {1.f, 2.f}, {1.f, 1.f}, {2.f, 1.f}, {2.f, 4.f}, {2.f, 2.f}, {4.f, 2.f}, @@ -148,14 +147,19 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { }; model_spec nn_spec; - add_yolo(&nn_spec, COORDINATES_NAME, CONFIDENCE_NAME, INPUT_NAME, - anchor_boxes, NUM_CLASSES, OUTPUT_GRID_SIZE, OUTPUT_GRID_SIZE, - prefix); + + bool use_nms_layer = false; + float iou_threshold = 0.33; + float confidence_threshold = 0.001; + + add_yolo(&nn_spec, COORDINATES_NAME, CONFIDENCE_NAME, INPUT_NAME, anchor_boxes, NUM_CLASSES, + use_nms_layer, iou_threshold, confidence_threshold, OUTPUT_GRID_SIZE, OUTPUT_GRID_SIZE); // The add_yolo function simply appends a mostly fixed sequence of 24 layers // to an existing model_spec. Assert that the resulting proto is what we want. // In theory, some of the layers could be reordered or have different names, // but it's much easier to test for exact equality. 
+ const std::string prefix = "__tc_internal__"; const CoreML::Specification::NeuralNetwork& nn = nn_spec.get_coreml_spec(); TS_ASSERT_EQUALS(nn.layers_size(), 24); @@ -336,24 +340,7 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { TS_ASSERT_EQUALS(boxes_out.permute().axis(2), 1); TS_ASSERT_EQUALS(boxes_out.permute().axis(3), 3); - const auto& coordinates = nn.layers(15); - TS_ASSERT_EQUALS(coordinates.input_size(), 1); - TS_ASSERT_EQUALS(coordinates.input(0), prefix + "boxes_out"); - TS_ASSERT_EQUALS(coordinates.output_size(), 1); - TS_ASSERT_EQUALS(coordinates.output(0), COORDINATES_NAME); - TS_ASSERT_EQUALS(coordinates.scale().shapescale_size(), 3); - TS_ASSERT_EQUALS(coordinates.scale().shapescale(0), - OUTPUT_GRID_AREA * anchor_boxes.size()); - TS_ASSERT_EQUALS(coordinates.scale().shapescale(1), 4); - TS_ASSERT_EQUALS(coordinates.scale().shapescale(2), 1); - TS_ASSERT_EQUALS(coordinates.scale().scale().floatvalue_size(), - OUTPUT_GRID_AREA * anchor_boxes.size() * 4 * 1); - for (int i = 0; i < coordinates.scale().scale().floatvalue_size(); ++i) { - TS_ASSERT_EQUALS(coordinates.scale().scale().floatvalue(i), - 1.f / OUTPUT_GRID_SIZE); - } - - const auto& scores_sp = nn.layers(16); + const auto& scores_sp = nn.layers(15); TS_ASSERT_EQUALS(scores_sp.input_size(), 1); TS_ASSERT_EQUALS(scores_sp.input(0), prefix + "ymap_sp"); TS_ASSERT_EQUALS(scores_sp.output_size(), 1); @@ -364,14 +351,14 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { TS_ASSERT_EQUALS(scores_sp.slice().axis(), CoreML::Specification::SliceLayerParams::CHANNEL_AXIS); - const auto& probs_sp = nn.layers(17); + const auto& probs_sp = nn.layers(16); TS_ASSERT_EQUALS(probs_sp.input_size(), 1); TS_ASSERT_EQUALS(probs_sp.input(0), prefix + "scores_sp"); TS_ASSERT_EQUALS(probs_sp.output_size(), 1); TS_ASSERT_EQUALS(probs_sp.output(0), prefix + "probs_sp"); TS_ASSERT(probs_sp.has_softmax()); - const auto& logit_conf_sp = nn.layers(18); + const auto& logit_conf_sp = nn.layers(17); 
TS_ASSERT_EQUALS(logit_conf_sp.input_size(), 1); TS_ASSERT_EQUALS(logit_conf_sp.input(0), prefix + "ymap_sp"); TS_ASSERT_EQUALS(logit_conf_sp.output_size(), 1); @@ -382,14 +369,14 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { TS_ASSERT_EQUALS(logit_conf_sp.slice().axis(), CoreML::Specification::SliceLayerParams::CHANNEL_AXIS); - const auto& conf_sp = nn.layers(19); + const auto& conf_sp = nn.layers(18); TS_ASSERT_EQUALS(conf_sp.input_size(), 1); TS_ASSERT_EQUALS(conf_sp.input(0), prefix + "logit_conf_sp"); TS_ASSERT_EQUALS(conf_sp.output_size(), 1); TS_ASSERT_EQUALS(conf_sp.output(0), prefix + "conf_sp"); TS_ASSERT(conf_sp.activation().has_sigmoid()); - const auto& conf_tiled_sp = nn.layers(20); + const auto& conf_tiled_sp = nn.layers(19); TS_ASSERT_EQUALS(conf_tiled_sp.input_size(), NUM_CLASSES); for (int i = 0; i < static_cast(NUM_CLASSES); ++i) { TS_ASSERT_EQUALS(conf_tiled_sp.input(i), prefix + "conf_sp"); @@ -399,7 +386,7 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { TS_ASSERT(conf_tiled_sp.has_concat()); TS_ASSERT(!conf_tiled_sp.concat().sequenceconcat()); - const auto& confprobs_sp = nn.layers(21); + const auto& confprobs_sp = nn.layers(20); TS_ASSERT_EQUALS(confprobs_sp.input_size(), 2); TS_ASSERT_EQUALS(confprobs_sp.input(0), prefix + "conf_tiled_sp"); TS_ASSERT_EQUALS(confprobs_sp.input(1), prefix + "probs_sp"); @@ -407,7 +394,7 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { TS_ASSERT_EQUALS(confprobs_sp.output(0), prefix + "confprobs_sp"); TS_ASSERT(confprobs_sp.has_multiply()); - const auto& confprobs_transposed = nn.layers(22); + const auto& confprobs_transposed = nn.layers(21); TS_ASSERT_EQUALS(confprobs_transposed.input_size(), 1); TS_ASSERT_EQUALS(confprobs_transposed.input(0), prefix + "confprobs_sp"); TS_ASSERT_EQUALS(confprobs_transposed.output_size(), 1); @@ -420,16 +407,31 @@ BOOST_AUTO_TEST_CASE(test_add_tiny_darknet_yolo) { OUTPUT_GRID_AREA * anchor_boxes.size()); 
TS_ASSERT_EQUALS(confprobs_transposed.reshape().targetshape(3), 1); - const auto& confidence = nn.layers(23); - TS_ASSERT_EQUALS(confidence.input_size(), 1); - TS_ASSERT_EQUALS(confidence.input(0), prefix + "confprobs_transposed"); - TS_ASSERT_EQUALS(confidence.output_size(), 1); - TS_ASSERT_EQUALS(confidence.output(0), CONFIDENCE_NAME); - TS_ASSERT_EQUALS(confidence.permute().axis_size(), 4); - TS_ASSERT_EQUALS(confidence.permute().axis(0), 0); - TS_ASSERT_EQUALS(confidence.permute().axis(1), 2); - TS_ASSERT_EQUALS(confidence.permute().axis(2), 1); - TS_ASSERT_EQUALS(confidence.permute().axis(3), 3); + const auto& coordinates = nn.layers(22); + TS_ASSERT_EQUALS(coordinates.input_size(), 1); + TS_ASSERT_EQUALS(coordinates.input(0), prefix + "boxes_out"); + TS_ASSERT_EQUALS(coordinates.output_size(), 1); + TS_ASSERT_EQUALS(coordinates.output(0), COORDINATES_NAME); + TS_ASSERT_EQUALS(coordinates.scale().shapescale_size(), 3); + TS_ASSERT_EQUALS(coordinates.scale().shapescale(0), OUTPUT_GRID_AREA * anchor_boxes.size()); + TS_ASSERT_EQUALS(coordinates.scale().shapescale(1), 4); + TS_ASSERT_EQUALS(coordinates.scale().shapescale(2), 1); + TS_ASSERT_EQUALS(coordinates.scale().scale().floatvalue_size(), + OUTPUT_GRID_AREA * anchor_boxes.size() * 4 * 1); + for (int i = 0; i < coordinates.scale().scale().floatvalue_size(); ++i) { + TS_ASSERT_EQUALS(coordinates.scale().scale().floatvalue(i), 1.f / OUTPUT_GRID_SIZE); + + const auto& confidence = nn.layers(23); + TS_ASSERT_EQUALS(confidence.input_size(), 1); + TS_ASSERT_EQUALS(confidence.input(0), prefix + "confprobs_transposed"); + TS_ASSERT_EQUALS(confidence.output_size(), 1); + TS_ASSERT_EQUALS(confidence.output(0), CONFIDENCE_NAME); + TS_ASSERT_EQUALS(confidence.permute().axis_size(), 4); + TS_ASSERT_EQUALS(confidence.permute().axis(0), 0); + TS_ASSERT_EQUALS(confidence.permute().axis(1), 2); + TS_ASSERT_EQUALS(confidence.permute().axis(2), 1); + TS_ASSERT_EQUALS(confidence.permute().axis(3), 3); + } } } // namespace From 
fd5a99dd8694c6e3e8a15c4c373ec561b0d3feb3 Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Mon, 27 Jul 2020 13:34:48 -0700 Subject: [PATCH 4/6] add preprocessor def --- cmake/SetupCompiler.cmake | 4 ++++ src/toolkits/coreml_export/neural_net_models_exporter.cpp | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cmake/SetupCompiler.cmake b/cmake/SetupCompiler.cmake index 32b7a6d085..e06553317f 100644 --- a/cmake/SetupCompiler.cmake +++ b/cmake/SetupCompiler.cmake @@ -128,6 +128,10 @@ if(APPLE) if(NOT TC_BASE_SDK_VERSION VERSION_LESS 10.15) add_definitions(-DHAS_MACOS_10_15) endif() + + if(NOT TC_BASE_SDK_VERSION VERSION_LESS 10.16) + add_definitions(-DHAS_MACOS_10_16) + endif() endif() endmacro() diff --git a/src/toolkits/coreml_export/neural_net_models_exporter.cpp b/src/toolkits/coreml_export/neural_net_models_exporter.cpp index 90631e565c..6070e4fa41 100644 --- a/src/toolkits/coreml_export/neural_net_models_exporter.cpp +++ b/src/toolkits/coreml_export/neural_net_models_exporter.cpp @@ -272,7 +272,10 @@ void set_threshold_array_feature(FeatureDescription* feature_desc, std::string n for (size_t s : shape) { array->add_shape(s); } - // array->set_doubledefaultvalue(value); + +#ifdef HAS_MACOS_10_16 + array->set_doubledefaultvalue(value); +#endif } void set_array_feature(FeatureDescription* feature_desc, std::string name, From 5c88a985a98c0cad702c3f4bfc799a65e57e1fc2 Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Mon, 27 Jul 2020 14:27:19 -0700 Subject: [PATCH 5/6] remove redunant line --- src/ml/neural_net/model_spec.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ml/neural_net/model_spec.cpp b/src/ml/neural_net/model_spec.cpp index 979681a197..b7f99b13bf 100644 --- a/src/ml/neural_net/model_spec.cpp +++ b/src/ml/neural_net/model_spec.cpp @@ -107,7 +107,7 @@ void update_weight_params(const std::string& name, const float_array& value, Wei } Span out(value.data(), value.size()); - weights->mutable_floatvalue()->begin(); 
+ #ifdef TURI_USE_FLOAT16 if (use_quantization && is_convertible_to_fp16(out)) { From 5e727eb56e24c030edbe1a51d5fb9dfbe8c1ad10 Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Mon, 27 Jul 2020 23:46:06 -0700 Subject: [PATCH 6/6] add changes --- cmake/SetupCompiler.cmake | 4 ---- src/ml/neural_net/model_spec.cpp | 1 - src/toolkits/coreml_export/neural_net_models_exporter.cpp | 2 -- 3 files changed, 7 deletions(-) diff --git a/cmake/SetupCompiler.cmake b/cmake/SetupCompiler.cmake index e06553317f..32b7a6d085 100644 --- a/cmake/SetupCompiler.cmake +++ b/cmake/SetupCompiler.cmake @@ -128,10 +128,6 @@ if(APPLE) if(NOT TC_BASE_SDK_VERSION VERSION_LESS 10.15) add_definitions(-DHAS_MACOS_10_15) endif() - - if(NOT TC_BASE_SDK_VERSION VERSION_LESS 10.16) - add_definitions(-DHAS_MACOS_10_16) - endif() endif() endmacro() diff --git a/src/ml/neural_net/model_spec.cpp b/src/ml/neural_net/model_spec.cpp index b7f99b13bf..89e8d1db3d 100644 --- a/src/ml/neural_net/model_spec.cpp +++ b/src/ml/neural_net/model_spec.cpp @@ -107,7 +107,6 @@ void update_weight_params(const std::string& name, const float_array& value, Wei } Span out(value.data(), value.size()); - #ifdef TURI_USE_FLOAT16 if (use_quantization && is_convertible_to_fp16(out)) { diff --git a/src/toolkits/coreml_export/neural_net_models_exporter.cpp b/src/toolkits/coreml_export/neural_net_models_exporter.cpp index 6070e4fa41..baee040c17 100644 --- a/src/toolkits/coreml_export/neural_net_models_exporter.cpp +++ b/src/toolkits/coreml_export/neural_net_models_exporter.cpp @@ -273,9 +273,7 @@ void set_threshold_array_feature(FeatureDescription* feature_desc, std::string n array->add_shape(s); } -#ifdef HAS_MACOS_10_16 array->set_doubledefaultvalue(value); -#endif } void set_array_feature(FeatureDescription* feature_desc, std::string name,