[Fix] in Xnnpack EP, the conversion for fused activation param isn't correct #23115

Open · wants to merge 17 commits into main (showing changes from 16 commits)
3 changes: 3 additions & 0 deletions onnxruntime/core/providers/cpu/fp16/fp16_activations.h
@@ -75,6 +75,9 @@ struct LeakyRelu<MLFloat16> : public ElementWiseRangedTransform<MLFloat16> {
 // MlasTanhActivation,
 // MlasLogisticActivation,
 // MlasClipActivation,
+// Once it's added, please update TestNhwcConvReluClipFusion_FP16
+// in xnnpack_basic_test.cc
+// to enable output verification for the Clip activation.
 // MlasHardSigmoidActivation,

 } // namespace functors
17 changes: 13 additions & 4 deletions onnxruntime/core/providers/xnnpack/detail/utils.cc
@@ -9,6 +9,7 @@

#include "core/common/common.h"
#include "core/common/safeint.h"
#include "core/framework/float16.h"
#include "core/framework/node_unit.h"
#include "core/framework/tensorprotoutils.h"
#include "core/graph/graph.h"
@@ -245,7 +246,7 @@ std::unique_ptr<IndexedSubGraph::MetaDef> FuseActivation(const NodeUnit& node_un

   const auto& activation_type = activation.OpType();
   if (activation_type == "Clip") {
-    min = std::numeric_limits<float>::min();
+    min = std::numeric_limits<float>::lowest();
     max = std::numeric_limits<float>::max();
     bool min_max_are_attributes = activation.SinceVersion() == 1 || activation.SinceVersion() == 6;
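The one-line change above is the core of the fix: for floating-point types, std::numeric_limits<float>::min() is the smallest positive normal value, not the most negative one, so using it as the default Clip lower bound silently clamps every negative input. A minimal standalone sketch (not ORT code) of the difference:

```cpp
#include <algorithm>
#include <iostream>
#include <limits>

int main() {
  // For float, min() is the smallest positive normal value (~1.18e-38),
  // while lowest() is the most negative finite value (~-3.40e+38).
  const float bad_lower_bound = std::numeric_limits<float>::min();
  const float good_lower_bound = std::numeric_limits<float>::lowest();

  const float x = -2.0f;
  // With min() as the "no-op" default bound, negative values are destroyed:
  std::cout << std::max(x, bad_lower_bound) << "\n";   // prints 1.17549e-38
  // With lowest(), the default bound really is a no-op:
  std::cout << std::max(x, good_lower_bound) << "\n";  // prints -2
  return 0;
}
```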

@@ -267,9 +268,17 @@ std::unique_ptr<IndexedSubGraph::MetaDef> FuseActivation(const NodeUnit& node_un
           ORT_ENFORCE(utils::HasExternalData(value) == false,
                       "External data is not supported for the scalar min/max Clip values");

-          value_to_set = utils::HasRawData(value)
-                             ? *reinterpret_cast<const float*>(value.raw_data().data())
-                             : value.float_data()[0];
+          int32_t arg_type;
+          if (GetType(arg, arg_type) && arg_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
Review thread on the new type check (resolved):

Contributor: What if GetType(arg, arg_type) failed here?

Contributor: Generally type info is always available, so I think this is ok. Shape info may be missing depending on the model.

The Conv op looks to be set up to allow fp32, u8, s8 and optionally fp16. Should this also handle u8 and s8, or should ClipReluChecker limit fusion to fp32 and fp16?

Author (@mszhanyi), Dec 23, 2024: So far, the core runtime Clip fusion only supports float too:

    if (initializer) {
      Initializer i(*initializer, graph.ModelPath());
      switch (initializer->data_type()) {
        case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
          value = *i.data<float>();
          break;
        // double isn't currently supported
        // case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:
        //   value = static_cast<float>(*i.data<double>());
        //   break;
        case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
          value = math::halfToFloat(i.data<MLFloat16>()->val);
          break;
        default:
          ORT_THROW("Unexpected data type for Clip input of ", initializer->data_type());
      }
    }
Shall we update them together?

Author (@mszhanyi): cc @snnn

Contributor: I'd leave the core Clip fusion as-is for now. It can be a separate PR if we think there's a use-case that would benefit.

Are you planning on updating ClipReluChecker to limit the types?

Author (@mszhanyi): I may need more time to understand ClipQuantFusion
(https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/optimizer/qdq_transformer/clip_quantizelinear.cc),
so I don't have a concrete plan for the next step yet.

Contributor: I think ClipQuantFusion is a separate topic, as that's about ignoring a Clip or Relu when the Q zp and scale make it redundant.

I was asking if the XNNPACK EP ClipReluChecker needs to be updated to either limit the types it allows, or whether FuseActivation needs to handle u8 or s8 input for the Clip min/max.

This has no checks on types:

const NodeUnit* ClipReluChecker(const NodeUnit& node_unit,
                                const GraphViewer& graph,
                                const std::unordered_map<const Node*, const NodeUnit*>& supported_node_unit_map) {

But FuseActivation always uses a float in the activation params, and with this PR it explicitly checks only for fp32 and fp16.

e.g. if there's a Conv node with u8 or s8 input, it looks like ClipReluChecker will allow the activation, but FuseActivation won't do the right thing, as the Clip min/max would be u8 or s8.

Author (@mszhanyi): I checked https://onnx.ai/onnx/operators/onnx__Conv.html#type-constraints; an ONNX Conv node shouldn't have u8 or s8 inputs.
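For illustration only, a minimal sketch of the element-type guard being discussed; the helper name is hypothetical (this is not the actual ClipReluChecker code), but the integer codes match the ONNX TensorProto_DataType values:

```cpp
#include <cstdint>

// ONNX TensorProto_DataType codes for the types in question:
// FLOAT = 1, UINT8 = 2, INT8 = 3, FLOAT16 = 10.
constexpr int32_t kFloat = 1;
constexpr int32_t kUInt8 = 2;
constexpr int32_t kInt8 = 3;
constexpr int32_t kFloat16 = 10;

// Hypothetical guard: only allow Clip/Relu fusion when the Clip min/max can
// be converted to the float activation params that FuseActivation produces.
// u8/s8 would need separate handling, so a checker could reject them up front.
bool IsClipFusionSupportedType(int32_t elem_type) {
  return elem_type == kFloat || elem_type == kFloat16;
}
```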

+            // arg is of type FP16
+            value_to_set = utils::HasRawData(value)
+                               ? (*reinterpret_cast<const MLFloat16*>(value.raw_data().data())).ToFloat()
+                               : value.float_data()[0];
+          } else if (GetType(arg, arg_type) && arg_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+            value_to_set = utils::HasRawData(value)
+                               ? *reinterpret_cast<const float*>(value.raw_data().data())
+                               : value.float_data()[0];
+          }
         }
       }
     };
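A note on why the new FP16 branch above must reinterpret the raw bytes as MLFloat16 rather than float: for an FP16 initializer, raw_data() stores two bytes per element, so casting the buffer to const float* reads four bytes of the wrong encoding. A self-contained sketch of the decode, using a hand-rolled half-to-float helper in place of ORT's MLFloat16::ToFloat():

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>

// Hand-rolled stand-in for MLFloat16::ToFloat(): decode IEEE-754 binary16
// bits into a float.
float HalfBitsToFloat(uint16_t h) {
  const uint32_t sign = (h >> 15) & 0x1;
  const uint32_t exponent = (h >> 10) & 0x1F;
  const uint32_t mantissa = h & 0x3FF;
  float value;
  if (exponent == 0) {
    // Zero or subnormal: mantissa * 2^-24.
    value = std::ldexp(static_cast<float>(mantissa), -24);
  } else if (exponent == 0x1F) {
    // Inf or NaN.
    value = mantissa ? std::nanf("") : std::numeric_limits<float>::infinity();
  } else {
    // Normal: (1 + mantissa/1024) * 2^(exponent - 15).
    value = std::ldexp(1.0f + static_cast<float>(mantissa) / 1024.0f,
                       static_cast<int>(exponent) - 15);
  }
  return sign ? -value : value;
}

int main() {
  // An FP16 initializer stores 2 bytes per element in raw_data().
  // 0x4248 is 3.140625 in binary16, stored little-endian.
  const unsigned char raw_data[2] = {0x48, 0x42};
  uint16_t bits;
  std::memcpy(&bits, raw_data, sizeof(bits));  // safe, alignment-free read
  std::cout << HalfBitsToFloat(bits) << "\n";  // prints 3.14062
  // Reinterpreting raw_data as const float* here would read 4 bytes from a
  // 2-byte buffer -- exactly the misread the PR fixes for FP16 Clip min/max.
  return 0;
}
```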
2 changes: 1 addition & 1 deletion onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
@@ -1323,4 +1323,4 @@ TEST(ConvFp16Test, SharedPrepackedWeights) {
 } // namespace test
 } // namespace onnxruntime

-#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
\ No newline at end of file
+#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
87 changes: 87 additions & 0 deletions onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc
@@ -6,9 +6,11 @@

#include "core/common/logging/logging.h"
#include "core/common/span_utils.h"
#include "core/framework/float16.h"
#include "core/framework/utils.h"
#include "core/graph/graph.h"
#include "core/providers/xnnpack/xnnpack_execution_provider.h"
#include "core/providers/xnnpack/xnnpack_init.h"
#include "core/session/inference_session.h"
#include "core/session/onnxruntime_cxx_api.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
@@ -89,6 +91,91 @@ TEST(XnnpackEP, TestNhwcConvReluClipFusion) {
   RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluClipFusion", std::move(ep), feeds, params);
 }

+#ifdef XNNPACK_FP16_SUPPORTED
+// This test can be removed once Mlas implements FP16 Clip fusion.
+// For now, TestNhwcConvReluClipFusion_FP16 skips output verification.
+TEST(XnnpackEP, TestNhwcConvReluFusion_FP16) {
+  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_conv_relu_model_fp16.onnx";
+
+  RandomValueGenerator generator;
+  TensorShape input_shape_x{1, 16, 16, 192};
+  std::vector<MLFloat16> input_x = generator.Uniform<MLFloat16>(input_shape_x.GetDims(), -128, 128);
+
+  OrtValue ml_value_x;
+  CreateMLValue<MLFloat16>(input_shape_x.GetDims(), input_x.data(), OrtMemoryInfo(), &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("model_input", ml_value_x));
+
+  std::function<void(const Graph&)> verify = [](const Graph& graph) -> void {
+    ASSERT_EQ(graph.NumberOfNodes(), 2) << "Transpose nodes should have been removed, and "
+                                           "Conv+Relu should have been fused, leaving 2 nodes.";
+    auto node_iter = graph.Nodes().begin();
+    auto check_node = [](const Node& node, const std::string& fusion_type) {
+      const auto& attr = node.GetAttributes();
+      auto activation = attr.find("activation");
+      ASSERT_NE(activation, attr.cend()) << "Fused node should have activation attribute";
+      ASSERT_EQ(activation->second.s(), fusion_type);
+    };
+
+    ++node_iter;
+    check_node(*node_iter, "Relu");
+  };
+
+  EPVerificationParams params;
+  params.ep_node_assignment = ExpectedEPNodeAssignment::Some;
+  params.fp32_abs_err = 0.5f;
+  params.graph_verifier = &verify;
+
+  auto ep = DefaultXnnpackExecutionProvider();
+  RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluFusion_FP16", std::move(ep), feeds, params);
+};

+// For now, this test mainly checks whether the Xnnpack Clip fusion works.
+TEST(XnnpackEP, TestNhwcConvReluClipFusion_FP16) {
+  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "nhwc_conv_clip_relu_fp16.onnx";
+
+  RandomValueGenerator generator;
+  TensorShape input_shape_x{1, 16, 16, 192};
+  std::vector<MLFloat16> input_x = generator.Uniform<MLFloat16>(input_shape_x.GetDims(), -128, 128);
+
+  OrtValue ml_value_x;
+  CreateMLValue<MLFloat16>(input_shape_x.GetDims(), input_x.data(), OrtMemoryInfo(), &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("model_input", ml_value_x));
+
+  std::function<void(const Graph&)> verify = [](const Graph& graph) -> void {
+    ASSERT_EQ(graph.NumberOfNodes(), 3) << "Transpose nodes should have been removed, and "
+                                           "Conv+Relu and Conv+Clip should have been fused, leaving 3 nodes.";
+    auto node_iter = graph.Nodes().begin();
+    auto check_node = [](const Node& node, const std::string& fusion_type) {
+      const auto& attr = node.GetAttributes();
+      auto activation = attr.find("activation");
+      ASSERT_NE(activation, attr.cend()) << "Fused node should have activation attribute";
+      ASSERT_EQ(activation->second.s(), fusion_type);
+    };
+
+    // check the 2nd and 3rd nodes:
+    // the first node is the Conv that does not get fused (created after the first call to GetCapability);
+    // the 2nd and 3rd nodes are the fused nodes (created after the second call to GetCapability).
+    ++node_iter;
+    check_node(*node_iter, "Clip");
+    ++node_iter;
+    check_node(*node_iter, "Relu");
+  };
+
+  EPVerificationParams params;
+  params.ep_node_assignment = ExpectedEPNodeAssignment::Some;
+  params.fp32_abs_err = 0.5f;
+  params.graph_verifier = &verify;
+
+  auto ep = DefaultXnnpackExecutionProvider();
+  // So far, the CPU EP doesn't support FP16 Conv fusion, so verify_outputs is skipped.
+  RunAndVerifyOutputsWithEP(ort_model_path, "TestNhwcConvReluClipFusion_FP16", std::move(ep), feeds, params, {}, false);
+}
+#endif  // XNNPACK_FP16_SUPPORTED
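A side note on the loose tolerance these tests use: params.fp32_abs_err = 0.5f plausibly reflects binary16 precision, since one FP16 ULP already equals 0.5 for magnitudes in [512, 1024). A standalone sketch (my own helper, not ORT code) of that ULP arithmetic:

```cpp
#include <cmath>
#include <iostream>

// ULP (spacing between adjacent representable values) of IEEE binary16
// at magnitude m: 2^(floor(log2(m)) - 10) for normal values.
float Fp16UlpAt(float m) {
  int exp;              // frexp gives m = f * 2^exp with f in [0.5, 1),
  std::frexp(m, &exp);  // so floor(log2(m)) == exp - 1.
  return std::ldexp(1.0f, (exp - 1) - 10);
}

int main() {
  std::cout << Fp16UlpAt(512.0f) << "\n";  // prints 0.5
  std::cout << Fp16UlpAt(100.0f) << "\n";  // prints 0.0625
  return 0;
}
```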

 // test we can share the cpu ep allocator with the xnnpack EP
 TEST(XnnpackEP, TestAllocatorSharing) {
   auto init_session = [](std::vector<std::shared_ptr<IExecutionProvider>>& eps,
Binary file not shown.
Binary file not shown.