From 896bf41477f258709c4f05afc69703b8edf965e1 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Thu, 30 Oct 2025 20:46:51 +0100 Subject: [PATCH 1/2] Perform f16 compression to postponed constant input --- .../compress_float_constants.cpp | 62 +++++++++--- .../moe_transpose_weights.cpp | 1 - src/core/src/xml_util/xml_serialize_util.cpp | 7 +- .../tests/pass/serialization/custom_ops.cpp | 98 ++++++++++++++++++- 4 files changed, 148 insertions(+), 20 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp index 69eac733b47767..670f9419c8f8f6 100644 --- a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp @@ -7,6 +7,7 @@ #include "itt.hpp" #include "openvino/core/graph_util.hpp" #include "openvino/core/rt_info.hpp" +#include "openvino/core/type.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convert.hpp" #include "openvino/op/fake_convert.hpp" @@ -196,21 +197,56 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed) return false; } auto constant_target_inputs = const_node->get_output_target_inputs(0); - auto convert = std::make_shared(new_const, const_node->get_element_type()); - - convert->set_friendly_name(const_node->get_friendly_name()); - new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed"); - ov::copy_runtime_info(const_node, convert); - ov::mark_as_decompression(convert); - if (postponed) { - postpone_fp16_compression(new_const->get_rt_info()); - postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info()); - - for (const auto& target_input : constant_target_inputs) { - target_input.replace_source_output(convert); + + // Check if the next node is a postponed constant. It will be constant_folded later during serialization. + auto postponed_constant_node = [&]() -> std::shared_ptr { + if (constant_target_inputs.size() == 1 && + constant_target_inputs.begin()->get_node()->get_rt_info().count("postponed_constant")) { + return constant_target_inputs.begin()->get_node()->shared_from_this(); + } + return nullptr; + }(); + + if (postponed_constant_node && postponed) { + // If f16 conversion is also postponed, we need to insert Convert after the postponed_constant_node + if (is_fp16_compression_postponed(postponed_constant_node->get_rt_info())) { + // Convert was already added after postponed_constant_node. Get it and just update rt info + auto next_node = postponed_constant_node->get_output_target_inputs(0).begin()->get_node(); + OPENVINO_ASSERT(ov::as_type(next_node)); + ov::copy_runtime_info(const_node, next_node->shared_from_this()); + } else { + auto postponed_constant_target_inputs = postponed_constant_node->get_output_target_inputs(0); + auto convert = + std::make_shared(postponed_constant_node, const_node->get_element_type()); + + convert->set_friendly_name(postponed_constant_node->get_friendly_name()); + ov::mark_as_decompression(convert); + ov::copy_runtime_info(const_node, convert); + postponed_constant_node->set_friendly_name(postponed_constant_node->get_friendly_name() + "_compressed"); + postpone_fp16_compression(postponed_constant_node->get_rt_info()); + postpone_fp16_compression(postponed_constant_node->get_output_tensor(0).get_rt_info()); + + for (const auto& target_input : postponed_constant_target_inputs) { + target_input.replace_source_output(convert); + } } } else { - ov::replace_node(const_node, convert); + auto convert = std::make_shared(new_const, const_node->get_element_type()); + + convert->set_friendly_name(const_node->get_friendly_name()); + new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed"); + ov::copy_runtime_info(const_node, convert); + ov::mark_as_decompression(convert); + if (postponed) { + postpone_fp16_compression(new_const->get_rt_info()); + postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info()); + + for (const auto& target_input : constant_target_inputs) { + target_input.replace_source_output(convert); + } + } else { + ov::replace_node(const_node, convert); + } } return true; }; diff --git a/src/common/transformations/src/transformations/common_optimizations/moe_transpose_weights.cpp b/src/common/transformations/src/transformations/common_optimizations/moe_transpose_weights.cpp index 38d3479db0e7e3..ce0cc1229c1521 100644 --- a/src/common/transformations/src/transformations/common_optimizations/moe_transpose_weights.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/moe_transpose_weights.cpp @@ -109,7 +109,6 @@ ov::pass::VectorizedMOE2GEMMTransposeWeights::VectorizedMOE2GEMMTransposeWeights if (ov::is_type(transpose_input.get_node_shared_ptr())) { transpose->get_rt_info()["postponed_constant"] = true; ov::pass::disable_constant_folding(transpose); - ov::disable_fp16_compression(transpose_input.get_node_shared_ptr()); } ov::NodeVector rt_sources{transpose_input.get_node_shared_ptr()}; diff --git a/src/core/src/xml_util/xml_serialize_util.cpp b/src/core/src/xml_util/xml_serialize_util.cpp index 36bae43b0ba9c9..4be9db0aad9f72 100644 --- a/src/core/src/xml_util/xml_serialize_util.cpp +++ b/src/core/src/xml_util/xml_serialize_util.cpp @@ -11,6 +11,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/meta_data.hpp" #include "openvino/core/model.hpp" +#include "openvino/core/rt_info.hpp" #include "openvino/core/runtime_attribute.hpp" #include "openvino/op/binary_convolution.hpp" #include "openvino/op/constant.hpp" @@ -58,14 +59,16 @@ class PostponedConstantReplacer { // clone to keep original node unchanged node_clone = node->clone_with_new_inputs(node->input_values()); node_clone->get_rt_info().erase(ov::pass::DisableConstantFolding::get_type_info_static()); - node = node_clone.get(); } + auto node_to_fold = node_clone ? node_clone : node->shared_from_this(); OPENVINO_ASSERT( - node->constant_fold(outputs, node->input_values()), + node_to_fold->constant_fold(outputs, node_to_fold->input_values()), "Node with set `postponed_constant` attribute cannot be fold to constant when saving model to IR file"); m_constant = outputs[0].get_node_shared_ptr(); m_node = m_constant.get(); m_node->set_friendly_name(node->get_friendly_name()); + ov::copy_runtime_info(node->shared_from_this(), m_constant); + ov::copy_output_runtime_info(node->outputs(), m_constant->outputs()); } } }; diff --git a/src/core/tests/pass/serialization/custom_ops.cpp b/src/core/tests/pass/serialization/custom_ops.cpp index 5b69d84c7f5561..46004d7dcfe054 100644 --- a/src/core/tests/pass/serialization/custom_ops.cpp +++ b/src/core/tests/pass/serialization/custom_ops.cpp @@ -11,11 +11,13 @@ #include "openvino/op/add.hpp" #include "openvino/op/concat.hpp" #include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" #include "openvino/op/multiply.hpp" #include "openvino/pass/constant_folding.hpp" #include "openvino/pass/manager.hpp" #include "openvino/pass/serialize.hpp" #include "openvino/runtime/core.hpp" +#include "transformations/common_optimizations/compress_float_constants.hpp" class CustomOpsSerializationTest : public ::testing::Test { protected: @@ -186,7 +188,7 @@ TEST(PostponedConstantTest, ConcatWithPostponedConstant) { auto model = std::make_shared(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel"); - ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model); + ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model)); } ov::Core core; @@ -230,7 +232,7 @@ TEST(PostponedConstantTest, SubgraphExclusion) { auto model = std::make_shared(final_add->outputs(), ov::ParameterVector{param}, "SubgraphExclusionModel"); - ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model); + ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model)); } ov::Core core; @@ -274,7 +276,7 @@ TEST(PostponedConstantTest, NodeWithMultipleConsumers) { concat->get_rt_info()["postponed_constant"] = true; - ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model); + ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model)); } ov::Core core; @@ -330,7 +332,7 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) { ov::pass::disable_constant_folding(concat); auto model_copy = model->clone(); - ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model); + ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model)); const auto& [success, message] = compare_functions(model_copy, model, true, true, true, true, true); ASSERT_TRUE(success) << message; @@ -358,3 +360,91 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) { ASSERT_TRUE(success) << message; } } + +TEST(PostponedConstantTest, F16Compression2Inputs) { + std::stringstream serialized_xml, serialized_bin; + { + auto const1 = + std::make_shared(ov::element::f32, ov::Shape{2, 2}, std::vector{1, 2, 3, 4}); + auto const2 = + std::make_shared(ov::element::f32, ov::Shape{2, 2}, std::vector{5, 6, 7, 8}); + auto concat = std::make_shared(ov::OutputVector{const1, const2}, 0); + concat->get_rt_info()["postponed_constant"] = true; + + auto param = std::make_shared(ov::element::f32, ov::Shape{4, 2}); + auto add = std::make_shared(concat, param); + + auto model = std::make_shared(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel"); + + // in case of postponed_constant + postponed f16 compression, f16 -> f32 convert should be added after postponed + // constant + bool postponed = true; + ov::pass::compress_model_to_f16(model, postponed); + + ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model)); + } + ov::Core core; + + auto weights = serialized_bin.str(); + ov::Tensor weights_tensor(ov::element::u8, ov::Shape{weights.size()}, weights.data()); + + auto deserialized_model = core.read_model(serialized_xml.str(), weights_tensor); + + { + auto constant = std::make_shared(ov::element::f16, + ov::Shape{4, 2}, + std::vector{1, 2, 3, 4, 5, 6, 7, 8}); + auto convert = std::make_shared(constant, ov::element::f32); + auto param = std::make_shared(ov::element::f32, ov::Shape{4, 2}); + auto add = std::make_shared(convert, param); + + auto expected = std::make_shared(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel"); + + const auto& [success, message] = + compare_functions(deserialized_model, expected, true, false, false, true, true); + ASSERT_TRUE(success) << message; + } +} + +TEST(PostponedConstantTest, F16CompressionNotPostponned) { + std::stringstream serialized_xml, serialized_bin; + auto check_model = [](const std::shared_ptr& model) { + auto const1 = + std::make_shared(ov::element::f16, ov::Shape{2, 2}, std::vector{1, 2, 3, 4}); + auto convert1 = std::make_shared(const1, ov::element::f32); + auto const2 = + std::make_shared(ov::element::f16, ov::Shape{2, 2}, std::vector{5, 6, 7, 8}); + auto convert2 = std::make_shared(const2, ov::element::f32); + auto concat = std::make_shared(ov::OutputVector{convert1, convert2}, 0); + concat->get_rt_info()["postponed_constant"] = true; + + auto param = std::make_shared(ov::element::f32, ov::Shape{4, 2}); + auto add = std::make_shared(concat, param); + + auto expected = std::make_shared(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel"); + + const auto& [success, message] = compare_functions(model, expected, true, false, false, true, true); + ASSERT_TRUE(success) << message; + }; + + { + auto const1 = + std::make_shared(ov::element::f32, ov::Shape{2, 2}, std::vector{1, 2, 3, 4}); + auto const2 = + std::make_shared(ov::element::f32, ov::Shape{2, 2}, std::vector{5, 6, 7, 8}); + auto concat = std::make_shared(ov::OutputVector{const1, const2}, 0); + concat->get_rt_info()["postponed_constant"] = true; + + auto param = std::make_shared(ov::element::f32, ov::Shape{4, 2}); + auto add = std::make_shared(concat, param); + + auto model = std::make_shared(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel"); + + bool postponed = false; + ov::pass::compress_model_to_f16(model, postponed); + + check_model(model); + + ASSERT_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model), ov::Exception); + } +} From 276c994aa996b7a1f9cf187a93dd64345d2e2fd3 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Wed, 5 Nov 2025 13:51:24 +0100 Subject: [PATCH 2/2] Fix tests and code style --- .../common_optimizations/compress_float_constants.cpp | 3 ++- .../common_optimizations/moe_transpose_weights_test.cpp | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp index 670f9419c8f8f6..09aacf6562db84 100644 --- a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp @@ -222,7 +222,8 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed) convert->set_friendly_name(postponed_constant_node->get_friendly_name()); ov::mark_as_decompression(convert); ov::copy_runtime_info(const_node, convert); - postponed_constant_node->set_friendly_name(postponed_constant_node->get_friendly_name() + "_compressed"); + postponed_constant_node->set_friendly_name(postponed_constant_node->get_friendly_name() + + "_compressed"); postpone_fp16_compression(postponed_constant_node->get_rt_info()); postpone_fp16_compression(postponed_constant_node->get_output_tensor(0).get_rt_info()); diff --git a/src/common/transformations/tests/common_optimizations/moe_transpose_weights_test.cpp b/src/common/transformations/tests/common_optimizations/moe_transpose_weights_test.cpp index 6f96188f8f2414..a4f8469ea2b835 100644 --- a/src/common/transformations/tests/common_optimizations/moe_transpose_weights_test.cpp +++ b/src/common/transformations/tests/common_optimizations/moe_transpose_weights_test.cpp @@ -30,7 +30,6 @@ #include "openvino/op/unsqueeze.hpp" #include "openvino/pass/constant_folding.hpp" #include "transformations/rt_info/decompression.hpp" -#include "transformations/rt_info/disable_fp16_compression.hpp" using namespace ov; @@ -85,7 +84,6 @@ std::shared_ptr build_moe_2gemm_model(bool use_decompression, bool wi auto gate_transpose = std::make_shared(convert_input, order); gate_transpose->get_rt_info()["postponed_constant"] = true; ov::pass::disable_constant_folding(gate_transpose); - ov::disable_fp16_compression(gate_const); convert_input = gate_transpose; } @@ -99,7 +97,6 @@ std::shared_ptr build_moe_2gemm_model(bool use_decompression, bool wi auto gate_transpose = std::make_shared(gate_weight_output, order); gate_transpose->get_rt_info()["postponed_constant"] = true; ov::pass::disable_constant_folding(gate_transpose); - ov::disable_fp16_compression(gate_weights); gate_weight_output = gate_transpose; } } @@ -156,7 +153,6 @@ std::shared_ptr build_moe_2gemm_model(bool use_decompression, bool wi auto down_transpose = std::make_shared(convert_input, order); down_transpose->get_rt_info()["postponed_constant"] = true; ov::pass::disable_constant_folding(down_transpose); - ov::disable_fp16_compression(down_const); convert_input = down_transpose; } @@ -177,7 +173,6 @@ std::shared_ptr build_moe_2gemm_model(bool use_decompression, bool wi auto down_transpose = std::make_shared(down_weight_output, order); down_transpose->get_rt_info()["postponed_constant"] = true; ov::pass::disable_constant_folding(down_transpose); - ov::disable_fp16_compression(down_const); down_weight_output = down_transpose; } }