Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "itt.hpp"
#include "openvino/core/graph_util.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/core/type.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/fake_convert.hpp"
Expand Down Expand Up @@ -196,21 +197,57 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed)
return false;
}
auto constant_target_inputs = const_node->get_output_target_inputs(0);
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());

convert->set_friendly_name(const_node->get_friendly_name());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
ov::copy_runtime_info(const_node, convert);
ov::mark_as_decompression(convert);
if (postponed) {
postpone_fp16_compression(new_const->get_rt_info());
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());

for (const auto& target_input : constant_target_inputs) {
target_input.replace_source_output(convert);

// Check if the next node is a postponed constant. It will be constant_folded later during serialization.
auto postponed_constant_node = [&]() -> std::shared_ptr<ov::Node> {
if (constant_target_inputs.size() == 1 &&
constant_target_inputs.begin()->get_node()->get_rt_info().count("postponed_constant")) {
return constant_target_inputs.begin()->get_node()->shared_from_this();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could the output be returned here instead?
Usually the output is already used as an input for other nodes, so there is no need to create another conversion.

}
return nullptr;
}();

if (postponed_constant_node && postponed) {
// If f16 conversion is also postponed, we need to insert Convert after the postponed_constant_node
if (is_fp16_compression_postponed(postponed_constant_node->get_rt_info())) {
// Convert was already added after postponed_constant_node. Get it and just update rt info
auto next_node = postponed_constant_node->get_output_target_inputs(0).begin()->get_node();
OPENVINO_ASSERT(ov::as_type<ov::op::v0::Convert>(next_node));
ov::copy_runtime_info(const_node, next_node->shared_from_this());
} else {
auto postponed_constant_target_inputs = postponed_constant_node->get_output_target_inputs(0);
auto convert =
std::make_shared<ov::op::v0::Convert>(postponed_constant_node, const_node->get_element_type());

convert->set_friendly_name(postponed_constant_node->get_friendly_name());
ov::mark_as_decompression(convert);
ov::copy_runtime_info(const_node, convert);
postponed_constant_node->set_friendly_name(postponed_constant_node->get_friendly_name() +
"_compressed");
postpone_fp16_compression(postponed_constant_node->get_rt_info());
postpone_fp16_compression(postponed_constant_node->get_output_tensor(0).get_rt_info());

for (const auto& target_input : postponed_constant_target_inputs) {
target_input.replace_source_output(convert);
}
}
} else {
ov::replace_node(const_node, convert);
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());

convert->set_friendly_name(const_node->get_friendly_name());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
ov::copy_runtime_info(const_node, convert);
ov::mark_as_decompression(convert);
if (postponed) {
postpone_fp16_compression(new_const->get_rt_info());
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());

for (const auto& target_input : constant_target_inputs) {
target_input.replace_source_output(convert);
}
} else {
ov::replace_node(const_node, convert);
}
}
return true;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ ov::pass::VectorizedMOE2GEMMTransposeWeights::VectorizedMOE2GEMMTransposeWeights
if (ov::is_type<ov::op::v0::Constant>(transpose_input.get_node_shared_ptr())) {
transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(transpose);
ov::disable_fp16_compression(transpose_input.get_node_shared_ptr());
}

ov::NodeVector rt_sources{transpose_input.get_node_shared_ptr()};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include "openvino/op/unsqueeze.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "transformations/rt_info/decompression.hpp"
#include "transformations/rt_info/disable_fp16_compression.hpp"

using namespace ov;

Expand Down Expand Up @@ -85,7 +84,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto gate_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
gate_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(gate_transpose);
ov::disable_fp16_compression(gate_const);
convert_input = gate_transpose;
}

Expand All @@ -99,7 +97,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto gate_transpose = std::make_shared<op::v1::Transpose>(gate_weight_output, order);
gate_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(gate_transpose);
ov::disable_fp16_compression(gate_weights);
gate_weight_output = gate_transpose;
}
}
Expand Down Expand Up @@ -156,7 +153,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto down_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
down_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(down_transpose);
ov::disable_fp16_compression(down_const);
convert_input = down_transpose;
}

Expand All @@ -177,7 +173,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto down_transpose = std::make_shared<op::v1::Transpose>(down_weight_output, order);
down_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(down_transpose);
ov::disable_fp16_compression(down_const);
down_weight_output = down_transpose;
}
}
Expand Down
7 changes: 5 additions & 2 deletions src/core/src/xml_util/xml_serialize_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "openvino/core/except.hpp"
#include "openvino/core/meta_data.hpp"
#include "openvino/core/model.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/core/runtime_attribute.hpp"
#include "openvino/op/binary_convolution.hpp"
#include "openvino/op/constant.hpp"
Expand Down Expand Up @@ -58,14 +59,16 @@ class PostponedConstantReplacer {
// clone to keep original node unchanged
node_clone = node->clone_with_new_inputs(node->input_values());
node_clone->get_rt_info().erase(ov::pass::DisableConstantFolding::get_type_info_static());
node = node_clone.get();
}
auto node_to_fold = node_clone ? node_clone : node->shared_from_this();
OPENVINO_ASSERT(
node->constant_fold(outputs, node->input_values()),
node_to_fold->constant_fold(outputs, node_to_fold->input_values()),
"Node with set `postponed_constant` attribute cannot be fold to constant when saving model to IR file");
m_constant = outputs[0].get_node_shared_ptr();
m_node = m_constant.get();
m_node->set_friendly_name(node->get_friendly_name());
ov::copy_runtime_info(node->shared_from_this(), m_constant);
ov::copy_output_runtime_info(node->outputs(), m_constant->outputs());
}
}
};
Expand Down
98 changes: 94 additions & 4 deletions src/core/tests/pass/serialization/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
#include "openvino/op/add.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/pass/serialize.hpp"
#include "openvino/runtime/core.hpp"
#include "transformations/common_optimizations/compress_float_constants.hpp"

class CustomOpsSerializationTest : public ::testing::Test {
protected:
Expand Down Expand Up @@ -186,7 +188,7 @@ TEST(PostponedConstantTest, ConcatWithPostponedConstant) {

auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");

ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
}
ov::Core core;

Expand Down Expand Up @@ -230,7 +232,7 @@ TEST(PostponedConstantTest, SubgraphExclusion) {
auto model =
std::make_shared<ov::Model>(final_add->outputs(), ov::ParameterVector{param}, "SubgraphExclusionModel");

ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
}
ov::Core core;

Expand Down Expand Up @@ -274,7 +276,7 @@ TEST(PostponedConstantTest, NodeWithMultipleConsumers) {

concat->get_rt_info()["postponed_constant"] = true;

ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
}
ov::Core core;

Expand Down Expand Up @@ -330,7 +332,7 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
ov::pass::disable_constant_folding(concat);

auto model_copy = model->clone();
ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));

const auto& [success, message] = compare_functions(model_copy, model, true, true, true, true, true);
ASSERT_TRUE(success) << message;
Expand Down Expand Up @@ -358,3 +360,91 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
ASSERT_TRUE(success) << message;
}
}

// Verifies the interaction of a `postponed_constant` node with postponed FP16
// compression: serialization must fold the postponed subgraph into a single
// f16 Constant followed by a decompressing f16 -> f32 Convert.
TEST(PostponedConstantTest, F16Compression2Inputs) {
    std::stringstream serialized_xml, serialized_bin;
    {
        auto weights_a =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
        auto weights_b =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{weights_a, weights_b}, 0);
        concat->get_rt_info()["postponed_constant"] = true;

        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(concat, input);
        auto model = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        // With postponed_constant combined with postponed f16 compression, the
        // f16 -> f32 Convert is expected to be inserted after the postponed constant.
        const bool postponed = true;
        ov::pass::compress_model_to_f16(model, postponed);

        ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
    }

    auto bin_content = serialized_bin.str();
    ov::Tensor weights_tensor(ov::element::u8, ov::Shape{bin_content.size()}, bin_content.data());

    ov::Core core;
    auto deserialized_model = core.read_model(serialized_xml.str(), weights_tensor);

    {
        // Reference graph: the two f32 constants folded into one f16 constant,
        // followed by a Convert that decompresses it back to f32.
        auto folded_const = std::make_shared<ov::op::v0::Constant>(ov::element::f16,
                                                                   ov::Shape{4, 2},
                                                                   std::vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
        auto decompress = std::make_shared<ov::op::v0::Convert>(folded_const, ov::element::f32);
        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(decompress, input);
        auto expected = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        const auto& [success, message] =
            compare_functions(deserialized_model, expected, true, false, false, true, true);
        ASSERT_TRUE(success) << message;
    }
}

// Verifies non-postponed f16 compression applied to the inputs of a
// `postponed_constant` node: each f32 constant is compressed in place into an
// f16 Constant + Convert pair, and serializing such a model must throw,
// since the postponed subgraph can no longer be folded to a single constant.
TEST(PostponedConstantTest, F16CompressionNotPostponned) {
    std::stringstream serialized_xml, serialized_bin;

    // Compares `model` against the reference graph expected after in-place
    // (non-postponed) compression.
    auto check_model = [](const std::shared_ptr<ov::Model>& model) {
        auto lhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
        auto lhs_convert = std::make_shared<ov::op::v0::Convert>(lhs_const, ov::element::f32);
        auto rhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
        auto rhs_convert = std::make_shared<ov::op::v0::Convert>(rhs_const, ov::element::f32);
        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{lhs_convert, rhs_convert}, 0);
        concat->get_rt_info()["postponed_constant"] = true;

        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(concat, input);
        auto expected = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        const auto& [success, message] = compare_functions(model, expected, true, false, false, true, true);
        ASSERT_TRUE(success) << message;
    };

    {
        auto lhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
        auto rhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{lhs_const, rhs_const}, 0);
        concat->get_rt_info()["postponed_constant"] = true;

        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(concat, input);
        auto model = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        const bool postponed = false;
        ov::pass::compress_model_to_f16(model, postponed);

        check_model(model);

        // Serialization cannot fold the postponed constant anymore — it must throw.
        ASSERT_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model), ov::Exception);
    }
}
Loading