Perform f16 compression on the input of a postponed constant

olpipi · olpipi · commit 0733d604bf49 · 2025-10-30T20:46:51.000+01:00
diff --git a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp
@@ -196,7 +196,28 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed)
             return false;
         }
         auto constant_target_inputs = const_node->get_output_target_inputs(0);
-        auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
+
+        std::shared_ptr<ov::Node> postponed_constant_node;
+        decltype(constant_target_inputs) postponed_constant_node_target_inputs;
+        bool is_postponed_constant_next = [&]() {
+            if (constant_target_inputs.size() == 1 &&
+                constant_target_inputs.begin()->get_node()->get_rt_info().count("postponed_constant")) {
+                postponed_constant_node = constant_target_inputs.begin()->get_node()->shared_from_this();
+                postponed_constant_node_target_inputs = postponed_constant_node->get_output_target_inputs(0);
+                return true;
+            }
+            return false;
+        }();
+        // is_postponed_constant_next: the constant's only consumer is constant-folded later, during serialization.
+        // If f16 compression is also postponed, the Convert node must be inserted after that postponed_constant node.
+
+        std::shared_ptr<ov::Node> convert;
+        if (is_postponed_constant_next && postponed) {
+            convert = std::make_shared<ov::op::v0::Convert>(postponed_constant_node, const_node->get_element_type());
+            postponed_constant_node->set_friendly_name(const_node->get_friendly_name() + "_compressed");
+        } else {
+            convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
+        }
 
         convert->set_friendly_name(const_node->get_friendly_name());
         new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
@@ -206,7 +227,10 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed)
             postpone_fp16_compression(new_const->get_rt_info());
             postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
 
-            for (const auto& target_input : constant_target_inputs) {
+            auto target_inputs_to_replace = is_postponed_constant_next
+                                                        ? postponed_constant_node_target_inputs
+                                                        : constant_target_inputs;
+            for (const auto& target_input : target_inputs_to_replace) {
                 target_input.replace_source_output(convert);
             }
         } else {
diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_const_transposes_extraction.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_const_transposes_extraction.cpp
@@ -43,9 +43,6 @@ ov::pass::MatMulConstTransposesExtraction::MatMulConstTransposesExtraction() {
             transpose->get_rt_info()["postponed_constant"] = true;
             // disable constant folding here to postpone it to serialization step
             ov::pass::disable_constant_folding(transpose);
-            // disable fp16 compression. Otherwise an additional conversion will be added after the constant, which
-            // breaks postponed_constant serialization
-            ov::disable_fp16_compression(weights.get_node_shared_ptr());
         }
         auto new_matmul = std::make_shared<ov::op::v0::MatMul>(pattern_value_map.at(data_pattern),
                                                                transpose,