Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "itt.hpp"
#include "openvino/core/graph_util.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/core/type.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/fake_convert.hpp"
Expand Down Expand Up @@ -196,21 +197,57 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed)
return false;
}
auto constant_target_inputs = const_node->get_output_target_inputs(0);
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());

convert->set_friendly_name(const_node->get_friendly_name());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
ov::copy_runtime_info(const_node, convert);
ov::mark_as_decompression(convert);
if (postponed) {
postpone_fp16_compression(new_const->get_rt_info());
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());

for (const auto& target_input : constant_target_inputs) {
target_input.replace_source_output(convert);

// Check if the next node is a postponed constant. It will be constant_folded later during serialization.
auto postponed_constant_node = [&]() -> std::shared_ptr<ov::Node> {
if (constant_target_inputs.size() == 1 &&
constant_target_inputs.begin()->get_node()->get_rt_info().count("postponed_constant")) {
return constant_target_inputs.begin()->get_node()->shared_from_this();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could the output be returned here instead?
Usually the output is already used as an input for other nodes, so there is no need to create another conversion.

}
return nullptr;
}();

if (postponed_constant_node && postponed) {
// If f16 conversion is also postponed, we need to insert Convert after the postponed_constant_node
if (is_fp16_compression_postponed(postponed_constant_node->get_rt_info())) {
// Convert was already added after postponed_constant_node. Get it and just update rt info
auto next_node = postponed_constant_node->get_output_target_inputs(0).begin()->get_node();
OPENVINO_ASSERT(ov::as_type<ov::op::v0::Convert>(next_node));
ov::copy_runtime_info(const_node, next_node->shared_from_this());
} else {
auto postponed_constant_target_inputs = postponed_constant_node->get_output_target_inputs(0);
auto convert =
std::make_shared<ov::op::v0::Convert>(postponed_constant_node, const_node->get_element_type());

convert->set_friendly_name(postponed_constant_node->get_friendly_name());
ov::mark_as_decompression(convert);
ov::copy_runtime_info(const_node, convert);
postponed_constant_node->set_friendly_name(postponed_constant_node->get_friendly_name() +
"_compressed");
postpone_fp16_compression(postponed_constant_node->get_rt_info());
postpone_fp16_compression(postponed_constant_node->get_output_tensor(0).get_rt_info());

for (const auto& target_input : postponed_constant_target_inputs) {
target_input.replace_source_output(convert);
}
}
} else {
ov::replace_node(const_node, convert);
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());

convert->set_friendly_name(const_node->get_friendly_name());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
ov::copy_runtime_info(const_node, convert);
ov::mark_as_decompression(convert);
if (postponed) {
postpone_fp16_compression(new_const->get_rt_info());
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());

for (const auto& target_input : constant_target_inputs) {
target_input.replace_source_output(convert);
}
} else {
ov::replace_node(const_node, convert);
}
}
return true;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ ov::pass::VectorizedMOE2GEMMTransposeWeights::VectorizedMOE2GEMMTransposeWeights
if (ov::is_type<ov::op::v0::Constant>(transpose_input.get_node_shared_ptr())) {
transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(transpose);
ov::disable_fp16_compression(transpose_input.get_node_shared_ptr());
}

ov::NodeVector rt_sources{transpose_input.get_node_shared_ptr()};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
#include "openvino/op/unsqueeze.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "transformations/rt_info/decompression.hpp"
#include "transformations/rt_info/disable_fp16_compression.hpp"

using namespace ov;

Expand Down Expand Up @@ -85,7 +84,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto gate_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
gate_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(gate_transpose);
ov::disable_fp16_compression(gate_const);
convert_input = gate_transpose;
}

Expand All @@ -99,7 +97,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto gate_transpose = std::make_shared<op::v1::Transpose>(gate_weight_output, order);
gate_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(gate_transpose);
ov::disable_fp16_compression(gate_weights);
gate_weight_output = gate_transpose;
}
}
Expand Down Expand Up @@ -156,7 +153,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto down_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
down_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(down_transpose);
ov::disable_fp16_compression(down_const);
convert_input = down_transpose;
}

Expand All @@ -177,7 +173,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
auto down_transpose = std::make_shared<op::v1::Transpose>(down_weight_output, order);
down_transpose->get_rt_info()["postponed_constant"] = true;
ov::pass::disable_constant_folding(down_transpose);
ov::disable_fp16_compression(down_const);
down_weight_output = down_transpose;
}
}
Expand Down
7 changes: 5 additions & 2 deletions src/core/src/xml_util/xml_serialize_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "openvino/core/except.hpp"
#include "openvino/core/meta_data.hpp"
#include "openvino/core/model.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/core/runtime_attribute.hpp"
#include "openvino/op/binary_convolution.hpp"
#include "openvino/op/constant.hpp"
Expand Down Expand Up @@ -58,14 +59,16 @@ class PostponedConstantReplacer {
// clone to keep original node unchanged
node_clone = node->clone_with_new_inputs(node->input_values());
node_clone->get_rt_info().erase(ov::pass::DisableConstantFolding::get_type_info_static());
node = node_clone.get();
}
auto node_to_fold = node_clone ? node_clone : node->shared_from_this();
OPENVINO_ASSERT(
node->constant_fold(outputs, node->input_values()),
node_to_fold->constant_fold(outputs, node_to_fold->input_values()),
"Node with set `postponed_constant` attribute cannot be fold to constant when saving model to IR file");
m_constant = outputs[0].get_node_shared_ptr();
m_node = m_constant.get();
m_node->set_friendly_name(node->get_friendly_name());
ov::copy_runtime_info(node->shared_from_this(), m_constant);
ov::copy_output_runtime_info(node->outputs(), m_constant->outputs());
}
}
};
Expand Down
98 changes: 94 additions & 4 deletions src/core/tests/pass/serialization/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
#include "openvino/op/add.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/pass/serialize.hpp"
#include "openvino/runtime/core.hpp"
#include "transformations/common_optimizations/compress_float_constants.hpp"

class CustomOpsSerializationTest : public ::testing::Test {
protected:
Expand Down Expand Up @@ -186,7 +188,7 @@ TEST(PostponedConstantTest, ConcatWithPostponedConstant) {

auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");

ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
}
ov::Core core;

Expand Down Expand Up @@ -230,7 +232,7 @@ TEST(PostponedConstantTest, SubgraphExclusion) {
auto model =
std::make_shared<ov::Model>(final_add->outputs(), ov::ParameterVector{param}, "SubgraphExclusionModel");

ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
}
ov::Core core;

Expand Down Expand Up @@ -274,7 +276,7 @@ TEST(PostponedConstantTest, NodeWithMultipleConsumers) {

concat->get_rt_info()["postponed_constant"] = true;

ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
}
ov::Core core;

Expand Down Expand Up @@ -330,7 +332,7 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
ov::pass::disable_constant_folding(concat);

auto model_copy = model->clone();
ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));

const auto& [success, message] = compare_functions(model_copy, model, true, true, true, true, true);
ASSERT_TRUE(success) << message;
Expand Down Expand Up @@ -358,3 +360,91 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
ASSERT_TRUE(success) << message;
}
}

// Verifies the interaction of a `postponed_constant` node with postponed FP16
// compression: serialization must fold the postponed subgraph into a single
// f16 Constant followed by a decompressing f16 -> f32 Convert.
TEST(PostponedConstantTest, F16Compression2Inputs) {
    std::stringstream serialized_xml, serialized_bin;
    {
        auto weights_a =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
        auto weights_b =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{weights_a, weights_b}, 0);
        concat->get_rt_info()["postponed_constant"] = true;

        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(concat, input);
        auto model = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        // With postponed_constant combined with postponed f16 compression, the
        // f16 -> f32 Convert is expected to be inserted after the postponed constant.
        const bool postponed = true;
        ov::pass::compress_model_to_f16(model, postponed);

        ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
    }

    auto bin_content = serialized_bin.str();
    ov::Tensor weights_tensor(ov::element::u8, ov::Shape{bin_content.size()}, bin_content.data());

    ov::Core core;
    auto deserialized_model = core.read_model(serialized_xml.str(), weights_tensor);

    {
        // Reference graph: the two f32 constants folded into one f16 constant,
        // followed by a Convert that decompresses it back to f32.
        auto folded_const = std::make_shared<ov::op::v0::Constant>(ov::element::f16,
                                                                   ov::Shape{4, 2},
                                                                   std::vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
        auto decompress = std::make_shared<ov::op::v0::Convert>(folded_const, ov::element::f32);
        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(decompress, input);
        auto expected = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        const auto& [success, message] =
            compare_functions(deserialized_model, expected, true, false, false, true, true);
        ASSERT_TRUE(success) << message;
    }
}

// Verifies non-postponed f16 compression applied to the inputs of a
// `postponed_constant` node: each f32 constant is compressed in place into an
// f16 Constant + Convert pair, and serializing such a model must throw,
// since the postponed subgraph can no longer be folded to a single constant.
TEST(PostponedConstantTest, F16CompressionNotPostponned) {
    std::stringstream serialized_xml, serialized_bin;

    // Compares `model` against the reference graph expected after in-place
    // (non-postponed) compression.
    auto check_model = [](const std::shared_ptr<ov::Model>& model) {
        auto lhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
        auto lhs_convert = std::make_shared<ov::op::v0::Convert>(lhs_const, ov::element::f32);
        auto rhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
        auto rhs_convert = std::make_shared<ov::op::v0::Convert>(rhs_const, ov::element::f32);
        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{lhs_convert, rhs_convert}, 0);
        concat->get_rt_info()["postponed_constant"] = true;

        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(concat, input);
        auto expected = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        const auto& [success, message] = compare_functions(model, expected, true, false, false, true, true);
        ASSERT_TRUE(success) << message;
    };

    {
        auto lhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
        auto rhs_const =
            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{lhs_const, rhs_const}, 0);
        concat->get_rt_info()["postponed_constant"] = true;

        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
        auto sum = std::make_shared<ov::op::v1::Add>(concat, input);
        auto model = std::make_shared<ov::Model>(sum->outputs(), ov::ParameterVector{input}, "ConcatAddModel");

        const bool postponed = false;
        ov::pass::compress_model_to_f16(model, postponed);

        check_model(model);

        // Serialization cannot fold the postponed constant anymore — it must throw.
        ASSERT_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model), ov::Exception);
    }
}
Loading