From de1aa08166514f77021eb8f8c5c86f09831bf363 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 15 Oct 2024 19:06:43 +0200 Subject: [PATCH 1/5] [Transformation] LoraSubgraph fusion --- .../include/ov_ops/lora_subgraph.hpp | 27 +++ .../lora_subgraph_fusion.hpp | 25 +++ .../src/ov_ops/lora_subgraph.cpp | 36 +++ .../lora_subgraph_fusion.cpp | 108 +++++++++ .../lora_subgraph_fusion.cpp | 208 ++++++++++++++++++ 5 files changed, 404 insertions(+) create mode 100644 src/common/transformations/include/ov_ops/lora_subgraph.hpp create mode 100644 src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp create mode 100644 src/common/transformations/src/ov_ops/lora_subgraph.cpp create mode 100644 src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp create mode 100644 src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp diff --git a/src/common/transformations/include/ov_ops/lora_subgraph.hpp b/src/common/transformations/include/ov_ops/lora_subgraph.hpp new file mode 100644 index 00000000000000..e4511823e50509 --- /dev/null +++ b/src/common/transformations/include/ov_ops/lora_subgraph.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "openvino/op/util/sub_graph_base.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace op { +namespace internal { +class TRANSFORMATIONS_API LoraSubgraph : public ov::op::util::SubGraphOp { +public: + OPENVINO_OP("LoraSubgraph", "ie_internal_opset"); + + LoraSubgraph() = default; + LoraSubgraph(const OutputVector& args, const std::shared_ptr& body); + + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp new file mode 100644 index 00000000000000..8422ad95f262c6 --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/lora_subgraph_fusion.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API LoraSubgraphFusion; + +} // namespace pass +} // namespace ov + +class ov::pass::LoraSubgraphFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("LoraSubgraphFusion", "0"); + LoraSubgraphFusion(); +}; diff --git a/src/common/transformations/src/ov_ops/lora_subgraph.cpp b/src/common/transformations/src/ov_ops/lora_subgraph.cpp new file mode 100644 index 00000000000000..185dca47719f21 --- /dev/null +++ b/src/common/transformations/src/ov_ops/lora_subgraph.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/lora_subgraph.hpp" + +namespace ov { +namespace op { +namespace internal { + +LoraSubgraph::LoraSubgraph(const OutputVector& args, const std::shared_ptr& body) : SubGraphOp(args) { + OPENVINO_ASSERT(args.size() == 5, "LoraSubgraph must have 5 inputs"); + SubGraphOp::set_function(body); + for (size_t i = 0; i < body->get_parameters().size(); ++i) + m_input_descriptions[0].push_back(std::make_shared(i, i)); + for (size_t i = 0; i < body->get_output_size(); ++i) + m_output_descriptions[0].push_back(std::make_shared(i, i)); + constructor_validate_and_infer_types(); +} + +std::shared_ptr LoraSubgraph::clone_with_new_inputs(const OutputVector& new_args) const { + check_new_args_count(this, new_args); + return std::make_shared(new_args, get_function()->clone()); +} + +void LoraSubgraph::validate_and_infer_types() { + const auto& body = get_function(); + OPENVINO_ASSERT(body, "LoraSubgraph must have initialized body"); + validate_and_infer_type_body(body, m_input_descriptions[0]); + for (size_t i = 0; i < get_output_size(); ++i) + set_output_type(i, body->get_output_element_type(i), body->get_output_partial_shape(i)); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp new file mode 100644 index 00000000000000..913ca32f99b06b --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/lora_subgraph_fusion.hpp" + +#include +#include + +#include "itt.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/convolution.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/util/read_value_base.hpp" +#include "openvino/pass/pattern/op/optional.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/lora_subgraph.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { + MATCHER_SCOPE(LoraSubgraphFusion); + using namespace pass::pattern; + auto input_m = any_input(); + auto transpose_const1_m = wrap_type(consumers_count(1)); + auto transpose1_m = optional({input_m, transpose_const1_m}, consumers_count(1)); + auto read_value1_m = wrap_type(); + auto matmul1_m = wrap_type({transpose1_m, read_value1_m}, consumers_count(1)); + auto read_value2_m = wrap_type(); + auto multiply_m = wrap_type({matmul1_m, read_value2_m}, consumers_count(1)); + auto read_value3_m = wrap_type(); + auto matmul2_m = wrap_type({multiply_m, read_value3_m}, consumers_count(1)); + auto transpose_const2_m = wrap_type(consumers_count(1)); + auto transpose2_m = optional({matmul2_m, transpose_const2_m}, consumers_count(1)); + auto external_matmul_m = wrap_type({input_m, any_input()}); + auto add_m = wrap_type({transpose2_m, external_matmul_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + const auto& input = pattern_map.at(input_m); + const auto& matmul1 = pattern_map.at(matmul1_m); + const auto& read_value1 = pattern_map.at(read_value1_m); + const auto& multiply = pattern_map.at(multiply_m); + const auto& read_value2 = pattern_map.at(read_value2_m); + const auto& matmul2 = pattern_map.at(matmul2_m); + const auto& read_value3 = pattern_map.at(read_value3_m); + const auto& external_matmul = pattern_map.at(external_matmul_m); + const auto& add = pattern_map.at(add_m); + + const auto add_node = add.get_node_shared_ptr(); + if (transformation_callback(add_node)) { + return false; + } + + auto find_connected_input = [](ov::Node* child, ov::Node* parent) { + for (size_t i = 0; i < child->get_input_size(); ++i) { + auto input = child->input(i); + if (input.get_source_output().get_node() == parent) + return input; + } + OPENVINO_THROW("Ops are not connected"); + }; + + std::vector> internal_inputs{ + pattern_map.count(transpose1_m) ? pattern_map.at(transpose1_m).get_node()->input(0) + : matmul1.get_node()->input(0), + matmul1.get_node()->input(1), + // For commutative eltwise ops, input idx may be any, so it must be computed + find_connected_input(multiply.get_node(), read_value2.get_node()), + matmul2.get_node()->input(1), + find_connected_input(add.get_node(), external_matmul.get_node()), + }; + + ov::ParameterVector subgraph_parameters; + subgraph_parameters.reserve(internal_inputs.size()); + for (auto& input : internal_inputs) { + const auto new_parameter = std::make_shared(input.get_element_type(), input.get_partial_shape()); + subgraph_parameters.push_back(new_parameter); + input.replace_source_output(new_parameter); + } + // Note: lora consumers should be taken before lora_subgraph creation, + // because only original consumers should be replaced with lora's output + const auto& lora_consumers = add.get_target_inputs(); + auto lora_subgraph = std::make_shared(ov::OutputVector{add}, subgraph_parameters); + + ov::OutputVector external_connections{ + input, + read_value1, + read_value2, + read_value3, + external_matmul, + }; + const auto lora_node = std::make_shared(external_connections, lora_subgraph); + ov::copy_runtime_info(m.get_matched_nodes(), lora_node); + lora_node->set_friendly_name(add_node->get_friendly_name()); + + for (const auto& consumer : lora_consumers) + consumer.replace_source_output(lora_node->output(0)); + if (!add.get_names().empty()) + lora_node->output(0).set_names(add.get_names()); + return true; + }; + + auto m = std::make_shared(add_m, matcher_name); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp new file mode 100644 index 00000000000000..a230e83a194450 --- /dev/null +++ b/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp @@ -0,0 +1,208 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/lora_subgraph_fusion.hpp" + +#include + +#include + +#include "common_test_utils/node_builders/convolution.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/core/model.hpp" +#include "openvino/opsets/opset15.hpp" +#include "ov_ops/lora_subgraph.hpp" +#include "transformations/utils/utils.hpp" + +using namespace testing; +using namespace ov; + +static constexpr auto netType = ov::element::f32; + +std::pair create_states(const std::vector& shapes) { + ov::OutputVector read_values; + ov::SinkVector assigns; + size_t idx = 0; + auto create_state = [&](const ov::PartialShape& shape) { + auto variable = std::make_shared(ov::op::util::VariableInfo{shape, netType, std::to_string(idx++)}); + auto read_value = std::make_shared(variable); + auto assign = std::make_shared(read_value, variable); + read_values.push_back(read_value); + assigns.push_back(assign); + }; + for (const auto& shape : shapes) + create_state(shape); + return std::make_pair(read_values, assigns); +} + +std::shared_ptr create_lora_subgraph(const ov::Output& lora_input, + const ov::Output& data_flow, + const ov::OutputVector& states, + bool add_transposes, + size_t mul_read_value_idx = 1, + size_t add_data_flow_idx = 0) { + OPENVINO_ASSERT(states.size() == 3, "get_lora_subgraph expects states size == 3"); + OPENVINO_ASSERT(mul_read_value_idx == 0 || mul_read_value_idx == 1, "mul_read_value_idx must be 0 or 1"); + OPENVINO_ASSERT(add_data_flow_idx == 0 || add_data_flow_idx == 1, "add_data_flow_idx must be 0 or 1"); + + auto create_transpose = [](const ov::Output& input) -> ov::Output { + auto constant = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {2, 3, 0, 1}); + return std::make_shared(input, constant); + }; + + const auto& mm1_input = add_transposes ? create_transpose(lora_input) : lora_input; + auto mm1 = std::make_shared(mm1_input, states[0], false, true); + + const auto& mul_in_0 = mul_read_value_idx == 0 ? states[1] : mm1->output(0); + const auto& mul_in_1 = mul_read_value_idx == 0 ? mm1->output(0) : states[1]; + auto mul = std::make_shared(mul_in_0, mul_in_1); + + auto mm2 = std::make_shared(mul, states[2], false, true); + + const auto& add_sec_input = add_transposes ? create_transpose(mm2) : mm2; + const auto& add_in_0 = add_data_flow_idx == 0 ? data_flow : add_sec_input; + const auto& add_in_1 = add_data_flow_idx == 0 ? add_sec_input : data_flow; + return std::make_shared(add_in_0, add_in_1); +} + +class LoraSubgraphFusionTests : public TransformationTestsF { +public: + void SetUp() override { + TransformationTestsF::SetUp(); + manager.register_pass(); + } +}; + +class LoraSubgraphFusionMatMulTests : public LoraSubgraphFusionTests { +public: + const ov::Dimension K = 563; + const ov::Dimension N = 2048; + ov::PartialShape shape_x = {-1, -1, K}; + ov::PartialShape shape_w = {N, K}; + ov::PartialShape shape_state_1 = {-1, K}; + ov::PartialShape shape_state_2 = {1, -1}; + ov::PartialShape shape_state_3 = {N, -1}; +}; + + +TEST_F(LoraSubgraphFusionMatMulTests, StandardPattern) { + { + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_y, param_w, false, true); + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + auto lora_subgraph = create_lora_subgraph(param_y, main_mm, states.first, false); + model = std::make_shared(OutputVector{lora_subgraph, main_mm}, states.second, ParameterVector{param_y, param_w}); + } + { + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_y, param_w, false, true); + + auto inner_param_y = std::make_shared(netType, shape_x); + auto inner_state_1 = std::make_shared(netType, shape_state_1); + auto inner_state_2 = std::make_shared(netType, shape_state_2); + auto inner_state_3 = std::make_shared(netType, shape_state_3); + auto inner_param_mm = std::make_shared(netType, main_mm->get_output_partial_shape(0)); + + ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; + auto lora_subgraph = create_lora_subgraph(inner_param_y, inner_param_mm, states_outs, false); + ov::ParameterVector inner_params{inner_param_y, inner_state_1, inner_state_2, inner_state_3, inner_param_mm}; + auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); + + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + ov::OutputVector lora_inputs{param_y, states.first[0], states.first[1], states.first[2], main_mm}; + auto lora = std::make_shared(lora_inputs, inner_model); + + model_ref = std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_y, param_w}); + } +} + +TEST_F(LoraSubgraphFusionMatMulTests, ReshaffledEltwiseInputs) { + { + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_y, param_w, false, true); + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + auto lora_subgraph = create_lora_subgraph(param_y, main_mm, states.first, false, 0, 1); + model = std::make_shared(OutputVector{lora_subgraph, main_mm}, states.second, ParameterVector{param_y, param_w}); + } + { + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + auto main_mm = std::make_shared(param_y, param_w, false, true); + + auto inner_param_y = std::make_shared(netType, shape_x); + auto inner_state_1 = std::make_shared(netType, shape_state_1); + auto inner_state_2 = std::make_shared(netType, shape_state_2); + auto inner_state_3 = std::make_shared(netType, shape_state_3); + auto inner_param_mm = std::make_shared(netType, main_mm->get_output_partial_shape(0)); + + ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; + auto lora_subgraph = create_lora_subgraph(inner_param_y, inner_param_mm, states_outs, false, 0, 1); + ov::ParameterVector inner_params{inner_param_y, inner_state_1, inner_state_2, inner_state_3, inner_param_mm}; + auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); + + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + ov::OutputVector lora_inputs{param_y, states.first[0], states.first[1], states.first[2], main_mm}; + auto lora = std::make_shared(lora_inputs, inner_model); + + model_ref = std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_y, param_w}); + } +} + +class LoraSubgraphFusionConvolutionTests : public LoraSubgraphFusionTests { +public: + const ov::Dimension num_channels = 320; + ov::PartialShape shape_x = {-1, num_channels, -1, -1}; + ov::PartialShape shape_state_1 = {-1, num_channels}; + ov::PartialShape shape_state_2 = {1, -1}; + ov::PartialShape shape_state_3 = {num_channels, -1}; +}; + +TEST_F(LoraSubgraphFusionConvolutionTests, StandardPattern) { + { + auto param_y = std::make_shared(netType, shape_x); + auto main_conv = ov::test::utils::make_convolution(param_y, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + num_channels.get_length()); + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + auto lora_subgraph = create_lora_subgraph(param_y, main_conv, states.first, true); + model = std::make_shared(OutputVector{lora_subgraph, main_conv}, states.second, ParameterVector{param_y}); + } + { + auto param_y = std::make_shared(netType, shape_x); + auto main_conv = ov::test::utils::make_convolution(param_y, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + num_channels.get_length()); + + auto inner_param_y = std::make_shared(netType, shape_x); + auto inner_state_1 = std::make_shared(netType, shape_state_1); + auto inner_state_2 = std::make_shared(netType, shape_state_2); + auto inner_state_3 = std::make_shared(netType, shape_state_3); + auto inner_param_mm = std::make_shared(netType, main_conv->get_output_partial_shape(0)); + auto lora_subgraph = create_lora_subgraph(inner_param_y, inner_param_mm, ov::OutputVector{inner_state_1, inner_state_2, inner_state_3}, true); + ov::ParameterVector inner_params{inner_param_y, inner_state_1, inner_state_2, inner_state_3, inner_param_mm}; + auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); + + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); + ov::OutputVector lora_inputs{param_y, states.first[0], states.first[1], states.first[2], main_conv}; + auto lora = std::make_shared(lora_inputs, inner_model); + + model_ref = std::make_shared(OutputVector{lora, main_conv}, states.second, ParameterVector{param_y}); + } +} From 3246da7a4151bcc8410dc4f9863520adb2659360 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 16 Oct 2024 10:39:58 +0200 Subject: [PATCH 2/5] Final refactoring --- .../include/ov_ops/lora_subgraph.hpp | 10 ++ .../src/ov_ops/lora_subgraph.cpp | 3 +- .../lora_subgraph_fusion.cpp | 36 +++--- .../lora_subgraph_fusion.cpp | 119 ++++++++++++------ 4 files changed, 108 insertions(+), 60 deletions(-) diff --git a/src/common/transformations/include/ov_ops/lora_subgraph.hpp b/src/common/transformations/include/ov_ops/lora_subgraph.hpp index e4511823e50509..7cdf00bf82ccaf 100644 --- a/src/common/transformations/include/ov_ops/lora_subgraph.hpp +++ b/src/common/transformations/include/ov_ops/lora_subgraph.hpp @@ -11,6 +11,16 @@ namespace ov { namespace op { namespace internal { +/** + * @interface LoraSubgraph + * @brief LoraSubgraph operation, which is used for LoRA subgraphs fusion. + * It always has only 1 output, and the following inputs, whose order is fixed: + * 1. main_flow_input - input from original model. + * 2. LoRA_input - data flow to which all the LoRA modifications are applied. The modified input is combined with `main_flow_input` + * 3. LoRA_matrices - 3 Low-Rank adaptation matrices applied to `LoRA_input`. + * The fused subgraph can be optimized in runtime based on LoRA semantic. + * For instance, `main_flow_input` can be fast-forwarded to output in case of empty `LoRA_matrices`. + */ class TRANSFORMATIONS_API LoraSubgraph : public ov::op::util::SubGraphOp { public: OPENVINO_OP("LoraSubgraph", "ie_internal_opset"); diff --git a/src/common/transformations/src/ov_ops/lora_subgraph.cpp b/src/common/transformations/src/ov_ops/lora_subgraph.cpp index 185dca47719f21..e729682fa6cada 100644 --- a/src/common/transformations/src/ov_ops/lora_subgraph.cpp +++ b/src/common/transformations/src/ov_ops/lora_subgraph.cpp @@ -9,7 +9,6 @@ namespace op { namespace internal { LoraSubgraph::LoraSubgraph(const OutputVector& args, const std::shared_ptr& body) : SubGraphOp(args) { - OPENVINO_ASSERT(args.size() == 5, "LoraSubgraph must have 5 inputs"); SubGraphOp::set_function(body); for (size_t i = 0; i < body->get_parameters().size(); ++i) m_input_descriptions[0].push_back(std::make_shared(i, i)); @@ -24,6 +23,8 @@ std::shared_ptr LoraSubgraph::clone_with_new_inputs(const OutputVector& ne } void LoraSubgraph::validate_and_infer_types() { + OPENVINO_ASSERT(get_input_size() == 5, "LoraSubgraph must have 5 inputs whereas it has ", get_input_size()); + OPENVINO_ASSERT(get_output_size() == 1, "LoraSubgraph must have 1 output whereas it has ", get_output_size()); const auto& body = get_function(); OPENVINO_ASSERT(body, "LoraSubgraph must have initialized body"); validate_and_infer_type_body(body, m_input_descriptions[0]); diff --git a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp index 913ca32f99b06b..1632c9e07e6a51 100644 --- a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp @@ -23,9 +23,9 @@ ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { MATCHER_SCOPE(LoraSubgraphFusion); using namespace pass::pattern; - auto input_m = any_input(); + auto lora_input_m = any_input(); auto transpose_const1_m = wrap_type(consumers_count(1)); - auto transpose1_m = optional({input_m, transpose_const1_m}, consumers_count(1)); + auto transpose1_m = optional({lora_input_m, transpose_const1_m}, consumers_count(1)); auto read_value1_m = wrap_type(); auto matmul1_m = wrap_type({transpose1_m, read_value1_m}, consumers_count(1)); auto read_value2_m = wrap_type(); @@ -34,19 +34,19 @@ ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { auto matmul2_m = wrap_type({multiply_m, read_value3_m}, consumers_count(1)); auto transpose_const2_m = wrap_type(consumers_count(1)); auto transpose2_m = optional({matmul2_m, transpose_const2_m}, consumers_count(1)); - auto external_matmul_m = wrap_type({input_m, any_input()}); - auto add_m = wrap_type({transpose2_m, external_matmul_m}); + auto main_flow_m = wrap_type({lora_input_m, any_input()}); + auto add_m = wrap_type({transpose2_m, main_flow_m}); ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { const auto& pattern_map = m.get_pattern_value_map(); - const auto& input = pattern_map.at(input_m); + const auto& lora_input = pattern_map.at(lora_input_m); const auto& matmul1 = pattern_map.at(matmul1_m); const auto& read_value1 = pattern_map.at(read_value1_m); const auto& multiply = pattern_map.at(multiply_m); const auto& read_value2 = pattern_map.at(read_value2_m); const auto& matmul2 = pattern_map.at(matmul2_m); const auto& read_value3 = pattern_map.at(read_value3_m); - const auto& external_matmul = pattern_map.at(external_matmul_m); + const auto& main_flow = pattern_map.at(main_flow_m); const auto& add = pattern_map.at(add_m); const auto add_node = add.get_node_shared_ptr(); @@ -63,14 +63,22 @@ ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { OPENVINO_THROW("Ops are not connected"); }; - std::vector> internal_inputs{ + // Note: internal_inputs/external_connections order corresponds to LoraSubgraph semantic + const std::vector> internal_inputs{ + find_connected_input(add.get_node(), main_flow.get_node()), + // For commutative eltwise ops, input idx may be any, so it must be computed pattern_map.count(transpose1_m) ? pattern_map.at(transpose1_m).get_node()->input(0) : matmul1.get_node()->input(0), matmul1.get_node()->input(1), - // For commutative eltwise ops, input idx may be any, so it must be computed find_connected_input(multiply.get_node(), read_value2.get_node()), matmul2.get_node()->input(1), - find_connected_input(add.get_node(), external_matmul.get_node()), + }; + const ov::OutputVector external_connections{ + main_flow, + lora_input, + read_value1, + read_value2, + read_value3, }; ov::ParameterVector subgraph_parameters; @@ -83,15 +91,7 @@ ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { // Note: lora consumers should be taken before lora_subgraph creation, // because only original consumers should be replaced with lora's output const auto& lora_consumers = add.get_target_inputs(); - auto lora_subgraph = std::make_shared(ov::OutputVector{add}, subgraph_parameters); - - ov::OutputVector external_connections{ - input, - read_value1, - read_value2, - read_value3, - external_matmul, - }; + const auto lora_subgraph = std::make_shared(ov::OutputVector{add}, subgraph_parameters); const auto lora_node = std::make_shared(external_connections, lora_subgraph); ov::copy_runtime_info(m.get_matched_nodes(), lora_node); lora_node->set_friendly_name(add_node->get_friendly_name()); diff --git a/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp index a230e83a194450..6557f763c6b368 100644 --- a/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp +++ b/src/common/transformations/tests/common_optimizations/lora_subgraph_fusion.cpp @@ -9,7 +9,6 @@ #include #include "common_test_utils/node_builders/convolution.hpp" -#include "common_test_utils/node_builders/eltwise.hpp" #include "common_test_utils/ov_test_utils.hpp" #include "openvino/core/model.hpp" #include "openvino/opsets/opset15.hpp" @@ -26,7 +25,8 @@ std::pair create_states(const std::vector(ov::op::util::VariableInfo{shape, netType, std::to_string(idx++)}); + auto variable = + std::make_shared(ov::op::util::VariableInfo{shape, netType, std::to_string(idx++)}); auto read_value = std::make_shared(variable); auto assign = std::make_shared(read_value, variable); read_values.push_back(read_value); @@ -37,8 +37,8 @@ std::pair create_states(const std::vector create_lora_subgraph(const ov::Output& lora_input, - const ov::Output& data_flow, +std::shared_ptr create_lora_subgraph(const ov::Output& main_flow, + const ov::Output& lora_input, const ov::OutputVector& states, bool add_transposes, size_t mul_read_value_idx = 1, @@ -62,13 +62,20 @@ std::shared_ptr create_lora_subgraph(const ov::Output& lora_ auto mm2 = std::make_shared(mul, states[2], false, true); const auto& add_sec_input = add_transposes ? create_transpose(mm2) : mm2; - const auto& add_in_0 = add_data_flow_idx == 0 ? data_flow : add_sec_input; - const auto& add_in_1 = add_data_flow_idx == 0 ? add_sec_input : data_flow; + const auto& add_in_0 = add_data_flow_idx == 0 ? main_flow : add_sec_input; + const auto& add_in_1 = add_data_flow_idx == 0 ? add_sec_input : main_flow; return std::make_shared(add_in_0, add_in_1); } class LoraSubgraphFusionTests : public TransformationTestsF { public: + LoraSubgraphFusionTests() : TransformationTestsF() { + // TODO: remove when these flags will be enabled in TransformationTestsF (ticket XXX-98039) + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); + } + void SetUp() override { TransformationTestsF::SetUp(); manager.register_pass(); @@ -86,70 +93,87 @@ class LoraSubgraphFusionMatMulTests : public LoraSubgraphFusionTests { ov::PartialShape shape_state_3 = {N, -1}; }; - TEST_F(LoraSubgraphFusionMatMulTests, StandardPattern) { { - auto param_y = std::make_shared(netType, shape_x); + auto param_lora = std::make_shared(netType, shape_x); auto param_w = std::make_shared(netType, shape_w); - auto main_mm = std::make_shared(param_y, param_w, false, true); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); - auto lora_subgraph = create_lora_subgraph(param_y, main_mm, states.first, false); - model = std::make_shared(OutputVector{lora_subgraph, main_mm}, states.second, ParameterVector{param_y, param_w}); + auto lora_subgraph = create_lora_subgraph(main_mm, param_lora, states.first, false); + lora_subgraph->set_friendly_name("lora_subgraph"); + model = std::make_shared(OutputVector{lora_subgraph, main_mm}, + states.second, + ParameterVector{param_lora, param_w}); } { - auto param_y = std::make_shared(netType, shape_x); + auto param_lora = std::make_shared(netType, shape_x); auto param_w = std::make_shared(netType, shape_w); - auto main_mm = std::make_shared(param_y, param_w, false, true); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); - auto inner_param_y = std::make_shared(netType, shape_x); + auto inner_param_lora = std::make_shared(netType, shape_x); auto inner_state_1 = std::make_shared(netType, shape_state_1); auto inner_state_2 = std::make_shared(netType, shape_state_2); auto inner_state_3 = std::make_shared(netType, shape_state_3); auto inner_param_mm = std::make_shared(netType, main_mm->get_output_partial_shape(0)); ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; - auto lora_subgraph = create_lora_subgraph(inner_param_y, inner_param_mm, states_outs, false); - ov::ParameterVector inner_params{inner_param_y, inner_state_1, inner_state_2, inner_state_3, inner_param_mm}; + auto lora_subgraph = create_lora_subgraph(inner_param_mm, inner_param_lora, states_outs, false); + lora_subgraph->set_friendly_name("lora_subgraph"); + ov::ParameterVector inner_params{inner_param_mm, inner_param_lora, inner_state_1, inner_state_2, inner_state_3}; auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); - ov::OutputVector lora_inputs{param_y, states.first[0], states.first[1], states.first[2], main_mm}; + ov::OutputVector lora_inputs{main_mm, param_lora, states.first[0], states.first[1], states.first[2]}; auto lora = std::make_shared(lora_inputs, inner_model); + lora->set_friendly_name("lora_subgraph"); - model_ref = std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_y, param_w}); + model_ref = + std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_lora, param_w}); } } TEST_F(LoraSubgraphFusionMatMulTests, ReshaffledEltwiseInputs) { { - auto param_y = std::make_shared(netType, shape_x); + auto param_lora = std::make_shared(netType, shape_x); auto param_w = std::make_shared(netType, shape_w); - auto main_mm = std::make_shared(param_y, param_w, false, true); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); + auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); - auto lora_subgraph = create_lora_subgraph(param_y, main_mm, states.first, false, 0, 1); - model = std::make_shared(OutputVector{lora_subgraph, main_mm}, states.second, ParameterVector{param_y, param_w}); + auto lora_subgraph = create_lora_subgraph(main_mm, param_lora, states.first, false, 0, 1); + lora_subgraph->set_friendly_name("lora_subgraph"); + + model = std::make_shared(OutputVector{lora_subgraph, main_mm}, + states.second, + ParameterVector{param_lora, param_w}); } { - auto param_y = std::make_shared(netType, shape_x); + auto param_lora = std::make_shared(netType, shape_x); auto param_w = std::make_shared(netType, shape_w); - auto main_mm = std::make_shared(param_y, param_w, false, true); + auto main_mm = std::make_shared(param_lora, param_w, false, true); + main_mm->set_friendly_name("main_mm"); - auto inner_param_y = std::make_shared(netType, shape_x); + auto inner_param_lora = std::make_shared(netType, shape_x); auto inner_state_1 = std::make_shared(netType, shape_state_1); auto inner_state_2 = std::make_shared(netType, shape_state_2); auto inner_state_3 = std::make_shared(netType, shape_state_3); auto inner_param_mm = std::make_shared(netType, main_mm->get_output_partial_shape(0)); ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; - auto lora_subgraph = create_lora_subgraph(inner_param_y, inner_param_mm, states_outs, false, 0, 1); - ov::ParameterVector inner_params{inner_param_y, inner_state_1, inner_state_2, inner_state_3, inner_param_mm}; + auto lora_subgraph = create_lora_subgraph(inner_param_mm, inner_param_lora, states_outs, false, 0, 1); + lora_subgraph->set_friendly_name("lora_subgraph"); + ov::ParameterVector inner_params{inner_param_mm, inner_param_lora, inner_state_1, inner_state_2, inner_state_3}; auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); - ov::OutputVector lora_inputs{param_y, states.first[0], states.first[1], states.first[2], main_mm}; + ov::OutputVector lora_inputs{main_mm, param_lora, states.first[0], states.first[1], states.first[2]}; auto lora = std::make_shared(lora_inputs, inner_model); + lora->set_friendly_name("lora_subgraph"); - model_ref = std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_y, param_w}); + model_ref = + std::make_shared(OutputVector{lora, main_mm}, states.second, ParameterVector{param_lora, param_w}); } } @@ -164,8 +188,8 @@ class LoraSubgraphFusionConvolutionTests : public LoraSubgraphFusionTests { TEST_F(LoraSubgraphFusionConvolutionTests, StandardPattern) { { - auto param_y = std::make_shared(netType, shape_x); - auto main_conv = ov::test::utils::make_convolution(param_y, + auto param_lora = std::make_shared(netType, shape_x); + auto main_conv = ov::test::utils::make_convolution(param_lora, netType, {1, 1}, {1, 1}, @@ -174,13 +198,16 @@ TEST_F(LoraSubgraphFusionConvolutionTests, StandardPattern) { {1, 1}, ov::op::PadType::EXPLICIT, num_channels.get_length()); + main_conv->set_friendly_name("main_conv"); auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); - auto lora_subgraph = create_lora_subgraph(param_y, main_conv, states.first, true); - model = std::make_shared(OutputVector{lora_subgraph, main_conv}, states.second, ParameterVector{param_y}); + auto lora_subgraph = create_lora_subgraph(main_conv, param_lora, states.first, true); + lora_subgraph->set_friendly_name("lora_subgraph"); + model = + std::make_shared(OutputVector{lora_subgraph, main_conv}, states.second, ParameterVector{param_lora}); } { - auto param_y = std::make_shared(netType, shape_x); - auto main_conv = ov::test::utils::make_convolution(param_y, + auto param_lora = std::make_shared(netType, shape_x); + auto main_conv = ov::test::utils::make_convolution(param_lora, netType, {1, 1}, {1, 1}, @@ -189,20 +216,30 @@ TEST_F(LoraSubgraphFusionConvolutionTests, StandardPattern) { {1, 1}, ov::op::PadType::EXPLICIT, num_channels.get_length()); + main_conv->set_friendly_name("main_conv"); - auto inner_param_y = std::make_shared(netType, shape_x); + auto inner_param_lora = std::make_shared(netType, shape_x); auto inner_state_1 = std::make_shared(netType, shape_state_1); auto inner_state_2 = std::make_shared(netType, shape_state_2); auto inner_state_3 = std::make_shared(netType, shape_state_3); - auto inner_param_mm = std::make_shared(netType, main_conv->get_output_partial_shape(0)); - auto lora_subgraph = create_lora_subgraph(inner_param_y, inner_param_mm, ov::OutputVector{inner_state_1, inner_state_2, inner_state_3}, true); - ov::ParameterVector inner_params{inner_param_y, inner_state_1, inner_state_2, inner_state_3, inner_param_mm}; + auto inner_param_conv = + std::make_shared(netType, main_conv->get_output_partial_shape(0)); + + ov::OutputVector states_outs{inner_state_1, inner_state_2, inner_state_3}; + auto lora_subgraph = create_lora_subgraph(inner_param_conv, inner_param_lora, states_outs, true); + lora_subgraph->set_friendly_name("lora_subgraph"); + ov::ParameterVector inner_params{inner_param_conv, + inner_param_lora, + inner_state_1, + inner_state_2, + inner_state_3}; auto inner_model = std::make_shared(OutputVector{lora_subgraph}, inner_params); auto states = create_states({shape_state_1, shape_state_2, shape_state_3}); - ov::OutputVector lora_inputs{param_y, states.first[0], states.first[1], states.first[2], main_conv}; + ov::OutputVector lora_inputs{main_conv, param_lora, states.first[0], states.first[1], states.first[2]}; auto lora = std::make_shared(lora_inputs, inner_model); + lora->set_friendly_name("lora_subgraph"); - model_ref = std::make_shared(OutputVector{lora, main_conv}, states.second, ParameterVector{param_y}); + model_ref = std::make_shared(OutputVector{lora, main_conv}, states.second, ParameterVector{param_lora}); } } From 82987069776787ef653929e584a127866fb26431 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 16 Oct 2024 13:18:37 +0200 Subject: [PATCH 3/5] Code style --- src/common/transformations/include/ov_ops/lora_subgraph.hpp | 6 +++--- .../common_optimizations/lora_subgraph_fusion.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/common/transformations/include/ov_ops/lora_subgraph.hpp b/src/common/transformations/include/ov_ops/lora_subgraph.hpp index 7cdf00bf82ccaf..5f16700e92eef8 100644 --- a/src/common/transformations/include/ov_ops/lora_subgraph.hpp +++ b/src/common/transformations/include/ov_ops/lora_subgraph.hpp @@ -15,9 +15,9 @@ namespace internal { * @interface LoraSubgraph * @brief LoraSubgraph operation, which is used for LoRA subgraphs fusion. * It always has only 1 output, and the following inputs, whose order is fixed: - * 1. main_flow_input - input from original model. - * 2. LoRA_input - data flow to which all the LoRA modifications are applied. The modified input is combined with `main_flow_input` - * 3. LoRA_matrices - 3 Low-Rank adaptation matrices applied to `LoRA_input`. + * 1. main_flow_input: input from original model. + * 2. LoRA_input: input to which the Low-Rank adaptation is applied. The adapted input is combined with `main_flow_input`. + * 3. LoRA_matrices: 3 Low-Rank adaptation matrices applied to `LoRA_input`. * The fused subgraph can be optimized in runtime based on LoRA semantic. * For instance, `main_flow_input` can be fast-forwarded to output in case of empty `LoRA_matrices`. */ diff --git a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp index 1632c9e07e6a51..a0da48a5b16241 100644 --- a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp @@ -83,10 +83,10 @@ ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { ov::ParameterVector subgraph_parameters; subgraph_parameters.reserve(internal_inputs.size()); - for (auto& input : internal_inputs) { - const auto new_parameter = std::make_shared(input.get_element_type(), input.get_partial_shape()); + for (auto& in : internal_inputs) { + auto new_parameter = std::make_shared(in.get_element_type(), in.get_partial_shape()); subgraph_parameters.push_back(new_parameter); - input.replace_source_output(new_parameter); + in.replace_source_output(new_parameter); } // Note: lora consumers should be taken before lora_subgraph creation, // because only original consumers should be replaced with lora's output From 22f795ae33642176547e45d082459a04ebeaa84e Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 16 Oct 2024 13:31:53 +0200 Subject: [PATCH 4/5] Minor corrections --- src/common/transformations/include/ov_ops/lora_subgraph.hpp | 3 ++- src/common/transformations/src/ov_ops/lora_subgraph.cpp | 4 ++++ .../common_optimizations/lora_subgraph_fusion.cpp | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/common/transformations/include/ov_ops/lora_subgraph.hpp b/src/common/transformations/include/ov_ops/lora_subgraph.hpp index 5f16700e92eef8..ecfc330750626c 100644 --- a/src/common/transformations/include/ov_ops/lora_subgraph.hpp +++ b/src/common/transformations/include/ov_ops/lora_subgraph.hpp @@ -16,7 +16,8 @@ namespace internal { * @brief LoraSubgraph operation, which is used for LoRA subgraphs fusion. * It always has only 1 output, and the following inputs, whose order is fixed: * 1. main_flow_input: input from original model. - * 2. LoRA_input: input to which the Low-Rank adaptation is applied. The adapted input is combined with `main_flow_input`. + * 2. LoRA_input: input to which the Low-Rank adaptation is applied. + * The adapted input is combined with `main_flow_input`. * 3. LoRA_matrices: 3 Low-Rank adaptation matrices applied to `LoRA_input`. * The fused subgraph can be optimized in runtime based on LoRA semantic. * For instance, `main_flow_input` can be fast-forwarded to output in case of empty `LoRA_matrices`. diff --git a/src/common/transformations/src/ov_ops/lora_subgraph.cpp b/src/common/transformations/src/ov_ops/lora_subgraph.cpp index e729682fa6cada..8a7a5a75c69c7e 100644 --- a/src/common/transformations/src/ov_ops/lora_subgraph.cpp +++ b/src/common/transformations/src/ov_ops/lora_subgraph.cpp @@ -4,6 +4,8 @@ #include "ov_ops/lora_subgraph.hpp" +#include "itt.hpp" + namespace ov { namespace op { namespace internal { @@ -18,11 +20,13 @@ LoraSubgraph::LoraSubgraph(const OutputVector& args, const std::shared_ptr LoraSubgraph::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(internal_LoraSubgraph_clone_with_new_inputs); check_new_args_count(this, new_args); return std::make_shared(new_args, get_function()->clone()); } void LoraSubgraph::validate_and_infer_types() { + INTERNAL_OP_SCOPE(internal_LoraSubgraph_validate_and_infer_types); OPENVINO_ASSERT(get_input_size() == 5, "LoraSubgraph must have 5 inputs whereas it has ", get_input_size()); OPENVINO_ASSERT(get_output_size() == 1, "LoraSubgraph must have 1 output whereas it has ", get_output_size()); const auto& body = get_function(); diff --git a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp index a0da48a5b16241..366ce00894242e 100644 --- a/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/lora_subgraph_fusion.cpp @@ -65,8 +65,8 @@ ov::pass::LoraSubgraphFusion::LoraSubgraphFusion() { // Note: internal_inputs/external_connections order corresponds to LoraSubgraph semantic const std::vector> internal_inputs{ - find_connected_input(add.get_node(), main_flow.get_node()), // For commutative eltwise ops, input idx may be any, so it must be computed + find_connected_input(add.get_node(), main_flow.get_node()), pattern_map.count(transpose1_m) ? pattern_map.at(transpose1_m).get_node()->input(0) : matmul1.get_node()->input(0), matmul1.get_node()->input(1), From de3782d85916d914771f6b957c95c176aa9c43d6 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 23 Oct 2024 12:24:59 +0200 Subject: [PATCH 5/5] Review comments applied --- src/common/transformations/include/ov_ops/lora_subgraph.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/transformations/include/ov_ops/lora_subgraph.hpp b/src/common/transformations/include/ov_ops/lora_subgraph.hpp index ecfc330750626c..75aaa16a5d280e 100644 --- a/src/common/transformations/include/ov_ops/lora_subgraph.hpp +++ b/src/common/transformations/include/ov_ops/lora_subgraph.hpp @@ -24,7 +24,7 @@ namespace internal { */ class TRANSFORMATIONS_API LoraSubgraph : public ov::op::util::SubGraphOp { public: - OPENVINO_OP("LoraSubgraph", "ie_internal_opset"); + OPENVINO_OP("LoraSubgraph", "ie_internal_opset", ov::op::util::SubGraphOp); LoraSubgraph() = default; LoraSubgraph(const OutputVector& args, const std::shared_ptr& body);