
Commit 926e544

[CPU] Simplify matmul weights transposition logic
1 parent 5cda7b6 commit 926e544

2 files changed: +48 −27 lines


src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp

Lines changed: 35 additions & 21 deletions
@@ -38,7 +38,6 @@
 #include "openvino/core/type/element_type.hpp"
 #include "post_ops.hpp"
 #include "thread_pool_imp.hpp"
-#include "utils/cpu_utils.hpp"
 #include "utils/debug_capabilities.h"
 #include "utils/general_utils.h"

@@ -154,24 +153,44 @@ std::shared_ptr<DnnlMatMulPrimitive> DnnlMatMulPrimitive::create(const MemoryArg
 DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const DnnlMemoryDescPtr& srcDesc,
                                                                       const DnnlMemoryDescPtr& dstDesc,
                                                                       const MatMulAttrs& attrs) {
-    if (!attrs.fcSemantic) {
-        return dstDesc;
-    }
+    OPENVINO_ASSERT(attrs.constantWeights, "DnnlMatmulExecutor: constant weights are expected");
+
+    auto getDims = [](const dnnl::memory::desc& desc, const bool transpose) {
+        auto dims = desc.get_dims();
+        if (transpose) {
+            std::swap(dims[dims.size() - 1], dims[dims.size() - 2]);
+            return dims;
+        }
+
+        return desc.get_dims();
+    };
+
+    auto getFormat = [](const size_t rank, const bool transpose) {
+        switch (rank) {
+        case 2:
+            return transpose ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba;
+        case 3:
+            return transpose ? dnnl::memory::format_tag::abc : dnnl::memory::format_tag::acb;
+        default:
+            OPENVINO_THROW("DnnlMatmulExecutor: unsupported weights rank: ", rank);
+        }
+    };
 
-    const bool weightsNonTransposed = attrs.weightsNonTransposed;
     const auto& weiDesc = srcDesc->getDnnlDesc();
-    auto wDims = weiDesc.get_dims();
-    std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]);
     const auto wDataType = weiDesc.get_data_type();
-    if (wDims.size() == 3 && !weightsNonTransposed) {
-        const auto format3D = dnnl::memory::format_tag::acb;
-        const auto transposed3DWeiDesc = dnnl::memory::desc{wDims, wDataType, format3D};
-        return DnnlExtensionUtils::makeDescriptor(transposed3DWeiDesc);
-    }
-
-    const dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims);
-    const auto format = weightsNonTransposed ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba;
-    const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, wDataType, format};
+    /* in case the second input is constant we can transpose weights beforehand
+     * and avoid transposition during execution */
+    const auto wDims = getDims(weiDesc, attrs.transposeB);
+    /**
+     * We need to transpose weights in two scenarios:
+     * - dnnl matmul is used as FullyConnected executor and weights are not transposed yet (optimization to pack
+     *   weights in one step)
+     * - dnnl matmul is used as MatMul executor with transposeB equal to true. This case is just theoretical since
+     *   currently we always convert MatMul operation with constant second input to FullyConnected operation.
+     */
+    const bool transpose = attrs.fcSemantic ? attrs.weightsNonTransposed : attrs.transposeB;
+    const auto format = getFormat(weiDesc.get_ndims(), transpose);
+    const auto transposedWeiDesc = dnnl::memory::desc{wDims, wDataType, format};
 
     const auto reshapedWeiDesc = transposedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims());
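The new getDims/getFormat helpers lean on oneDNN format tags rather than physically reordering data: a row-major N x K weight buffer can be described as its K x N transpose simply by swapping the two innermost dims and choosing format_tag::ba. Below is a minimal standalone sketch of that idea, not part of this commit; the include path, the 64 x 128 sizes, and the f32 data type are illustrative assumptions based on the oneDNN 3.x C++ API.

#include <iostream>
#include <utility>
#include <oneapi/dnnl/dnnl.hpp>

int main() {
    // Weights stored row-major as [N, K] = [64, 128] (format_tag::ab).
    const dnnl::memory::dims nk = {64, 128};

    // Describe the same buffer as its transpose [K, N]: swap the dims and
    // use format_tag::ba, so the strides still follow the original [N, K]
    // layout and no data is moved.
    dnnl::memory::dims kn = nk;
    std::swap(kn[0], kn[1]);
    const dnnl::memory::desc transposedView{kn,
                                            dnnl::memory::data_type::f32,
                                            dnnl::memory::format_tag::ba};

    const auto strides = transposedView.get_strides();
    std::cout << "logical dims: " << kn[0] << " x " << kn[1]
              << ", strides: " << strides[0] << ", " << strides[1] << '\n';
    // Expected: logical dims 128 x 64 with strides 1, 128.
    return 0;
}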

@@ -424,8 +443,6 @@ static std::pair<VectorDims, VectorDims> makeDummyInputDims(const Shape& in0,
                 } else {
                     inDims1[idx1] = inDims0[idx0];
                 }
-            } else if (inDims0[idx0] != Shape::UNDEFINED_DIM && inDims1[idx1] != Shape::UNDEFINED_DIM) {
-                inDims1[idx1] = inDims0[idx0];
             }
         }
     };
@@ -530,9 +547,6 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const MatM
                                                           attrs.transposeA,
                                                           attrs.transposeB,
                                                           dstDesc->getShape().getRank());
-    if (attrs.fcSemantic && weiDymmyDims.size() == 3) {
-        std::swap(weiDymmyDims[weiDymmyDims.size() - 1], weiDymmyDims[weiDymmyDims.size() - 2]);
-    }
     srcDesc = std::make_shared<DnnlBlockedMemoryDesc>(srcDesc->getPrecision(), Shape(inDymmyDims));
     weiDesc = std::make_shared<DnnlBlockedMemoryDesc>(weiDesc->getPrecision(), Shape(weiDymmyDims));
     dstDesc = std::make_shared<DnnlBlockedMemoryDesc>(dstDesc->getPrecision(), Shape(outDymmyDims));

src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp

Lines changed: 13 additions & 6 deletions
@@ -438,12 +438,19 @@ const std::vector<ExecutorImplementation<FCAttrs>>& getImplementations() {
         [](const FCAttrs& attrs,
            const MemoryArgs& memory,
            const ExecutorContext::CPtr& context) -> ExecutorPtr {
-            MatMulAttrs matMulAttrs{false,
-                                    false};
-            matMulAttrs.postOps = attrs.postOps;
-            matMulAttrs.weightsNonTransposed = attrs.weightsNonTransposed;
-            matMulAttrs.constantWeights = true;
-            matMulAttrs.fcSemantic = true;
+            const bool hasBias = !memory.at(ARG_BIAS)->getDesc().empty();
+            MatMulAttrs matMulAttrs {
+                false,
+                true,
+                hasBias,
+                attrs.weightsNonTransposed,
+                false,
+                true,
+                true,
+                0,
+                {},
+                attrs.postOps
+            };
 
             return std::make_shared<
                 DnnlExecutor<DnnlMatMulPrimitive, MatMulAttrs, DnnlShapeAgnosticData,
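The replacement switches MatMulAttrs from field-by-field assignment to positional aggregate initialization, so each value must line up with the member declaration order of MatMulAttrs. A minimal sketch of that trade-off follows, using a hypothetical, reduced stand-in struct (the field names, order, and defaults below are illustrative, not the real MatMulAttrs); C++20 designated initializers would name each field explicitly while defaulting the rest.

struct MatMulAttrsSketch {      // hypothetical stand-in, not the real MatMulAttrs
    bool transposeA = false;
    bool transposeB = false;
    bool withBias = false;
    bool weightsNonTransposed = false;
    bool constantWeights = false;
    bool fcSemantic = false;
};

int main() {
    const bool hasBias = true;

    // Positional form: readers must know the declaration order to tell
    // which 'true' configures which attribute.
    const MatMulAttrsSketch positional{false, true, hasBias, false, true, true};

    // Designated initializers (C++20): the mapping is explicit and
    // unmentioned members keep their defaults.
    const MatMulAttrsSketch designated{.transposeB = true,
                                       .withBias = hasBias,
                                       .constantWeights = true,
                                       .fcSemantic = true};

    return (positional.fcSemantic && designated.fcSemantic) ? 0 : 1;
}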
