Added Squat Packing's rotate and sum operation and resolved type mismatch issues.

PiperOrigin-RevId: 715833899
google · Jan 16, 2025 · 48f181e · 48f181e
1 parent 3917e51
commit 48f181e
Show file tree

Hide file tree

Showing 11 changed files with 499 additions and 132 deletions.
diff --git a/lib/Dialect/LinAlg/Conversions/LinalgToTensorExt/BUILD b/lib/Dialect/LinAlg/Conversions/LinalgToTensorExt/BUILD
@@ -14,11 +14,15 @@ cc_library(
     deps = [
         ":pass_inc_gen",
         "@heir//lib/Analysis/SecretnessAnalysis",
+        "@heir//lib/Dialect/Secret/IR:Dialect",
         "@heir//lib/Dialect/TensorExt/IR:Dialect",
+        "@heir//lib/Utils",
+        "@heir//lib/Utils:ConversionUtils",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AffineDialect",
         "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgDialect",
         "@llvm-project//mlir:Pass",

diff --git a/lib/Dialect/LinAlg/Conversions/LinalgToTensorExt/LinalgToTensorExt.cpp b/lib/Dialect/LinAlg/Conversions/LinalgToTensorExt/LinalgToTensorExt.cpp
diff --git a/lib/Dialect/LinAlg/Conversions/LinalgToTensorExt/LinalgToTensorExt.td b/lib/Dialect/LinAlg/Conversions/LinalgToTensorExt/LinalgToTensorExt.td
@@ -9,10 +9,24 @@ def LinalgToTensorExt : Pass<"linalg-to-tensor-ext"> {
   let description = [{
     This pass lowers `linalg.matmul` to a mixture of affine, tensor, and
     tensor_ext operations via the Halevi-Shoup and squat matrix multiplication algorithms.
+
+    We assume that the input and output values are replicated. This makes
+    aligning the matrix multiplications easier (though not necessarily optimal).
+    For example, when multiplying a 1x4 vector with a 4x2 matrix, the bias and output
+    would be 1x2 vectors. However, because tensor sizes are required to match, and
+    assuming replication, the matrix is expanded to a 4x4 matrix and the output
+    to a 1x4 vector (in which the 1x2 result is replicated twice).
+
+    For now, the tilingSize is a command line parameter that determines the
+    maximum secret vector size used in the Halevi-Shoup and squat matrix
+    multiplication algorithms. It can be specified via --linalg-to-tensor-ext=tiling-size=16.
   }];
   let dependentDialects = [
     "mlir::heir::tensor_ext::TensorExtDialect",
   ];
+  let options = [
+    Option<"tilingSize", "tiling-size", "int", "16", "tiling size of the halevi-shoup and squat packing matrix multiplication algorithms">
+  ];
 }
 
 #endif  // LIB_DIALECT_LINALG_CONVERSIONS_LINALGTOTENSOREXT_LINALGTOTENSOREXT_TD_
diff --git a/lib/Dialect/TensorExt/IR/TensorExtCanonicalization.td b/lib/Dialect/TensorExt/IR/TensorExtCanonicalization.td
@@ -29,6 +29,9 @@ def DropZeroRotation : Pat<
   [(IsZeroIntAttr $c0)]
 >;
 
+// NOTE: This pattern is known not to work for multi-dimensional tensors; a fix
+// is expected in Asra's PR. It is left enabled for now because commenting it
+// out causes various other tests to fail.
 // rotate %t, x -> rotate %t, x mod size
 def NormalizeRotationIndex : Pat<
   (TensorExt_RotateOp $tensor, (Arith_ConstantOp:$shiftOp APIntAttr:$shiftAmount)),

diff --git a/tests/Dialect/LinAlg/Conversions/linalg_to_tensor_ext/float_small_fc_network.mlir b/tests/Dialect/LinAlg/Conversions/linalg_to_tensor_ext/float_small_fc_network.mlir
@@ -0,0 +1,47 @@
+// This test verifies that a small fully connected network lowers without returning
+// an error.
+// TODO: write a test that verifies the correctness of the lowering.
+
+// RUN: heir-opt %s --linalg-to-tensor-ext=tiling-size=4 --tosa-to-secret-arith --canonicalize | FileCheck %s
+
+// CHECK:        func.func @test_float_small_fc_network(%[[ARG:.*]]: !secret.secret<tensor<1x4xf32>>)
+module {
+func.func @test_float_small_fc_network(%input : !secret.secret<tensor<1x1xf32>>) -> !secret.secret<tensor<1x1xf32>> {
+  %matrix1 = arith.constant dense<[[1.0, 2.0, 3.0, 4.0]]> : tensor<1x4xf32>
+  %bias1 = arith.constant dense<[[5.0, 6.0, 7.0, 8.0]]> : tensor<1x4xf32>
+  %layer1 = secret.generic ins (%input : !secret.secret<tensor<1x1xf32>>) {
+  ^bb0(%converted_input1: tensor<1x1xf32>):
+    %0 = linalg.matmul ins(%converted_input1, %matrix1 : tensor<1x1xf32>, tensor<1x4xf32>) outs(%bias1 : tensor<1x4xf32>) -> tensor<1x4xf32>
+    secret.yield %0 : tensor<1x4xf32>
+  } -> !secret.secret<tensor<1x4xf32>>
+
+  %activation_layer1 = secret.generic ins (%layer1 : !secret.secret<tensor<1x4xf32>>) {
+  ^bb0(%converted_activation_layer_vec1: tensor<1x4xf32>):
+    %0 = tosa.sigmoid %converted_activation_layer_vec1 : (tensor<1x4xf32>) -> tensor<1x4xf32>
+    secret.yield %0 : tensor<1x4xf32>
+  } -> !secret.secret<tensor<1x4xf32>>
+
+  %matrix2 = arith.constant dense<[[10.0, 20.0, 30.0, 40.0], [50.0, 60.0, 70.0, 80.0], [90.0, 100.0, 110.0, 120.0], [130.0, 140.0, 150.0, 160.0]]> : tensor<4x4xf32>
+  %bias2 = arith.constant dense<[[170.0, 180.0, 190.0, 200.0]]> : tensor<1x4xf32>
+  %layer2 = secret.generic ins (%layer1 : !secret.secret<tensor<1x4xf32>>) {
+  ^bb0(%converted_vec2: tensor<1x4xf32>):
+    %1 = linalg.matmul ins(%converted_vec2, %matrix2 : tensor<1x4xf32>, tensor<4x4xf32>) outs(%bias2 : tensor<1x4xf32>) -> tensor<1x4xf32>
+    secret.yield %1 : tensor<1x4xf32>
+  } -> !secret.secret<tensor<1x4xf32>>
+
+  %activation_layer2 = secret.generic ins (%layer2 : !secret.secret<tensor<1x4xf32>>) {
+  ^bb0(%converted_activation_layer_vec2: tensor<1x4xf32>):
+    %0 = tosa.sigmoid %converted_activation_layer_vec2 : (tensor<1x4xf32>) -> tensor<1x4xf32>
+    secret.yield %0 : tensor<1x4xf32>
+  } -> !secret.secret<tensor<1x4xf32>>
+
+  %matrix3 = arith.constant dense<[[100.0], [200.0], [300.0], [400.0]]> : tensor<4x1xf32>
+  %bias3 = arith.constant dense<[[500.0]]> : tensor<1x1xf32>
+  %layer3 = secret.generic ins (%activation_layer2 : !secret.secret<tensor<1x4xf32>>) {
+  ^bb0(%converted_vec3: tensor<1x4xf32>):
+    %0 = linalg.matmul ins(%converted_vec3, %matrix3 : tensor<1x4xf32>, tensor<4x1xf32>) outs(%bias3 : tensor<1x1xf32>) -> tensor<1x1xf32>
+    secret.yield %0 : tensor<1x1xf32>
+  } -> !secret.secret<tensor<1x1xf32>>
+  return %layer3 : !secret.secret<tensor<1x1xf32>>
+}
+}
diff --git a/tests/Dialect/LinAlg/Conversions/linalg_to_tensor_ext/float_vector_small_matrix_matmul.mlir b/tests/Dialect/LinAlg/Conversions/linalg_to_tensor_ext/float_vector_small_matrix_matmul.mlir
@@ -0,0 +1,33 @@
+// RUN: heir-opt %s --linalg-to-tensor-ext=tiling-size=4 --canonicalize | FileCheck %s
+
+// CHECK:      func.func @test_float_vector_small_matrix_matmul(%[[ARG:.*]]: !secret.secret<tensor<1x4xf32>>)
+// CHECK-DAG:   %[[TWO:.*]] = arith.constant 2 : index
+// CHECK-DAG:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[BIAS:.*]] = arith.constant dense<5.{{0*}}e+00> : tensor<1x4xf32>
+// CHECK-DAG:   %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
+// CHECK-SAME{LITERAL}: <[[
+// CHECK-SAME: 1.{{0*}}e+00, 2.{{0*}}e+00, 3.{{0*}}e+00, 4.{{0*}}e+00], [2.{{0*}}e+00, 3.{{0*}}e+00, 4.{{0*}}e+00, 1.{{0*}}e+00], [3.{{0*}}e+00, 4.{{0*}}e+00, 1.{{0*}}e+00, 2.{{0*}}e+00], [4.{{0*}}e+00, 1.{{0*}}e+00, 2.{{0*}}e+00, 3.{{0*}}e+00
+// CHECK-SAME{LITERAL}: ]]>
+// CHECK-DAG:   %[[SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][3, 0] [1, 4] [1, 1]
+// CHECK:       %[[OUT:.*]] = secret.generic ins(%[[ARG]] : !secret.secret<tensor<1x4xf32>>)
+// CHECK:       ^body(%[[ARG_CONVERTED:.*]]: tensor<1x4xf32>):
+// CHECK:         %[[MUL:.*]] = arith.mulf %[[ARG_CONVERTED]], %[[SLICE]]
+// CHECK:         %[[SUM:.*]] = arith.addf %[[MUL]], %[[BIAS]]
+// CHECK:         %[[ROTATE1:.*]] = tensor_ext.rotate %[[SUM]], %[[TWO]]
+// CHECK:         %[[ROTATE_AND_SUM_1:.*]] = arith.addf %[[SUM]], %[[ROTATE1]]
+// CHECK:         %[[ROTATE2:.*]] = tensor_ext.rotate %[[ROTATE_AND_SUM_1]], %[[ONE]]
+// CHECK:         %[[FINAL_SUM:.*]] = arith.addf %[[ROTATE_AND_SUM_1]], %[[ROTATE2]]
+// CHECK:         secret.yield %[[FINAL_SUM]]
+// CHECK:       return %[[OUT]]
+module {
+func.func @test_float_vector_small_matrix_matmul(%vec : !secret.secret<tensor<1x4xf32>>) -> !secret.secret<tensor<1x1xf32>> {
+  %matrix = arith.constant dense<[[1.0], [2.0], [3.0], [4.0]]> : tensor<4x1xf32>
+  %bias = arith.constant dense<[[5.0]]> : tensor<1x1xf32>
+  %out = secret.generic ins (%vec : !secret.secret<tensor<1x4xf32>>) {
+  ^bb0(%converted_vec: tensor<1x4xf32>):
+    %0 = linalg.matmul ins(%converted_vec, %matrix : tensor<1x4xf32>, tensor<4x1xf32>) outs(%bias : tensor<1x1xf32>) -> tensor<1x1xf32>
+    secret.yield %0 : tensor<1x1xf32>
+  } -> !secret.secret<tensor<1x1xf32>>
+  return %out : !secret.secret<tensor<1x1xf32>>
+}
+}
diff --git a/...Dialect/LinAlg/Conversions/linalg_to_tensor_ext/float_vector_square_matrix_matmul_op.mlir b/...Dialect/LinAlg/Conversions/linalg_to_tensor_ext/float_vector_square_matrix_matmul_op.mlir
@@ -1,15 +1,16 @@
-// RUN: heir-opt %s --linalg-to-tensor-ext | FileCheck %s
+// RUN: heir-opt %s --linalg-to-tensor-ext=tiling-size=4 --canonicalize | FileCheck %s
 
-// CHECK:      func.func @test_float_vector_square_matrix_linalg_to_arith(%[[ARG:.*]]: !secret.secret<tensor<1x4xf16>>)
-// CHECK-DAG:      %[[ONE:.*]] = arith.constant 1 : index
-// CHECK:      %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
+// CHECK:       func.func @test_float_vector_square_matrix_matmul(%[[ARG:.*]]: !secret.secret<tensor<1x4xf16>>)
+// CHECK-DAG:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
 // CHECK-SAME{LITERAL}: <[[
 // CHECK-SAME: 1.{{0*}}e+00, 6.{{0*}}e+00, 1.1{{0*}}e+01, 1.6{{0*}}e+01], [5.{{0*}}e+00, 1.{{0*}}e+01, 1.5{{0*}}e+01, 4.{{0*}}e+00], [9.{{0*}}e+00, 1.4{{0*}}e+01, 3.{{0*}}e+00, 8.{{0*}}e+00], [1.3{{0*}}e+01, 2.{{0*}}e+00, 7.{{0*}}e+00, 1.2{{0*}}e+01
 // CHECK-SAME{LITERAL}: ]]>
-// CHECK:     %[[BIAS:.*]] = arith.constant dense
+// CHECK-DAG:   %[[BIAS:.*]] = arith.constant dense
 // CHECK-SAME{LITERAL}: <[[
 // CHECK-SAME: 1.7{{0*}}e+01, 1.8{{0*}}e+01, 1.9{{0*}}e+01, 2.{{0*}}e+01
 // CHECK-SAME{LITERAL}: ]]>
+// CHECK:      %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][3, 0] [1, 4] [1, 1]
 // CHECK:      %[[OUT:.*]] = secret.generic ins(%[[ARG]] : !secret.secret<tensor<1x4xf16>>)
 // CHECK:      ^body(%[[ARG_CONVERTED:.*]]: tensor<1x4xf16>):
 // CHECK:        %[[FOR_LOOP_OUT:.*]]:2 = affine.for %[[I:.*]] = 0 to 3 iter_args(%[[RUNNING_SUM:.*]] = %[[BIAS]], %[[ROTATED_VEC:.*]] = %[[ARG_CONVERTED]])
@@ -18,13 +19,12 @@
 // CHECK:        %[[UPDATED_SUM:.*]] = arith.addf %[[RUNNING_SUM]], %[[MUL]]
 // CHECK:        %[[UPDATED_ROTATED_VEC:.*]] = tensor_ext.rotate %[[ROTATED_VEC]], %[[ONE]]
 // CHECK:        affine.yield %[[UPDATED_SUM]], %[[UPDATED_ROTATED_VEC]]
-// CHECK:      %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][3, 0] [1, 4] [1, 1]
 // CHECK:      %[[LAST_MUL:.*]] = arith.mulf %[[FOR_LOOP_OUT]]#1, %[[LAST_SLICE]]
 // CHECK:      %[[FINAL_SUM:.*]] = arith.addf %[[FOR_LOOP_OUT]]#0, %[[LAST_MUL]]
 // CHECK:      secret.yield %[[FINAL_SUM]]
 // CHECK:      return %[[OUT]]
 module {
-func.func @test_float_vector_square_matrix_linalg_to_arith(%vec : !secret.secret<tensor<1x4xf16>>) -> !secret.secret<tensor<1x4xf16>> {
+func.func @test_float_vector_square_matrix_matmul(%vec : !secret.secret<tensor<1x4xf16>>) -> !secret.secret<tensor<1x4xf16>> {
   %matrix = arith.constant dense<[[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0], [13.0, 14.0, 15.0, 16.0]]> : tensor<4x4xf16>
   %bias = arith.constant dense<[[17.0, 18.0, 19.0, 20.0]]> : tensor<1x4xf16>
   %out = secret.generic ins (%vec : !secret.secret<tensor<1x4xf16>>) {

diff --git a/...Dialect/LinAlg/Conversions/linalg_to_tensor_ext/integer_rect_matrix_vector_matmul_op.mlir b/...Dialect/LinAlg/Conversions/linalg_to_tensor_ext/integer_rect_matrix_vector_matmul_op.mlir
@@ -0,0 +1,36 @@
+// RUN: heir-opt %s --linalg-to-tensor-ext=tiling-size=4 --canonicalize | FileCheck %s
+
+// CHECK:       func.func @test_integer_rect_matrix_vector_matmul(%[[ARG:.*]]: !secret.secret<tensor<4x1xi16>>)
+// CHECK-DAG:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[TWO:.*]] = arith.constant 2 : index
+// CHECK-DAG:   %[[BIAS:.*]] = arith.constant dense
+// CHECK-SAME{LITERAL}: <[[17], [18], [17], [18]]> : tensor<4x1xi16>
+// CHECK-DAG:   %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
+// CHECK-SAME{LITERAL}: <[[1, 2, 3, 4], [6, 7, 8, 5], [3, 4, 1, 2], [8, 5, 6, 7]]> : tensor<4x4xi16>
+// CHECK-DAG:   %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][0, 1] [4, 1] [1, 1]
+// CHECK:       %[[OUT:.*]] = secret.generic ins(%[[ARG]] : !secret.secret<tensor<4x1xi16>>)
+// CHECK:       ^body(%[[ARG_CONVERTED:.*]]: tensor<4x1xi16>):
+// CHECK:         %[[FOR_LOOP_OUT:.*]]:2 = affine.for %[[I:.*]] = 0 to 1 iter_args(%[[RUNNING_SUM:.*]] = %[[BIAS]], %[[ROTATED_VEC:.*]] = %[[ARG_CONVERTED]])
+// CHECK:         %[[SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][0, %[[I]]] [4, 1] [1, 1]
+// CHECK:         %[[MUL:.*]] = arith.muli %[[ROTATED_VEC]], %[[SLICE]]
+// CHECK:         %[[UPDATED_SUM:.*]] = arith.addi %[[RUNNING_SUM]], %[[MUL]]
+// CHECK:         %[[UPDATED_ROTATED_VEC:.*]] = tensor_ext.rotate %[[ROTATED_VEC]], %[[ONE]]
+// CHECK:         affine.yield %[[UPDATED_SUM]], %[[UPDATED_ROTATED_VEC]]
+// CHECK:       %[[LAST_MUL:.*]] = arith.muli %[[FOR_LOOP_OUT]]#1, %[[LAST_SLICE]]
+// CHECK:       %[[BEFORE_ROTATE_AND_SUM:.*]] = arith.addi %[[FOR_LOOP_OUT]]#0, %[[LAST_MUL]]
+// CHECK:       %[[ROTATED_SUM:.*]] = tensor_ext.rotate %[[BEFORE_ROTATE_AND_SUM]], %[[TWO]]
+// CHECK:       %[[FINAL_SUM:.*]] = arith.addi %[[BEFORE_ROTATE_AND_SUM]], %[[ROTATED_SUM]]
+// CHECK:       secret.yield %[[FINAL_SUM]]
+// CHECK:       return %[[OUT]]
+module {
+func.func @test_integer_rect_matrix_vector_matmul(%vec : !secret.secret<tensor<4x1xi16>>) -> !secret.secret<tensor<2x1xi16>> {
+  %matrix = arith.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8]]> : tensor<2x4xi16>
+  %bias = arith.constant dense<[[17], [18]]> : tensor<2x1xi16>
+  %out = secret.generic ins (%vec : !secret.secret<tensor<4x1xi16>>) {
+  ^bb0(%converted_vec: tensor<4x1xi16>):
+    %0 = linalg.matmul ins(%matrix, %converted_vec : tensor<2x4xi16>, tensor<4x1xi16>) outs(%bias : tensor<2x1xi16>) -> tensor<2x1xi16>
+    secret.yield %0 : tensor<2x1xi16>
+  } -> !secret.secret<tensor<2x1xi16>>
+  return %out : !secret.secret<tensor<2x1xi16>>
+}
+}
diff --git a/...ialect/LinAlg/Conversions/linalg_to_tensor_ext/integer_small_vector_matrix_matmul_op.mlir b/...ialect/LinAlg/Conversions/linalg_to_tensor_ext/integer_small_vector_matrix_matmul_op.mlir
@@ -0,0 +1,33 @@
+// RUN: heir-opt %s --linalg-to-tensor-ext=tiling-size=4 --canonicalize | FileCheck %s
+
+// CHECK:       func.func @test_integer_square_matrix_vector_matmul(%[[ARG:.*]]: !secret.secret<tensor<4x1xi16>>)
+// CHECK-DAG:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
+// CHECK-SAME{LITERAL}: <[[1, 2, 3, 4], [6, 7, 8, 5], [11, 12, 9, 10], [16, 13, 14, 15]]> : tensor<4x4xi16>
+// CHECK-DAG:   %[[BIAS:.*]] = arith.constant dense
+// CHECK-SAME{LITERAL}: <[[17], [18], [19], [20]]> : tensor<4x1xi16>
+// CHECK-DAG:   %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][0, 3] [4, 1] [1, 1]
+// CHECK:       %[[OUT:.*]] = secret.generic ins(%[[ARG]] : !secret.secret<tensor<4x1xi16>>)
+// CHECK:       ^body(%[[ARG_CONVERTED:.*]]: tensor<4x1xi16>):
+// CHECK:        %[[FOR_LOOP_OUT:.*]]:2 = affine.for %[[I:.*]] = 0 to 3 iter_args(%[[RUNNING_SUM:.*]] = %[[BIAS]], %[[ROTATED_VEC:.*]] = %[[ARG_CONVERTED]])
+// CHECK:        %[[SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][0, %[[I]]] [4, 1] [1, 1]
+// CHECK:        %[[MUL:.*]] = arith.muli %[[ROTATED_VEC]], %[[SLICE]]
+// CHECK:        %[[UPDATED_SUM:.*]] = arith.addi %[[RUNNING_SUM]], %[[MUL]]
+// CHECK:        %[[UPDATED_ROTATED_VEC:.*]] = tensor_ext.rotate %[[ROTATED_VEC]], %[[ONE]]
+// CHECK:        affine.yield %[[UPDATED_SUM]], %[[UPDATED_ROTATED_VEC]]
+// CHECK:      %[[LAST_MUL:.*]] = arith.muli %[[FOR_LOOP_OUT]]#1, %[[LAST_SLICE]]
+// CHECK:      %[[FINAL_SUM:.*]] = arith.addi %[[FOR_LOOP_OUT]]#0, %[[LAST_MUL]]
+// CHECK:      secret.yield %[[FINAL_SUM]]
+// CHECK:      return %[[OUT]]
+module {
+func.func @test_integer_square_matrix_vector_matmul(%vec : !secret.secret<tensor<4x1xi16>>) -> !secret.secret<tensor<4x1xi16>> {
+  %matrix = arith.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]> : tensor<4x4xi16>
+  %bias = arith.constant dense<[[17], [18], [19], [20]]> : tensor<4x1xi16>
+  %out = secret.generic ins (%vec : !secret.secret<tensor<4x1xi16>>) {
+  ^bb0(%converted_vec: tensor<4x1xi16>):
+    %0 = linalg.matmul ins(%matrix, %converted_vec : tensor<4x4xi16>, tensor<4x1xi16>) outs(%bias : tensor<4x1xi16>) -> tensor<4x1xi16>
+    secret.yield %0 : tensor<4x1xi16>
+  } -> !secret.secret<tensor<4x1xi16>>
+  return %out : !secret.secret<tensor<4x1xi16>>
+}
+}
diff --git a/...alect/LinAlg/Conversions/linalg_to_tensor_ext/integer_square_matrix_vector_matmul_op.mlir b/...alect/LinAlg/Conversions/linalg_to_tensor_ext/integer_square_matrix_vector_matmul_op.mlir
@@ -1,11 +1,12 @@
-// RUN: heir-opt %s --linalg-to-tensor-ext | FileCheck %s
+// RUN: heir-opt %s --linalg-to-tensor-ext=tiling-size=4 --canonicalize | FileCheck %s
 
-// CHECK:      func.func @test_integer_square_matrix_vector_linalg_to_arith(%[[ARG:.*]]: !secret.secret<tensor<4x1xi16>>)
+// CHECK:      func.func @test_integer_vector_square_matrix_matmul(%[[ARG:.*]]: !secret.secret<tensor<4x1xi16>>)
 // CHECK-DAG:  %[[ONE:.*]] = arith.constant 1 : index
 // CHECK:      %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
 // CHECK-SAME{LITERAL}: <[[1, 2, 3, 4], [6, 7, 8, 5], [11, 12, 9, 10], [16, 13, 14, 15]]> : tensor<4x4xi16>
 // CHECK:      %[[BIAS:.*]] = arith.constant dense
 // CHECK-SAME{LITERAL}: <[[17], [18], [19], [20]]> : tensor<4x1xi16>
+// CHECK:      %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][0, 3] [4, 1] [1, 1]
 // CHECK:      %[[OUT:.*]] = secret.generic ins(%[[ARG]] : !secret.secret<tensor<4x1xi16>>)
 // CHECK:      ^body(%[[ARG_CONVERTED:.*]]: tensor<4x1xi16>):
 // CHECK:        %[[FOR_LOOP_OUT:.*]]:2 = affine.for %[[I:.*]] = 0 to 3 iter_args(%[[RUNNING_SUM:.*]] = %[[BIAS]], %[[ROTATED_VEC:.*]] = %[[ARG_CONVERTED]])
@@ -14,13 +15,12 @@
 // CHECK:        %[[UPDATED_SUM:.*]] = arith.addi %[[RUNNING_SUM]], %[[MUL]]
 // CHECK:        %[[UPDATED_ROTATED_VEC:.*]] = tensor_ext.rotate %[[ROTATED_VEC]], %[[ONE]]
 // CHECK:        affine.yield %[[UPDATED_SUM]], %[[UPDATED_ROTATED_VEC]]
-// CHECK:      %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][0, 3] [4, 1] [1, 1]
 // CHECK:      %[[LAST_MUL:.*]] = arith.muli %[[FOR_LOOP_OUT]]#1, %[[LAST_SLICE]]
 // CHECK:      %[[FINAL_SUM:.*]] = arith.addi %[[FOR_LOOP_OUT]]#0, %[[LAST_MUL]]
 // CHECK:      secret.yield %[[FINAL_SUM]]
 // CHECK:      return %[[OUT]]
 module {
-func.func @test_integer_square_matrix_vector_linalg_to_arith(%vec : !secret.secret<tensor<4x1xi16>>) -> !secret.secret<tensor<4x1xi16>> {
+func.func @test_integer_vector_square_matrix_matmul(%vec : !secret.secret<tensor<4x1xi16>>) -> !secret.secret<tensor<4x1xi16>> {
   %matrix = arith.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]> : tensor<4x4xi16>
   %bias = arith.constant dense<[[17], [18], [19], [20]]> : tensor<4x1xi16>
   %out = secret.generic ins (%vec : !secret.secret<tensor<4x1xi16>>) {

diff --git a/...alect/LinAlg/Conversions/linalg_to_tensor_ext/integer_vector_square_matrix_matmul_op.mlir b/...alect/LinAlg/Conversions/linalg_to_tensor_ext/integer_vector_square_matrix_matmul_op.mlir
@@ -1,11 +1,12 @@
-// RUN: heir-opt %s --linalg-to-tensor-ext | FileCheck %s
+// RUN: heir-opt %s --linalg-to-tensor-ext=tiling-size=4 --canonicalize | FileCheck %s
 
-// CHECK:      func.func @test_integer_vector_square_matrix_linalg_to_arith(%[[ARG:.*]]: !secret.secret<tensor<1x4xi16>>)
-// CHECK-DAG:  %[[ONE:.*]] = arith.constant 1 : index
-// CHECK:      %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
+// CHECK:       func.func @test_integer_vector_square_matrix_matmul(%[[ARG:.*]]: !secret.secret<tensor<1x4xi16>>)
+// CHECK-DAG:   %[[ONE:.*]] = arith.constant 1 : index
+// CHECK-DAG:   %[[DIAGONALIZED_MATRIX:.*]] = arith.constant dense
 // CHECK-SAME{LITERAL}: <[[1, 6, 11, 16], [5, 10, 15, 4], [9, 14, 3, 8], [13, 2, 7, 12]]> : tensor<4x4xi16>
-// CHECK:      %[[BIAS:.*]] = arith.constant dense
+// CHECK-DAG:   %[[BIAS:.*]] = arith.constant dense
 // CHECK-SAME{LITERAL}: <[[17, 18, 19, 20]]> : tensor<1x4xi16>
+// CHECK-DAG:  %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][3, 0] [1, 4] [1, 1]
 // CHECK:      %[[OUT:.*]] = secret.generic ins(%[[ARG]] : !secret.secret<tensor<1x4xi16>>)
 // CHECK:      ^body(%[[ARG_CONVERTED:.*]]: tensor<1x4xi16>):
 // CHECK:        %[[FOR_LOOP_OUT:.*]]:2 = affine.for %[[I:.*]] = 0 to 3 iter_args(%[[RUNNING_SUM:.*]] = %[[BIAS]], %[[ROTATED_VEC:.*]] = %[[ARG_CONVERTED]])
@@ -14,13 +15,12 @@
 // CHECK:        %[[UPDATED_SUM:.*]] = arith.addi %[[RUNNING_SUM]], %[[MUL]]
 // CHECK:        %[[UPDATED_ROTATED_VEC:.*]] = tensor_ext.rotate %[[ROTATED_VEC]], %[[ONE]]
 // CHECK:        affine.yield %[[UPDATED_SUM]], %[[UPDATED_ROTATED_VEC]]
-// CHECK:      %[[LAST_SLICE:.*]] = tensor.extract_slice %[[DIAGONALIZED_MATRIX]][3, 0] [1, 4] [1, 1]
 // CHECK:      %[[LAST_MUL:.*]] = arith.muli %[[FOR_LOOP_OUT]]#1, %[[LAST_SLICE]]
 // CHECK:      %[[FINAL_SUM:.*]] = arith.addi %[[FOR_LOOP_OUT]]#0, %[[LAST_MUL]]
 // CHECK:      secret.yield %[[FINAL_SUM]]
 // CHECK:      return %[[OUT]]
 module {
-func.func @test_integer_vector_square_matrix_linalg_to_arith(%vec : !secret.secret<tensor<1x4xi16>>) -> !secret.secret<tensor<1x4xi16>> {
+func.func @test_integer_vector_square_matrix_matmul(%vec : !secret.secret<tensor<1x4xi16>>) -> !secret.secret<tensor<1x4xi16>> {
   %matrix = arith.constant dense<[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]> : tensor<4x4xi16>
   %bias = arith.constant dense<[[17, 18, 19, 20]]> : tensor<1x4xi16>
   %out = secret.generic ins (%vec : !secret.secret<tensor<1x4xi16>>) {