From 161e366f1a9114271ba00acce9f0286d96497f80 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun@users.noreply.github.com>
Date: Mon, 8 Apr 2024 12:26:03 -0700
Subject: [PATCH] upgrade gx_kernel and roberts_cross to 64x64

---
 tests/heir_simd_vectorizer/BUILD              |  9 ++
 .../heir_simd_vectorizer/gx_kernel_64x64.mlir | 50 +++++++++++
 .../roberts_cross_64x64.mlir                  | 82 +++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 tests/heir_simd_vectorizer/gx_kernel_64x64.mlir
 create mode 100644 tests/heir_simd_vectorizer/roberts_cross_64x64.mlir

diff --git a/tests/heir_simd_vectorizer/BUILD b/tests/heir_simd_vectorizer/BUILD
index c9cee5978..105f06abc 100644
--- a/tests/heir_simd_vectorizer/BUILD
+++ b/tests/heir_simd_vectorizer/BUILD
@@ -11,6 +11,15 @@ glob_lit_tests(
     driver = "@heir//tests:run_lit.sh",
     size_override = {
         "box_blur_64x64.mlir": "large",
+        "roberts_cross_64x64.mlir": "enormous",
+        "gx_kernel_64x64.mlir": "large",
+    },
+    tags_override = {
+        "gx_kernel_64x64.mlir": [
+            "nofastbuild",
+            "notap",
+            "manual",
+        ],
     },
     test_file_exts = ["mlir"],
 )
diff --git a/tests/heir_simd_vectorizer/gx_kernel_64x64.mlir b/tests/heir_simd_vectorizer/gx_kernel_64x64.mlir
new file mode 100644
index 000000000..6a453417a
--- /dev/null
+++ b/tests/heir_simd_vectorizer/gx_kernel_64x64.mlir
@@ -0,0 +1,50 @@
+// Ported from https://github.com/MarbleHE/HECO/blob/ba027a51f4e0a376a19ca05b1dcc7ab76da78a3e/evaluation/comparison/heco_input/gxkernel_64x64.mlir
+
+// RUN: heir-opt --secretize=entry-function=gx_kernel --wrap-generic --canonicalize --cse \
+// RUN:   --heir-simd-vectorizer %s | FileCheck %s
+
+// CHECK-LABEL: @gx_kernel
+// CHECK: secret.generic
+// CHECK-COUNT-6: tensor_ext.rotate
+// CHECK-NOT: tensor_ext.rotate
+func.func @gx_kernel(%arg0: tensor<4096xi16>) -> tensor<4096xi16> {
+  // Weighted 3x3 neighbourhood sum over a 64x64 image stored in a flat tensor (index = 64 * x + y).
+  %c4096 = arith.constant 4096 : index
+  %c64 = arith.constant 64 : index
+  %c1_index = arith.constant 1 : index
+  %c0 = arith.constant 0 : i16
+  %c1 = arith.constant 1 : i16
+  %c2 = arith.constant 2 : i16
+  %cm1 = arith.constant -1 : i16
+  %cm2 = arith.constant -2 : i16
+  %weight_matrix = tensor.from_elements %c1, %cm1, %c2, %cm2, %c1, %cm1, %c0, %c0, %c0 : tensor<3x3xi16> // row-major: [[1, -1, 2], [-2, 1, -1], [0, 0, 0]]
+  %0 = affine.for %x = 0 to 64 iter_args(%arg0_x = %arg0) -> (tensor<4096xi16>) {
+    %1 = affine.for %y = 0 to 64 iter_args(%arg0_y = %arg0_x) -> (tensor<4096xi16>) {
+      %2 = affine.for %j = -1 to 2 iter_args(%value_j = %c0) -> (i16) { // running sum starts at zero
+        %acc = affine.for %i = -1 to 2 iter_args(%value_i = %value_j) -> (i16) {
+          %row = arith.addi %x, %i : index
+          %row_base = arith.muli %row, %c64 : index
+          %col = arith.addi %y, %j : index
+          %flat = arith.addi %row_base, %col : index
+          %wrapped = arith.remui %flat, %c4096 : index // keep the neighbour index inside [0, 4096)
+          %pixel = tensor.extract %arg0[%wrapped] : tensor<4096xi16> // always read the unmodified input
+          // Tap weight is weight_matrix[i + 1, j + 1]. NOTE(review): the literal is not the textbook Gx layout; kept as ported -- confirm.
+          %ip = arith.addi %i, %c1_index : index
+          %jp = arith.addi %j, %c1_index : index
+          %w = tensor.extract %weight_matrix[%ip, %jp] : tensor<3x3xi16>
+          %mul = arith.muli %pixel, %w : i16
+          %next = arith.addi %value_i, %mul : i16
+          affine.yield %next : i16
+        }
+        affine.yield %acc : i16
+      }
+      %out_base = arith.muli %c64, %x : index
+      %out_flat = arith.addi %out_base, %y : index
+      %out_idx = arith.remui %out_flat, %c4096 : index
+      %updated = tensor.insert %2 into %arg0_y[%out_idx] : tensor<4096xi16> // store the sum for pixel (x, y)
+      affine.yield %updated : tensor<4096xi16>
+    }
+    affine.yield %1 : tensor<4096xi16>
+  }
+  return %0 : tensor<4096xi16>
+}
diff --git a/tests/heir_simd_vectorizer/roberts_cross_64x64.mlir b/tests/heir_simd_vectorizer/roberts_cross_64x64.mlir
new file mode 100644
index 000000000..4fd1b015e
--- /dev/null
+++ b/tests/heir_simd_vectorizer/roberts_cross_64x64.mlir
@@ -0,0 +1,82 @@
+// Ported from https://github.com/MarbleHE/HECO/blob/3e13744233ab0c09030a41ef98b4e061b6fa2eac/evaluation/benchmark/heco_input/robertscross_64x64.mlir
+
+// RUN: heir-opt --secretize=entry-function=roberts_cross --wrap-generic --canonicalize --cse \
+// RUN:   --heir-simd-vectorizer %s | FileCheck %s
+
+module{
+  // Roberts cross on a 64x64 image stored row-major in a flat tensor (index = 64 * x + y).
+  // Each of the three shifted reads is expected to lower to one rotation of the whole tensor:
+  // 4031 = 4096 - 65 for img[x-1][y-1], 4032 = 4096 - 64 for img[x-1][y], 4095 = 4096 - 1 for img[x][y-1].
+  // CHECK-LABEL: @roberts_cross
+  // CHECK-SAME: (%[[arg0:.*]]: !secret.secret<tensor<4096xi16>>) -> !secret.secret<tensor<4096xi16>> {
+  // CHECK-DAG: %[[cLeft:.*]] = arith.constant 4095 : index
+  // CHECK-DAG: %[[cUp:.*]] = arith.constant 4032 : index
+  // CHECK-DAG: %[[cUpLeft:.*]] = arith.constant 4031 : index
+  // CHECK: secret.generic ins(%[[arg0]] : !secret.secret<tensor<4096xi16>>) {
+  // CHECK-NEXT:  ^bb0(%[[arg1:.*]]: tensor<4096xi16>):
+  // CHECK-DAG:     %[[upLeft:.*]] = tensor_ext.rotate %[[arg1]], %[[cUpLeft]]
+  // CHECK-DAG:     %[[up:.*]] = tensor_ext.rotate %[[arg1]], %[[cUp]]
+  // CHECK-DAG:     %[[left:.*]] = tensor_ext.rotate %[[arg1]], %[[cLeft]]
+  // CHECK-DAG:     %[[d1:.*]] = arith.subi %[[upLeft]], %[[arg1]]
+  // CHECK-DAG:     %[[d2:.*]] = arith.subi %[[up]], %[[left]]
+  // CHECK-DAG:     %[[sq1:.*]] = arith.muli %[[d1]], %[[d1]]
+  // CHECK-DAG:     %[[sq2:.*]] = arith.muli %[[d2]], %[[d2]]
+  // CHECK:         arith.addi %[[sq1]], %[[sq2]]
+  func.func @roberts_cross(%img: tensor<4096xi16>) -> tensor<4096xi16> {
+    %c4096 = arith.constant 4096 : index
+    %c64 = arith.constant 64 : index
+    %c-1 =  arith.constant -1 : index
+
+    // Each point p = img[x][y], where x is row and y is column, in the new image will equal:
+    // (img[x-1][y-1] - img[x][y])^2 + (img[x-1][y] - img[x][y-1])^2
+    // Every flat index is reduced with arith.remui by 4096 before the read, so the x-1 / y-1
+    // neighbours of the first row and column wrap around instead of indexing out of bounds.
+    %r = affine.for %x = 0 to 64 iter_args(%imgx = %img) -> tensor<4096xi16> {
+      %1 = affine.for %y = 0 to 64 iter_args(%imgy = %imgx) -> tensor<4096xi16> {
+
+        // fetch img[x-1][y-1]
+        %row_up = arith.addi %x, %c-1 : index
+        %row_up_base = arith.muli %row_up, %c64 : index
+        %col_left = arith.addi %y, %c-1 : index
+        %up_left_flat = arith.addi %row_up_base, %col_left : index
+        %up_left_idx = arith.remui %up_left_flat, %c4096 : index
+        %up_left = tensor.extract %img[%up_left_idx] : tensor<4096xi16>
+
+        // fetch img[x][y]
+        %row_base = arith.muli %x, %c64 : index
+        %center_flat = arith.addi %row_base, %y : index
+        %center_idx = arith.remui %center_flat, %c4096 : index
+        %center = tensor.extract %img[%center_idx] : tensor<4096xi16>
+
+        // subtract those two
+        %diag_diff = arith.subi %up_left, %center : i16
+
+        // fetch img[x-1][y]: previous row, unshifted column (reuses the row offset computed above)
+        %up_flat = arith.addi %row_up_base, %y : index
+        %up_idx = arith.remui %up_flat, %c4096 : index
+        %up = tensor.extract %img[%up_idx] : tensor<4096xi16>
+
+        // fetch img[x][y-1]: same row, previous column
+        %left_flat = arith.addi %row_base, %col_left : index
+        %left_idx = arith.remui %left_flat, %c4096 : index
+        %left = tensor.extract %img[%left_idx] : tensor<4096xi16>
+
+        // subtract those two
+        %anti_diff = arith.subi %up, %left : i16
+
+        // square each difference
+        %diag_sq = arith.muli %diag_diff, %diag_diff : i16
+        %anti_sq = arith.muli %anti_diff, %anti_diff : i16
+
+        // add the squares
+        %sum = arith.addi %diag_sq, %anti_sq : i16
+
+        // save to result[x][y]
+        %updated = tensor.insert %sum into %imgy[%center_idx] : tensor<4096xi16>
+        affine.yield %updated : tensor<4096xi16>
+      }
+      affine.yield %1 : tensor<4096xi16>
+    }
+    return %r : tensor<4096xi16>
+  }
+}