Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

transformations: Support constant inits in memref_stream.generic lowering [2/3] #2764

Merged
merged 22 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
8896dab
transformations: do not insert affine.apply ops when streaming
superlopuh Jun 20, 2024
f89d526
transformations: fix yielding of values in memref_stream.generic lowe…
superlopuh Jun 20, 2024
299de80
tests: move constant initialisation around in bottom-up tests
superlopuh Jun 21, 2024
2e5a024
dialects: (memref_stream) add an inits field to memref_stream.generic
superlopuh Jun 14, 2024
b2acbca
transformations: support constant inits in memref_stream.generic lowe…
superlopuh Jun 18, 2024
2fb3274
Merge remote-tracking branch 'origin/main' into sasha/memref_stream/y…
superlopuh Jun 21, 2024
c2a03e6
Merge branch 'sasha/memref_stream/yields' into sasha/memref_stream/bo…
superlopuh Jun 21, 2024
7f12d54
Merge branch 'sasha/memref_stream/bottom-up-constants' into sasha/mem…
superlopuh Jun 21, 2024
7b7f027
Merge branch 'sasha/memref_stream/const-init' into sasha/memref_strea…
superlopuh Jun 21, 2024
7378f6d
Merge branch 'main' into sasha/memref_stream/const-init
superlopuh Jun 21, 2024
3f70240
Merge branch 'main' into sasha/memref_stream/const-init
superlopuh Jun 23, 2024
333ce98
Merge branch 'sasha/memref_stream/const-init' into sasha/memref_strea…
superlopuh Jun 23, 2024
a207627
inits now values not attributes
superlopuh Jun 24, 2024
f9ae2aa
inits now values not attributes
superlopuh Jun 24, 2024
aed0cd4
straggling inits
superlopuh Jun 24, 2024
182f2c0
Merge branch 'sasha/memref_stream/const-init' into sasha/memref_strea…
superlopuh Jun 24, 2024
968c53d
fix printing
superlopuh Jun 24, 2024
81018b5
Merge branch 'sasha/memref_stream/const-init' into sasha/memref_strea…
superlopuh Jun 24, 2024
f114076
fix fix
superlopuh Jun 24, 2024
9ecbc32
fix fix fix
superlopuh Jun 24, 2024
73b0ec9
Merge branch 'sasha/memref_stream/const-init' into sasha/memref_strea…
superlopuh Jun 24, 2024
a80c473
Merge branch 'main' into sasha/memref_stream/const-init-lowering
superlopuh Jun 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 35 additions & 70 deletions tests/filecheck/projects/riscv-backend-paper/bottom_up.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,19 @@ func.func public @conv_2d_nchw_fchw_d1_s1_3x3(
]
} ins(%X, %Y : memref<1x1x8x8xf64>, memref<1x1x3x3xf64>) outs(%Z : memref<1x1x6x6xf64>) {
^0(%x_stream : !stream.readable<f64>, %y_stream : !stream.readable<f64>, %z_stream : !stream.writable<f64>):
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c6 = arith.constant 6 : index

scf.for %i0 = %c0 to %c1 step %c1 {
scf.for %i1 = %c0 to %c1 step %c1 {
scf.for %i2 = %c0 to %c6 step %c1 {
scf.for %i3 = %c0 to %c6 step %c1 {
%z = scf.for %i = %c0 to %c3 step %c1 iter_args(%acc0 = %zero_float) -> (f64) {
%z3 = scf.for %j = %c0 to %c3 step %c1 iter_args(%acc1 = %acc0) -> (f64) {
%x = memref_stream.read from %x_stream : f64
%y = memref_stream.read from %y_stream : f64
%prod = arith.mulf %x, %y fastmath<fast> : f64
%res = arith.addf %prod, %acc1 fastmath<fast> : f64
scf.yield %res : f64
}
scf.yield %z3 : f64
}

memref_stream.write %z to %z_stream : f64
}
}
}
memref_stream.generic {
bounds = [#builtin.int<1>, #builtin.int<1>, #builtin.int<6>, #builtin.int<6>, #builtin.int<1>, #builtin.int<3>, #builtin.int<3>],
indexing_maps = [
affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2 + d5, d3 + d6)>,
affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d4, d5, d6)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]
} ins(%x_stream, %y_stream : !stream.readable<f64>, !stream.readable<f64>) outs(%z_stream : !stream.writable<f64>) inits(%zero_float : f64) {
^0(%x : f64, %y : f64, %acc : f64):
%prod = arith.mulf %x, %y fastmath<fast> : f64
%res = arith.addf %prod, %acc fastmath<fast> : f64
memref_stream.yield %res : f64
Comment on lines -17 to +29
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😱 Amazing ❤️

}
}

Expand Down Expand Up @@ -413,29 +402,17 @@ func.func public @pooling_nchw_max_d1_s2_3x3(
]
} ins(%X : memref<1x1x16x16xf64>) outs(%Y : memref<1x1x7x7xf64>) {
^0(%x_stream : !stream.readable<f64>, %y_stream : !stream.writable<f64>):
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c7 = arith.constant 7 : index
%c512 = arith.constant 512 : index

scf.for %i0 = %c0 to %c1 step %c1 {
scf.for %i1 = %c0 to %c1 step %c1 {
scf.for %i2 = %c0 to %c7 step %c1 {
scf.for %i3 = %c0 to %c7 step %c1 {
%y = scf.for %i = %c0 to %c3 step %c1 iter_args(%acc0 = %min_val) -> (f64) {
%y3 = scf.for %j = %c0 to %c3 step %c1 iter_args(%acc1 = %acc0) -> (f64) {
%x = memref_stream.read from %x_stream : f64
%res = arith.maximumf %x, %acc1 : f64
scf.yield %res : f64
}
scf.yield %y3 : f64
}

memref_stream.write %y to %y_stream : f64
}
}
}
memref_stream.generic {
bounds = [#builtin.int<1>, #builtin.int<1>, #builtin.int<7>, #builtin.int<7>, #builtin.int<3>, #builtin.int<3>],
indexing_maps = [
affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2 * 2 + d4, d3 * 2 + d5)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]
} ins(%x_stream : !stream.readable<f64>) outs(%y_stream : !stream.writable<f64>) inits(%min_val : f64) {
^0(%x : f64, %acc : f64):
Comment on lines +412 to +413
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quick question on the semantics of this:

You have two reduction dimensions, but only one reduction parameter in the loop, and only one initial value. Is there a place I can read up on how this works exactly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this follows the linalg.generic quite closely. The dimensions and operands are related by the shapes of the operands. In this case, there are two reduction dimensions because the pooling happens over a 3x3 mini tile. The reduction dimensions are not present in the shapes of the ins or outs. I'd say the documentation for the linalg dialect and the linalg.generic op specifically is probably the best place to look.

%res = arith.maximumf %x, %acc : f64
memref_stream.yield %res : f64
}
}

Expand Down Expand Up @@ -545,29 +522,17 @@ func.func public @pooling_nchw_sum_d1_s2_3x3(
]
} ins(%X : memref<1x1x16x16xf64>) outs(%Y : memref<1x1x7x7xf64>) {
^0(%x_stream : !stream.readable<f64>, %y_stream : !stream.writable<f64>):
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c7 = arith.constant 7 : index
%c512 = arith.constant 512 : index

scf.for %i0 = %c0 to %c1 step %c1 {
scf.for %i1 = %c0 to %c1 step %c1 {
scf.for %i2 = %c0 to %c7 step %c1 {
scf.for %i3 = %c0 to %c7 step %c1 {
%y = scf.for %i = %c0 to %c3 step %c1 iter_args(%acc0 = %zero_float) -> (f64) {
%y3 = scf.for %j = %c0 to %c3 step %c1 iter_args(%acc1 = %acc0) -> (f64) {
%x = memref_stream.read from %x_stream : f64
%res = arith.addf %x, %acc1 : f64
scf.yield %res : f64
}
scf.yield %y3 : f64
}

memref_stream.write %y to %y_stream : f64
}
}
}
memref_stream.generic {
bounds = [#builtin.int<1>, #builtin.int<1>, #builtin.int<7>, #builtin.int<7>, #builtin.int<3>, #builtin.int<3>],
indexing_maps = [
affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2 * 2 + d4, d3 * 2 + d5)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]
} ins(%x_stream : !stream.readable<f64>) outs(%y_stream : !stream.writable<f64>) inits(%zero_float : f64) {
^0(%x : f64, %acc : f64):
%res = arith.addf %x, %acc : f64
memref_stream.yield %res : f64
}
}

Expand Down
51 changes: 51 additions & 0 deletions tests/filecheck/transforms/convert_memref_stream_to_loops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -255,4 +255,55 @@ func.func @nested_imperfect(%A : memref<2x3x4xf64>, %B : memref<f64>) -> memref<
// CHECK-NEXT: func.return %{{.*}} : memref<f64>
// CHECK-NEXT: }

func.func @main_inits(%A : memref<4x2xf64>, %B : memref<2x3xf64>, %C : memref<4x3xf64>) -> memref<4x3xf64> {
%zero_float = arith.constant 0.000000e+00 : f64
memref_stream.streaming_region {
patterns = [
#memref_stream.stride_pattern<ub = [4, 3, 2], index_map = (d0, d1, d2) -> (d0, d2)>,
#memref_stream.stride_pattern<ub = [4, 3, 2], index_map = (d0, d1, d2) -> (d2, d1)>
]
} ins(%A, %B : memref<4x2xf64>, memref<2x3xf64>) {
^0(%0 : !stream.readable<f64>, %1 : !stream.readable<f64>):
memref_stream.generic {
bounds = [#builtin.int<4>, #builtin.int<3>, #builtin.int<2>],
indexing_maps = [
affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d2, d1)>,
affine_map<(d0, d1) -> (d0, d1)>
],
iterator_types = ["parallel", "parallel", "reduction"]
} ins(%0, %1 : !stream.readable<f64>, !stream.readable<f64>) outs(%C : memref<4x3xf64>) inits(%zero_float : f64) {
^1(%a : f64, %b : f64, %acc_old : f64):
%prod = arith.mulf %a, %b : f64
%acc_new = arith.addf %acc_old, %prod : f64
memref_stream.yield %acc_new : f64
}
}
func.return %C : memref<4x3xf64>
}
// CHECK-NEXT: func.func @main_inits(%{{.*}} : memref<4x2xf64>, %{{.*}} : memref<2x3xf64>, %{{.*}} : memref<4x3xf64>) -> memref<4x3xf64> {
// CHECK-NEXT: %zero_float = arith.constant 0.000000e+00 : f64
// CHECK-NEXT: memref_stream.streaming_region {patterns = [#memref_stream.stride_pattern<ub = [4, 3, 2], index_map = (d0, d1, d2) -> (d0, d2)>, #memref_stream.stride_pattern<ub = [4, 3, 2], index_map = (d0, d1, d2) -> (d2, d1)>]} ins(%{{.*}}, %{{.*}} : memref<4x2xf64>, memref<2x3xf64>) {
// CHECK-NEXT: ^{{.*}}(%{{.*}} : !stream.readable<f64>, %{{.*}} : !stream.readable<f64>):
// CHECK-NEXT: %{{.*}} = arith.constant 4 : index
// CHECK-NEXT: %{{.*}} = arith.constant 3 : index
// CHECK-NEXT: %{{.*}} = arith.constant 2 : index
// CHECK-NEXT: %{{.*}} = arith.constant 0 : index
// CHECK-NEXT: %{{.*}} = arith.constant 1 : index
// CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
// CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
// CHECK-NEXT: %{{.*}} = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %zero_float) -> (f64) {
// CHECK-NEXT: %{{.*}} = memref_stream.read from %{{.*}} : f64
// CHECK-NEXT: %{{.*}} = memref_stream.read from %{{.*}} : f64
// CHECK-NEXT: %{{.*}} = arith.mulf %{{.*}}, %{{.*}} : f64
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f64
// CHECK-NEXT: scf.yield %{{.*}} : f64
// CHECK-NEXT: }
// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<4x3xf64>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: func.return %{{.*}} : memref<4x3xf64>
// CHECK-NEXT: }

// CHECK-NEXT: }
1 change: 1 addition & 0 deletions xdsl/transforms/convert_linalg_to_loops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@


def insert_load(
value_index: int,
value: SSAValue,
affine_map_attr: AffineMapAttr,
ind_vars: Sequence[SSAValue],
Expand Down
34 changes: 31 additions & 3 deletions xdsl/transforms/convert_memref_stream_to_loops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
)


def insert_load(
def _insert_load(
source_index: int,
source: SSAValue,
affine_map_attr: AffineMapAttr,
ind_vars: Sequence[SSAValue],
Expand Down Expand Up @@ -64,13 +65,40 @@ class LowerGenericOpPattern(RewritePattern):
def match_and_rewrite(
self, op: memref_stream.GenericOp, rewriter: PatternRewriter
) -> None:
ins_count = len(op.inputs)
if any(not isinstance(init, UnitAttr) for init in op.inits):
raise NotImplementedError("Operation has inits that are not UnitAttr")
constant_vals: list[SSAValue | None] = [None] * len(op.outputs)
for index, val in zip(op.init_indices, op.inits, strict=True):
constant_vals[index.data] = val

def insert_load(
source_index: int,
source: SSAValue,
affine_map_attr: AffineMapAttr,
ind_vars: Sequence[SSAValue],
rewriter: PatternRewriter,
insertion_point: InsertPoint,
) -> SSAValue:
if source_index >= ins_count:
constant_val = constant_vals[source_index - ins_count]
if constant_val is not None:
return constant_val

return _insert_load(
source_index,
source,
affine_map_attr,
ind_vars,
rewriter,
insertion_point,
)

else:
insert_load = _insert_load

outer_ubs, inner_ubs = op.get_static_loop_ranges()
if inner_ubs:
# Imperfectly nested
ins_count = len(op.inputs)
rewrite_generic_to_imperfect_loops(
rewriter,
InsertPoint.before(op),
Expand Down
9 changes: 7 additions & 2 deletions xdsl/transforms/loop_nest_lowering_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def indices_for_map(

INSERT_LOAD: TypeAlias = Callable[
[
int,
SSAValue,
AffineMapAttr,
Sequence[SSAValue],
Expand Down Expand Up @@ -161,6 +162,7 @@ def _insert_load_ops(
operands: Sequence[SSAValue],
args: Sequence[BlockArgument],
insert_load: INSERT_LOAD,
index_increment: int = 0,
) -> Sequence[tuple[int, SSAValue]]:
"""
Inserts the load operations at the specified insertion point.
Expand All @@ -172,6 +174,7 @@ def _insert_load_ops(
The `affine_map_attrs`, `operands`, and `args` must have the same length.
Returns a tuple of integers indicating the locations of the returned values, and
the values themselves.
The integer values are incremented by `index_increment`.
"""
res: list[tuple[int, SSAValue]] = []
for i, (affine_map_attr, operand, arg) in enumerate(
Expand All @@ -180,13 +183,14 @@ def _insert_load_ops(
if not arg.uses:
continue
res_val = insert_load(
i + index_increment,
operand,
affine_map_attr,
ind_vars,
rewriter,
insertion_point,
)
res.append((i, res_val))
res.append((i + index_increment, res_val))
return res


Expand Down Expand Up @@ -352,6 +356,7 @@ def outer_make_body(
outer_load_operands,
outer_load_block_args,
insert_load,
index_increment=len(inner_load_block_args),
)

def inner_make_body(
Expand All @@ -377,7 +382,7 @@ def inner_make_body(
inner_iter_args,
strict=True,
):
block.args[i + len(inner_loaded_values)].replace_by(arg)
block.args[i].replace_by(arg)

# Replace block argument use with load op results
for i, val in inner_loaded_values:
Expand Down
Loading