From dc745a28b8242b59bc311493161ef3e0797585a2 Mon Sep 17 00:00:00 2001 From: George Bisbas Date: Mon, 1 Jul 2024 12:09:10 +0100 Subject: [PATCH 1/3] compiler: Init params --- devito/ir/xdsl_iet/cluster_to_ssa.py | 2 +- devito/xdsl_core/xdsl_cpu.py | 9 +++++++-- tests/test_xdsl_passes.py | 3 +++ xdsl_examples/diffusion_3D_wBCs.py | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/devito/ir/xdsl_iet/cluster_to_ssa.py b/devito/ir/xdsl_iet/cluster_to_ssa.py index b0c8bd9e2f..5349153d9c 100644 --- a/devito/ir/xdsl_iet/cluster_to_ssa.py +++ b/devito/ir/xdsl_iet/cluster_to_ssa.py @@ -470,7 +470,7 @@ def build_time_loop( ) # Name the 'time' step iterator - assert step_dim.root.name is 'time' + assert step_dim.root.name == 'time' loop.body.block.args[0].name_hint = step_dim.root.name # Store for later reference self.symbol_values[step_dim.root.name] = loop.body.block.args[0] diff --git a/devito/xdsl_core/xdsl_cpu.py b/devito/xdsl_core/xdsl_cpu.py index 6240631e2b..27c5c6b812 100644 --- a/devito/xdsl_core/xdsl_cpu.py +++ b/devito/xdsl_core/xdsl_cpu.py @@ -36,6 +36,8 @@ class XdslnoopOperator(Cpu64OperatorMixin, CoreOperator): + # This operator needs more testing as we currently compare the starting + # initial generated code against the advanced one _Target = CTarget @classmethod @@ -442,9 +444,9 @@ def _jit_compile(self): cflags += ' -lmpi ' cc = "mpicc -cc=clang" if is_omp: - cflags += " -fopenmp " + cflags += " -fopenmp" if is_gpu: - cflags += " -lmlir_cuda_runtime " + cflags += " -lmlir_cuda_runtime" cflags += " -shared " @@ -652,6 +654,9 @@ def generate_tiling_arg(nb_tiled_dims: int): """ if nb_tiled_dims < 1: return 'parallel-loop-tile-sizes=0' + + # TOFIX: 64 is hardcoded, should be a parameter + # TOFIX: Zero is also hardcoded, should be a parameter return "parallel-loop-tile-sizes=" + ",".join(["64"]*nb_tiled_dims) + ",0" diff --git a/tests/test_xdsl_passes.py b/tests/test_xdsl_passes.py index 8cea242269..3526eb9501 100644 --- a/tests/test_xdsl_passes.py +++ b/tests/test_xdsl_passes.py @@ -31,6 +31,8 @@ def test_xdsl_noop_structure(): eq = Eq(u.forward, u + 1) op1 = Operator([eq], opt='xdsl-noop') + op1.apply(time_M=1) + op2 = Operator([eq], opt='xdsl') assert Printer().print(op1._module) == Printer().print(op2._module) @@ -58,6 +60,7 @@ def test_acoustic_3D(shape, so, to, nt): op1 = Operator([stencil], opt='xdsl-noop') op2 = Operator([stencil], opt='xdsl') + # We here test only the initial code, so not really useful assert Printer().print(op1._module) == Printer().print(op2._module) diff --git a/xdsl_examples/diffusion_3D_wBCs.py b/xdsl_examples/diffusion_3D_wBCs.py index a37738eb6f..835d3c3e74 100644 --- a/xdsl_examples/diffusion_3D_wBCs.py +++ b/xdsl_examples/diffusion_3D_wBCs.py @@ -82,6 +82,7 @@ u.data[:, :, :, :] = 0 u.data[:, :, :, int(nz/2)] = 1 op = Operator([eq_stencil], name='xDSLDiffusionOperator', opt='xdsl') + # Apply the xdsl operator for a number of timesteps op.apply(time=nt, dt=dt, a=nu) print("XDSL Field norm is:", norm(u)) From db6e5b0240b2922a46d987e49f56cce0c6a7952b Mon Sep 17 00:00:00 2001 From: George Bisbas Date: Mon, 1 Jul 2024 15:46:26 +0100 Subject: [PATCH 2/3] compiler: parametrize collapsed loops --- devito/xdsl_core/xdsl_cpu.py | 24 +++++++++++++++++++++--- tests/test_xdsl_passes.py | 2 ++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/devito/xdsl_core/xdsl_cpu.py b/devito/xdsl_core/xdsl_cpu.py index 27c5c6b812..cd86fc0d9c 100644 --- a/devito/xdsl_core/xdsl_cpu.py +++ b/devito/xdsl_core/xdsl_cpu.py @@ -402,7 +402,9 @@ def _jit_compile(self): mlir_pipeline = generate_MLIR_CPU_PIPELINE() if is_omp: - mlir_pipeline = generate_MLIR_OPENMP_PIPELINE() + # We collapse as many loops as we tile + kwargs = {'num_loops': to_tile} + mlir_pipeline = generate_MLIR_OPENMP_PIPELINE(kwargs) if is_mpi: shape, _ = self.mpi_shape @@ -554,7 +556,9 @@ def generate_MLIR_CPU_noop_PIPELINE(): return generate_mlir_pipeline(passes) -def generate_MLIR_OPENMP_PIPELINE(): +def generate_MLIR_OPENMP_PIPELINE(kwargs): + num_loops = kwargs.get('num_loops') + return generate_pipeline([ generate_mlir_pipeline([ "canonicalize", @@ -574,7 +578,7 @@ def generate_MLIR_OPENMP_PIPELINE(): # "canonicalize", # "cse", ]), - "convert-scf-to-openmp{collapse=1}", + f"convert-scf-to-openmp{{{generate_collapse_arg(num_loops)}}}", generate_mlir_pipeline([ "finalize-memref-to-llvm", "convert-scf-to-cf" @@ -660,6 +664,20 @@ def generate_tiling_arg(nb_tiled_dims: int): return "parallel-loop-tile-sizes=" + ",".join(["64"]*nb_tiled_dims) + ",0" +def generate_collapse_arg(num_loops: int): + """ + Generate the number of loops that will be collapsed + Resort to 1 if no number of loops is provided + """ + + if num_loops < 1: + num_loops = 1 + + ret_arg = "collapse=" + "".join(str(num_loops)) # noqa + + return ret_arg + + def get_arg_names_from_module(op): return [ str_attr.name_hint for str_attr in op.body.block.ops.first.body.block.args # noqa diff --git a/tests/test_xdsl_passes.py b/tests/test_xdsl_passes.py index 3526eb9501..9fce974b8d 100644 --- a/tests/test_xdsl_passes.py +++ b/tests/test_xdsl_passes.py @@ -34,6 +34,8 @@ def test_xdsl_noop_structure(): op1.apply(time_M=1) op2 = Operator([eq], opt='xdsl') + op2.apply(time_M=1) + # No correctness check, just running assert Printer().print(op1._module) == Printer().print(op2._module) From e43eef3a135a32e127ff2a239d52f1a95b8e1cb3 Mon Sep 17 00:00:00 2001 From: George Bisbas Date: Mon, 1 Jul 2024 15:54:22 +0100 Subject: [PATCH 3/3] tests: Add xdsl_pipeline_mlir_openmp --- tests/filecheck/xdsl_pipeline_openmp.mlir | 404 ++++++++++++++++++++++ 1 file changed, 404 insertions(+) create mode 100644 tests/filecheck/xdsl_pipeline_openmp.mlir diff --git a/tests/filecheck/xdsl_pipeline_openmp.mlir b/tests/filecheck/xdsl_pipeline_openmp.mlir new file mode 100644 index 0000000000..85005e4770 --- /dev/null +++ b/tests/filecheck/xdsl_pipeline_openmp.mlir @@ -0,0 +1,404 @@ +// RUN: xdsl-opt -p "canonicalize" %s | filecheck %s + +builtin.module { + func.func @xDSLDiffusionOperator(%u_vec0 : memref<158x158x158xf32>, %u_vec1 : memref<158x158x158xf32>, %timers : !llvm.ptr) { + %0 = func.call @timer_start() : () -> f64 + %time_m = arith.constant 0 : index + %time_M = arith.constant 250 : index + %1 = arith.constant 1 : index + %2 = arith.addi %time_M, %1 : index + %step = arith.constant 1 : index + %3, %4 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1) -> (memref<158x158x158xf32>, memref<158x158x158xf32>) { + %u_t1_storeview = "memref.subview"(%u_t1) <{"static_offsets" = array, "static_sizes" = array, "static_strides" = array, "operandSegmentSizes" = array}> : (memref<158x158x158xf32>) -> memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>> + %u_t0_loadview = "memref.subview"(%u_t0) <{"static_offsets" = array, "static_sizes" = array, "static_strides" = array, "operandSegmentSizes" = array}> : (memref<158x158x158xf32>) -> memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %5 = arith.constant 0 : index + %6 = arith.constant 0 : index + %7 = arith.constant 0 : index + %8 = arith.constant 1 : index + %9 = arith.constant 1 : index + %10 = arith.constant 1 : index + %11 = arith.constant 150 : index + %12 = arith.constant 150 : index + %13 = arith.constant 150 : index + %14 = arith.constant 0 : index + %15 = arith.constant 64 : index + %16 = arith.constant 64 : index + %17 = arith.muli %8, %15 : index + %18 = arith.muli %9, %16 : index + "scf.parallel"(%5, %6, %11, %12, %17, %18) <{"operandSegmentSizes" = array}> ({ + ^0(%19 : index, %20 : index): + %21 = "affine.min"(%15, %11, %19) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index + %22 = "affine.min"(%16, %12, %20) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index + "scf.parallel"(%14, %14, %7, %21, %22, %13, %8, %9, %10) <{"operandSegmentSizes" = array}> ({ + ^1(%23 : index, %24 : index, %25 : index): + %26 = arith.addi %19, %23 : index + %27 = arith.addi %20, %24 : index + %dt = arith.constant 6.717825e-07 : f32 + %28 = arith.constant -1 : i64 + %29 = "math.fpowi"(%dt, %28) : (f32, i64) -> f32 + %30 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %31 = arith.mulf %29, %30 : f32 + %32 = arith.constant 1.333333e+00 : f32 + %h_x = arith.constant 1.342282e-02 : f32 + %33 = arith.constant -2 : i64 + %34 = "math.fpowi"(%h_x, %33) : (f32, i64) -> f32 + %35 = arith.constant -1 : index + %36 = arith.addi %26, %35 : index + %37 = memref.load %u_t0_loadview[%36, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %38 = arith.mulf %32, %34 : f32 + %39 = arith.mulf %38, %37 : f32 + %40 = arith.constant 1.333333e+00 : f32 + %h_x_1 = arith.constant 1.342282e-02 : f32 + %41 = arith.constant -2 : i64 + %42 = "math.fpowi"(%h_x_1, %41) : (f32, i64) -> f32 + %43 = arith.constant 1 : index + %44 = arith.addi %26, %43 : index + %45 = memref.load %u_t0_loadview[%44, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %46 = arith.mulf %40, %42 : f32 + %47 = arith.mulf %46, %45 : f32 + %48 = arith.constant -2.500000e+00 : f32 + %h_x_2 = arith.constant 1.342282e-02 : f32 + %49 = arith.constant -2 : i64 + %50 = "math.fpowi"(%h_x_2, %49) : (f32, i64) -> f32 + %51 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %52 = arith.mulf %48, %50 : f32 + %53 = arith.mulf %52, %51 : f32 + %54 = arith.constant -8.333333e-02 : f32 + %h_x_3 = arith.constant 1.342282e-02 : f32 + %55 = arith.constant -2 : i64 + %56 = "math.fpowi"(%h_x_3, %55) : (f32, i64) -> f32 + %57 = arith.constant -2 : index + %58 = arith.addi %26, %57 : index + %59 = memref.load %u_t0_loadview[%58, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %60 = arith.mulf %54, %56 : f32 + %61 = arith.mulf %60, %59 : f32 + %62 = arith.constant -8.333333e-02 : f32 + %h_x_4 = arith.constant 1.342282e-02 : f32 + %63 = arith.constant -2 : i64 + %64 = "math.fpowi"(%h_x_4, %63) : (f32, i64) -> f32 + %65 = arith.constant 2 : index + %66 = arith.addi %26, %65 : index + %67 = memref.load %u_t0_loadview[%66, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %68 = arith.mulf %62, %64 : f32 + %69 = arith.mulf %68, %67 : f32 + %70 = arith.addf %39, %47 : f32 + %71 = arith.addf %70, %53 : f32 + %72 = arith.addf %71, %61 : f32 + %73 = arith.addf %72, %69 : f32 + %74 = arith.constant 1.333333e+00 : f32 + %h_y = arith.constant 1.342282e-02 : f32 + %75 = arith.constant -2 : i64 + %76 = "math.fpowi"(%h_y, %75) : (f32, i64) -> f32 + %77 = arith.constant -1 : index + %78 = arith.addi %27, %77 : index + %79 = memref.load %u_t0_loadview[%26, %78, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %80 = arith.mulf %74, %76 : f32 + %81 = arith.mulf %80, %79 : f32 + %82 = arith.constant 1.333333e+00 : f32 + %h_y_1 = arith.constant 1.342282e-02 : f32 + %83 = arith.constant -2 : i64 + %84 = "math.fpowi"(%h_y_1, %83) : (f32, i64) -> f32 + %85 = arith.constant 1 : index + %86 = arith.addi %27, %85 : index + %87 = memref.load %u_t0_loadview[%26, %86, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %88 = arith.mulf %82, %84 : f32 + %89 = arith.mulf %88, %87 : f32 + %90 = arith.constant -2.500000e+00 : f32 + %h_y_2 = arith.constant 1.342282e-02 : f32 + %91 = arith.constant -2 : i64 + %92 = "math.fpowi"(%h_y_2, %91) : (f32, i64) -> f32 + %93 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %94 = arith.mulf %90, %92 : f32 + %95 = arith.mulf %94, %93 : f32 + %96 = arith.constant -8.333333e-02 : f32 + %h_y_3 = arith.constant 1.342282e-02 : f32 + %97 = arith.constant -2 : i64 + %98 = "math.fpowi"(%h_y_3, %97) : (f32, i64) -> f32 + %99 = arith.constant -2 : index + %100 = arith.addi %27, %99 : index + %101 = memref.load %u_t0_loadview[%26, %100, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %102 = arith.mulf %96, %98 : f32 + %103 = arith.mulf %102, %101 : f32 + %104 = arith.constant -8.333333e-02 : f32 + %h_y_4 = arith.constant 1.342282e-02 : f32 + %105 = arith.constant -2 : i64 + %106 = "math.fpowi"(%h_y_4, %105) : (f32, i64) -> f32 + %107 = arith.constant 2 : index + %108 = arith.addi %27, %107 : index + %109 = memref.load %u_t0_loadview[%26, %108, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %110 = arith.mulf %104, %106 : f32 + %111 = arith.mulf %110, %109 : f32 + %112 = arith.addf %81, %89 : f32 + %113 = arith.addf %112, %95 : f32 + %114 = arith.addf %113, %103 : f32 + %115 = arith.addf %114, %111 : f32 + %116 = arith.constant 1.333333e+00 : f32 + %h_z = arith.constant 1.342282e-02 : f32 + %117 = arith.constant -2 : i64 + %118 = "math.fpowi"(%h_z, %117) : (f32, i64) -> f32 + %119 = arith.constant -1 : index + %120 = arith.addi %25, %119 : index + %121 = memref.load %u_t0_loadview[%26, %27, %120] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %122 = arith.mulf %116, %118 : f32 + %123 = arith.mulf %122, %121 : f32 + %124 = arith.constant 1.333333e+00 : f32 + %h_z_1 = arith.constant 1.342282e-02 : f32 + %125 = arith.constant -2 : i64 + %126 = "math.fpowi"(%h_z_1, %125) : (f32, i64) -> f32 + %127 = arith.constant 1 : index + %128 = arith.addi %25, %127 : index + %129 = memref.load %u_t0_loadview[%26, %27, %128] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %130 = arith.mulf %124, %126 : f32 + %131 = arith.mulf %130, %129 : f32 + %132 = arith.constant -2.500000e+00 : f32 + %h_z_2 = arith.constant 1.342282e-02 : f32 + %133 = arith.constant -2 : i64 + %134 = "math.fpowi"(%h_z_2, %133) : (f32, i64) -> f32 + %135 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %136 = arith.mulf %132, %134 : f32 + %137 = arith.mulf %136, %135 : f32 + %138 = arith.constant -8.333333e-02 : f32 + %h_z_3 = arith.constant 1.342282e-02 : f32 + %139 = arith.constant -2 : i64 + %140 = "math.fpowi"(%h_z_3, %139) : (f32, i64) -> f32 + %141 = arith.constant -2 : index + %142 = arith.addi %25, %141 : index + %143 = memref.load %u_t0_loadview[%26, %27, %142] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %144 = arith.mulf %138, %140 : f32 + %145 = arith.mulf %144, %143 : f32 + %146 = arith.constant -8.333333e-02 : f32 + %h_z_4 = arith.constant 1.342282e-02 : f32 + %147 = arith.constant -2 : i64 + %148 = "math.fpowi"(%h_z_4, %147) : (f32, i64) -> f32 + %149 = arith.constant 2 : index + %150 = arith.addi %25, %149 : index + %151 = memref.load %u_t0_loadview[%26, %27, %150] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> + %152 = arith.mulf %146, %148 : f32 + %153 = arith.mulf %152, %151 : f32 + %154 = arith.addf %123, %131 : f32 + %155 = arith.addf %154, %137 : f32 + %156 = arith.addf %155, %145 : f32 + %157 = arith.addf %156, %153 : f32 + %158 = arith.addf %73, %115 : f32 + %159 = arith.addf %158, %157 : f32 + %a = arith.constant 9.000000e-01 : f32 + %160 = arith.mulf %159, %a : f32 + %161 = arith.addf %31, %160 : f32 + %dt_1 = arith.constant 6.717825e-07 : f32 + %162 = arith.mulf %161, %dt_1 : f32 + memref.store %162, %u_t1_storeview[%26, %27, %25] : memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>> + scf.yield + }) : (index, index, index, index, index, index, index, index, index) -> () + scf.yield + }) : (index, index, index, index, index, index) -> () + scf.yield %u_t1, %u_t0 : memref<158x158x158xf32>, memref<158x158x158xf32> + } + %163 = func.call @timer_end(%0) : (f64) -> f64 + "llvm.store"(%163, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> () + func.return + } + func.func private @timer_start() -> f64 + func.func private @timer_end(f64) -> f64 +} + + +// CHECK: builtin.module { +// CHECK-NEXT: func.func @xDSLDiffusionOperator(%u_vec0 : memref<158x158x158xf32>, %u_vec1 : memref<158x158x158xf32>, %timers : !llvm.ptr) { +// CHECK-NEXT: %0 = func.call @timer_start() : () -> f64 +// CHECK-NEXT: %time_m = arith.constant 0 : index +// CHECK-NEXT: %time_M = arith.constant 250 : index +// CHECK-NEXT: %1 = arith.constant 1 : index +// CHECK-NEXT: %2 = arith.addi %time_M, %1 : index +// CHECK-NEXT: %step = arith.constant 1 : index +// CHECK-NEXT: %3, %4 = scf.for %time = %time_m to %2 step %step iter_args(%u_t0 = %u_vec0, %u_t1 = %u_vec1) -> (memref<158x158x158xf32>, memref<158x158x158xf32>) { +// CHECK-NEXT: %u_t1_storeview = "memref.subview"(%u_t1) <{"static_offsets" = array, "static_sizes" = array, "static_strides" = array, "operandSegmentSizes" = array}> : (memref<158x158x158xf32>) -> memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %u_t0_loadview = "memref.subview"(%u_t0) <{"static_offsets" = array, "static_sizes" = array, "static_strides" = array, "operandSegmentSizes" = array}> : (memref<158x158x158xf32>) -> memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %5 = arith.constant 0 : index +// CHECK-NEXT: %6 = arith.constant 0 : index +// CHECK-NEXT: %7 = arith.constant 0 : index +// CHECK-NEXT: %8 = arith.constant 1 : index +// CHECK-NEXT: %9 = arith.constant 1 : index +// CHECK-NEXT: %10 = arith.constant 1 : index +// CHECK-NEXT: %11 = arith.constant 150 : index +// CHECK-NEXT: %12 = arith.constant 150 : index +// CHECK-NEXT: %13 = arith.constant 150 : index +// CHECK-NEXT: %14 = arith.constant 0 : index +// CHECK-NEXT: %15 = arith.constant 64 : index +// CHECK-NEXT: %16 = arith.constant 64 : index +// CHECK-NEXT: %17 = arith.muli %8, %15 : index +// CHECK-NEXT: %18 = arith.muli %9, %16 : index +// CHECK-NEXT: "scf.parallel"(%5, %6, %11, %12, %17, %18) <{"operandSegmentSizes" = array}> ({ +// CHECK-NEXT: ^0(%19 : index, %20 : index): +// CHECK-NEXT: %21 = "affine.min"(%15, %11, %19) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index +// CHECK-NEXT: %22 = "affine.min"(%16, %12, %20) <{"map" = affine_map<(d0, d1, d2) -> (d0, (d1 + (d2 * -1)))>}> : (index, index, index) -> index +// CHECK-NEXT: "scf.parallel"(%14, %14, %7, %21, %22, %13, %8, %9, %10) <{"operandSegmentSizes" = array}> ({ +// CHECK-NEXT: ^1(%23 : index, %24 : index, %25 : index): +// CHECK-NEXT: %26 = arith.addi %19, %23 : index +// CHECK-NEXT: %27 = arith.addi %20, %24 : index +// CHECK-NEXT: %dt = arith.constant 6.717825e-07 : f32 +// CHECK-NEXT: %28 = arith.constant -1 : i64 +// CHECK-NEXT: %29 = "math.fpowi"(%dt, %28) : (f32, i64) -> f32 +// CHECK-NEXT: %30 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %31 = arith.mulf %29, %30 : f32 +// CHECK-NEXT: %32 = arith.constant 1.333333e+00 : f32 +// CHECK-NEXT: %h_x = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %33 = arith.constant -2 : i64 +// CHECK-NEXT: %34 = "math.fpowi"(%h_x, %33) : (f32, i64) -> f32 +// CHECK-NEXT: %35 = arith.constant -1 : index +// CHECK-NEXT: %36 = arith.addi %26, %35 : index +// CHECK-NEXT: %37 = memref.load %u_t0_loadview[%36, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %38 = arith.mulf %32, %34 : f32 +// CHECK-NEXT: %39 = arith.mulf %38, %37 : f32 +// CHECK-NEXT: %40 = arith.constant 1.333333e+00 : f32 +// CHECK-NEXT: %h_x_1 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %41 = arith.constant -2 : i64 +// CHECK-NEXT: %42 = "math.fpowi"(%h_x_1, %41) : (f32, i64) -> f32 +// CHECK-NEXT: %43 = arith.constant 1 : index +// CHECK-NEXT: %44 = arith.addi %26, %43 : index +// CHECK-NEXT: %45 = memref.load %u_t0_loadview[%44, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %46 = arith.mulf %40, %42 : f32 +// CHECK-NEXT: %47 = arith.mulf %46, %45 : f32 +// CHECK-NEXT: %48 = arith.constant -2.500000e+00 : f32 +// CHECK-NEXT: %h_x_2 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %49 = arith.constant -2 : i64 +// CHECK-NEXT: %50 = "math.fpowi"(%h_x_2, %49) : (f32, i64) -> f32 +// CHECK-NEXT: %51 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %52 = arith.mulf %48, %50 : f32 +// CHECK-NEXT: %53 = arith.mulf %52, %51 : f32 +// CHECK-NEXT: %54 = arith.constant -8.333333e-02 : f32 +// CHECK-NEXT: %h_x_3 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %55 = arith.constant -2 : i64 +// CHECK-NEXT: %56 = "math.fpowi"(%h_x_3, %55) : (f32, i64) -> f32 +// CHECK-NEXT: %57 = arith.constant -2 : index +// CHECK-NEXT: %58 = arith.addi %26, %57 : index +// CHECK-NEXT: %59 = memref.load %u_t0_loadview[%58, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %60 = arith.mulf %54, %56 : f32 +// CHECK-NEXT: %61 = arith.mulf %60, %59 : f32 +// CHECK-NEXT: %62 = arith.constant -8.333333e-02 : f32 +// CHECK-NEXT: %h_x_4 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %63 = arith.constant -2 : i64 +// CHECK-NEXT: %64 = "math.fpowi"(%h_x_4, %63) : (f32, i64) -> f32 +// CHECK-NEXT: %65 = arith.constant 2 : index +// CHECK-NEXT: %66 = arith.addi %26, %65 : index +// CHECK-NEXT: %67 = memref.load %u_t0_loadview[%66, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %68 = arith.mulf %62, %64 : f32 +// CHECK-NEXT: %69 = arith.mulf %68, %67 : f32 +// CHECK-NEXT: %70 = arith.addf %39, %47 : f32 +// CHECK-NEXT: %71 = arith.addf %70, %53 : f32 +// CHECK-NEXT: %72 = arith.addf %71, %61 : f32 +// CHECK-NEXT: %73 = arith.addf %72, %69 : f32 +// CHECK-NEXT: %74 = arith.constant 1.333333e+00 : f32 +// CHECK-NEXT: %h_y = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %75 = arith.constant -2 : i64 +// CHECK-NEXT: %76 = "math.fpowi"(%h_y, %75) : (f32, i64) -> f32 +// CHECK-NEXT: %77 = arith.constant -1 : index +// CHECK-NEXT: %78 = arith.addi %27, %77 : index +// CHECK-NEXT: %79 = memref.load %u_t0_loadview[%26, %78, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %80 = arith.mulf %74, %76 : f32 +// CHECK-NEXT: %81 = arith.mulf %80, %79 : f32 +// CHECK-NEXT: %82 = arith.constant 1.333333e+00 : f32 +// CHECK-NEXT: %h_y_1 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %83 = arith.constant -2 : i64 +// CHECK-NEXT: %84 = "math.fpowi"(%h_y_1, %83) : (f32, i64) -> f32 +// CHECK-NEXT: %85 = arith.constant 1 : index +// CHECK-NEXT: %86 = arith.addi %27, %85 : index +// CHECK-NEXT: %87 = memref.load %u_t0_loadview[%26, %86, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %88 = arith.mulf %82, %84 : f32 +// CHECK-NEXT: %89 = arith.mulf %88, %87 : f32 +// CHECK-NEXT: %90 = arith.constant -2.500000e+00 : f32 +// CHECK-NEXT: %h_y_2 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %91 = arith.constant -2 : i64 +// CHECK-NEXT: %92 = "math.fpowi"(%h_y_2, %91) : (f32, i64) -> f32 +// CHECK-NEXT: %93 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %94 = arith.mulf %90, %92 : f32 +// CHECK-NEXT: %95 = arith.mulf %94, %93 : f32 +// CHECK-NEXT: %96 = arith.constant -8.333333e-02 : f32 +// CHECK-NEXT: %h_y_3 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %97 = arith.constant -2 : i64 +// CHECK-NEXT: %98 = "math.fpowi"(%h_y_3, %97) : (f32, i64) -> f32 +// CHECK-NEXT: %99 = arith.constant -2 : index +// CHECK-NEXT: %100 = arith.addi %27, %99 : index +// CHECK-NEXT: %101 = memref.load %u_t0_loadview[%26, %100, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %102 = arith.mulf %96, %98 : f32 +// CHECK-NEXT: %103 = arith.mulf %102, %101 : f32 +// CHECK-NEXT: %104 = arith.constant -8.333333e-02 : f32 +// CHECK-NEXT: %h_y_4 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %105 = arith.constant -2 : i64 +// CHECK-NEXT: %106 = "math.fpowi"(%h_y_4, %105) : (f32, i64) -> f32 +// CHECK-NEXT: %107 = arith.constant 2 : index +// CHECK-NEXT: %108 = arith.addi %27, %107 : index +// CHECK-NEXT: %109 = memref.load %u_t0_loadview[%26, %108, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %110 = arith.mulf %104, %106 : f32 +// CHECK-NEXT: %111 = arith.mulf %110, %109 : f32 +// CHECK-NEXT: %112 = arith.addf %81, %89 : f32 +// CHECK-NEXT: %113 = arith.addf %112, %95 : f32 +// CHECK-NEXT: %114 = arith.addf %113, %103 : f32 +// CHECK-NEXT: %115 = arith.addf %114, %111 : f32 +// CHECK-NEXT: %116 = arith.constant 1.333333e+00 : f32 +// CHECK-NEXT: %h_z = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %117 = arith.constant -2 : i64 +// CHECK-NEXT: %118 = "math.fpowi"(%h_z, %117) : (f32, i64) -> f32 +// CHECK-NEXT: %119 = arith.constant -1 : index +// CHECK-NEXT: %120 = arith.addi %25, %119 : index +// CHECK-NEXT: %121 = memref.load %u_t0_loadview[%26, %27, %120] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %122 = arith.mulf %116, %118 : f32 +// CHECK-NEXT: %123 = arith.mulf %122, %121 : f32 +// CHECK-NEXT: %124 = arith.constant 1.333333e+00 : f32 +// CHECK-NEXT: %h_z_1 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %125 = arith.constant -2 : i64 +// CHECK-NEXT: %126 = "math.fpowi"(%h_z_1, %125) : (f32, i64) -> f32 +// CHECK-NEXT: %127 = arith.constant 1 : index +// CHECK-NEXT: %128 = arith.addi %25, %127 : index +// CHECK-NEXT: %129 = memref.load %u_t0_loadview[%26, %27, %128] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %130 = arith.mulf %124, %126 : f32 +// CHECK-NEXT: %131 = arith.mulf %130, %129 : f32 +// CHECK-NEXT: %132 = arith.constant -2.500000e+00 : f32 +// CHECK-NEXT: %h_z_2 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %133 = arith.constant -2 : i64 +// CHECK-NEXT: %134 = "math.fpowi"(%h_z_2, %133) : (f32, i64) -> f32 +// CHECK-NEXT: %135 = memref.load %u_t0_loadview[%26, %27, %25] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %136 = arith.mulf %132, %134 : f32 +// CHECK-NEXT: %137 = arith.mulf %136, %135 : f32 +// CHECK-NEXT: %138 = arith.constant -8.333333e-02 : f32 +// CHECK-NEXT: %h_z_3 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %139 = arith.constant -2 : i64 +// CHECK-NEXT: %140 = "math.fpowi"(%h_z_3, %139) : (f32, i64) -> f32 +// CHECK-NEXT: %141 = arith.constant -2 : index +// CHECK-NEXT: %142 = arith.addi %25, %141 : index +// CHECK-NEXT: %143 = memref.load %u_t0_loadview[%26, %27, %142] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %144 = arith.mulf %138, %140 : f32 +// CHECK-NEXT: %145 = arith.mulf %144, %143 : f32 +// CHECK-NEXT: %146 = arith.constant -8.333333e-02 : f32 +// CHECK-NEXT: %h_z_4 = arith.constant 1.342282e-02 : f32 +// CHECK-NEXT: %147 = arith.constant -2 : i64 +// CHECK-NEXT: %148 = "math.fpowi"(%h_z_4, %147) : (f32, i64) -> f32 +// CHECK-NEXT: %149 = arith.constant 2 : index +// CHECK-NEXT: %150 = arith.addi %25, %149 : index +// CHECK-NEXT: %151 = memref.load %u_t0_loadview[%26, %27, %150] : memref<154x154x154xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: %152 = arith.mulf %146, %148 : f32 +// CHECK-NEXT: %153 = arith.mulf %152, %151 : f32 +// CHECK-NEXT: %154 = arith.addf %123, %131 : f32 +// CHECK-NEXT: %155 = arith.addf %154, %137 : f32 +// CHECK-NEXT: %156 = arith.addf %155, %145 : f32 +// CHECK-NEXT: %157 = arith.addf %156, %153 : f32 +// CHECK-NEXT: %158 = arith.addf %73, %115 : f32 +// CHECK-NEXT: %159 = arith.addf %158, %157 : f32 +// CHECK-NEXT: %a = arith.constant 9.000000e-01 : f32 +// CHECK-NEXT: %160 = arith.mulf %159, %a : f32 +// CHECK-NEXT: %161 = arith.addf %31, %160 : f32 +// CHECK-NEXT: %dt_1 = arith.constant 6.717825e-07 : f32 +// CHECK-NEXT: %162 = arith.mulf %161, %dt_1 : f32 +// CHECK-NEXT: memref.store %162, %u_t1_storeview[%26, %27, %25] : memref<150x150x150xf32, strided<[24964, 158, 1], offset: 100492>> +// CHECK-NEXT: scf.yield +// CHECK-NEXT: }) : (index, index, index, index, index, index, index, index, index) -> () +// CHECK-NEXT: scf.yield +// CHECK-NEXT: }) : (index, index, index, index, index, index) -> () +// CHECK-NEXT: scf.yield %u_t1, %u_t0 : memref<158x158x158xf32>, memref<158x158x158xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %163 = func.call @timer_end(%0) : (f64) -> f64 +// CHECK-NEXT: "llvm.store"(%163, %timers) <{"ordering" = 0 : i64}> : (f64, !llvm.ptr) -> () +// CHECK-NEXT: func.return +// CHECK-NEXT: } +// CHECK-NEXT: func.func private @timer_start() -> f64 +// CHECK-NEXT: func.func private @timer_end(f64) -> f64 +// CHECK-NEXT: } \ No newline at end of file