// Pass-pipeline IR dump for the dispatch function
// @main$async_dispatch_146_attention_2x20x1024x64xf16_generic.
//
// The text below contains consecutive "IR Dump" sections run together on a few
// very long lines (line breaks fall in the middle of statements):
//   1. Before LLVMGPUSelectLoweringStrategy  - module-level IR
//   2. After  LLVMGPUSelectLoweringStrategy  - module-level IR
//   3. Before LLVMGPULowerExecutableTarget   - function-level IR; it only
//      starts at the very end of this span and continues on the following lines.
//
// What sections 1 and 2 show (as printed here they appear textually identical):
//   * Four i32 values are read with hal.interface.constant.load[0..3] and cast
//     to index. %4/%5/%6 are the offset(...) operands of three subspans that all
//     name binding(0); %7 is the offset of binding(3). binding(1) and binding(2)
//     use the constant offset %c0.
//   * %14, %15, %16 are loaded as tensor<2x20x1024x64xf16>; %17 as
//     tensor<20x64xf16>; %18 is loaded with empty offsets/sizes/strides and is
//     consumed by the linalg.generic as the f32 block argument %in_3.
//   * iree_linalg_ext.attention takes %14, %15, %16 plus the f16 constant
//     1.25e-01 (%cst_1) and writes a tensor<2x20x1024x64xf16> (%21).
//   * The linalg.generic computes, per element,
//       (%in * %in_2) / truncf(%in_3)  ->  math.roundeven
//       ->  cmpf ult / select against -128.0  ->  cmpf ugt / select against 127.0
//       ->  arith.fptosi to i8.
//     Its output indexing map is (d0, d2, d1, d3), so the result type is
//     tensor<2x1024x20x64xi8>, which is stored to %13.
//
// NOTE(review): several types are printed with unbalanced or missing angle
// brackets (e.g. `!flow.dispatch.tensor>`, bare `tensor`, and
// `#iree_codegen.translation_info` without parameters). This suggests that
// angle-bracketed parameters were lost when the dump was captured - confirm
// against the original compiler output before relying on this text or trying
// to re-parse it.
// -----// IR Dump Before LLVMGPUSelectLoweringStrategy (iree-llvmgpu-select-lowering-strategy) //----- // module { func.func @main$async_dispatch_146_attention_2x20x1024x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant 1.270000e+02 : f16 %cst_0 = arith.constant -1.280000e+02 : f16 %cst_1 = arith.constant 1.250000e-01 : f16 %c0 = arith.constant 0 : index %0 = hal.interface.constant.load[0] : i32 %1 = hal.interface.constant.load[1] : i32 %2 = hal.interface.constant.load[2] : i32 %3 = hal.interface.constant.load[3] : i32 %4 = arith.index_castui %0 : i32 to index %5 = arith.index_castui %1 : i32 to index %6 = arith.index_castui %2 : i32 to index %7 = arith.index_castui %3 : i32 to index %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor> %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 20, 1024, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x20x1024x64xf16> %15 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [2, 20, 1024, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x20x1024x64xf16> %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [2, 20, 1024, 64], strides 
= [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x20x1024x64xf16> %17 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [20, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<20x64xf16> %18 = flow.dispatch.tensor.load %12, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor %19 = tensor.empty() : tensor<2x1024x20x64xi8> %20 = tensor.empty() : tensor<2x20x1024x64xf16> %21 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>]} ins(%14, %15, %16, %cst_1 : tensor<2x20x1024x64xf16>, tensor<2x20x1024x64xf16>, tensor<2x20x1024x64xf16>, f16) outs(%20 : tensor<2x20x1024x64xf16>) -> tensor<2x20x1024x64xf16> %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %17, %18 : tensor<2x20x1024x64xf16>, tensor<20x64xf16>, tensor) outs(%19 : tensor<2x1024x20x64xi8>) { ^bb0(%in: f16, %in_2: f16, %in_3: f32, %out: i8): %23 = arith.mulf %in, %in_2 : f16 %24 = arith.truncf %in_3 : f32 to f16 %25 = arith.divf %23, %24 : f16 %26 = math.roundeven %25 : f16 %27 = arith.cmpf ult, %26, %cst_0 : f16 %28 = arith.select %27, %cst_0, %26 : f16 %29 = arith.cmpf ugt, %28, %cst : f16 %30 = arith.select %29, %cst, %28 : f16 %31 = arith.fptosi %30 : f16 to i8 linalg.yield %31 : i8 } -> tensor<2x1024x20x64xi8> flow.dispatch.tensor.store %22, %13, offsets = [0, 0, 0, 0], sizes = [2, 1024, 20, 64], strides = [1, 1, 1, 1] : tensor<2x1024x20x64xi8> -> !flow.dispatch.tensor> return } } // -----// IR Dump After LLVMGPUSelectLoweringStrategy (iree-llvmgpu-select-lowering-strategy) //----- // module { func.func 
@main$async_dispatch_146_attention_2x20x1024x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info} { %cst = arith.constant 1.270000e+02 : f16 %cst_0 = arith.constant -1.280000e+02 : f16 %cst_1 = arith.constant 1.250000e-01 : f16 %c0 = arith.constant 0 : index %0 = hal.interface.constant.load[0] : i32 %1 = hal.interface.constant.load[1] : i32 %2 = hal.interface.constant.load[2] : i32 %3 = hal.interface.constant.load[3] : i32 %4 = arith.index_castui %0 : i32 to index %5 = arith.index_castui %1 : i32 to index %6 = arith.index_castui %2 : i32 to index %7 = arith.index_castui %3 : i32 to index %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor> %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 20, 1024, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x20x1024x64xf16> %15 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [2, 20, 1024, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x20x1024x64xf16> %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [2, 20, 1024, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x20x1024x64xf16> %17 = flow.dispatch.tensor.load %11, offsets = [0, 0], 
sizes = [20, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<20x64xf16> %18 = flow.dispatch.tensor.load %12, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor %19 = tensor.empty() : tensor<2x1024x20x64xi8> %20 = tensor.empty() : tensor<2x20x1024x64xf16> %21 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d3)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d5)>]} ins(%14, %15, %16, %cst_1 : tensor<2x20x1024x64xf16>, tensor<2x20x1024x64xf16>, tensor<2x20x1024x64xf16>, f16) outs(%20 : tensor<2x20x1024x64xf16>) -> tensor<2x20x1024x64xf16> %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %17, %18 : tensor<2x20x1024x64xf16>, tensor<20x64xf16>, tensor) outs(%19 : tensor<2x1024x20x64xi8>) { ^bb0(%in: f16, %in_2: f16, %in_3: f32, %out: i8): %23 = arith.mulf %in, %in_2 : f16 %24 = arith.truncf %in_3 : f32 to f16 %25 = arith.divf %23, %24 : f16 %26 = math.roundeven %25 : f16 %27 = arith.cmpf ult, %26, %cst_0 : f16 %28 = arith.select %27, %cst_0, %26 : f16 %29 = arith.cmpf ugt, %28, %cst : f16 %30 = arith.select %29, %cst, %28 : f16 %31 = arith.fptosi %30 : f16 to i8 linalg.yield %31 : i8 } -> tensor<2x1024x20x64xi8> flow.dispatch.tensor.store %22, %13, offsets = [0, 0, 0, 0], sizes = [2, 1024, 20, 64], strides = [1, 1, 1, 1] : tensor<2x1024x20x64xi8> -> !flow.dispatch.tensor> return } } // -----// IR Dump Before LLVMGPULowerExecutableTarget (iree-llvmgpu-lower-executable-target) //----- // func.func @main$async_dispatch_146_attention_2x20x1024x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info} { %c992 = 
arith.constant 992 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x16xf32> %c0 = arith.constant 0 : index %cst_1 = arith.constant dense<1.270000e+02> : vector<2x1x16xf16> %cst_2 = arith.constant dense<-1.280000e+02> : vector<2x1x16xf16> %cst_3 = arith.constant dense<1.000000e+00> : vector<2x1x16xf32> %cst_4 = arith.constant dense<-6.550400e+04> : vector<4x1x4xf32> %cst_5 = arith.constant dense<6.550400e+04> : vector<4x1x4xf32> %cst_6 = arith.constant dense<1.802980e-01> : vector<1x4x8xf16> %cst_7 = arith.constant dense<0.000000e+00> : vector<1xf32> %cst_8 = arith.constant dense<-3.40282347E+38> : vector<1xf32> %cst_9 = arith.constant dense<0.000000e+00> : vector<2x1x16xf32> %c32 = arith.constant 32 : index %cst_10 = arith.constant dense<0.000000e+00> : vector<1x4x8xf16> %c32_i32 = arith.constant 32 : i32 %c64_i32 = arith.constant 64 : i32 %cst_11 = arith.constant dense<0.000000e+00> : vector<4x1x4xf32> %cst_12 = arith.constant dense<0.000000e+00> : vector<2x4x4xf16> %cst_13 = arith.constant dense<0.000000e+00> : vector<2x1x16xf16> %cst_14 = arith.constant dense<0.000000e+00> : vector<16xf32> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y %thread_id_z = gpu.thread_id z %alloc = memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space> %subview = memref.subview %alloc[0, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<1x1x32x68xf16, #gpu.address_space> to memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> %alloc_15 = memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space> %subview_16 = memref.subview %alloc_15[0, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<1x1x32x68xf16, #gpu.address_space> to memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> %0 = hal.interface.constant.load[0] : i32 %1 = hal.interface.constant.load[1] : i32 %2 = hal.interface.constant.load[2] : i32 %3 = hal.interface.constant.load[3] : i32 %4 = arith.index_castui %0 : i32 to index 
%5 = arith.index_castui %1 : i32 to index %6 = arith.index_castui %2 : i32 to index %7 = arith.index_castui %3 : i32 to index %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %10, 1 : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<20x64xf16, #hal.descriptor_type> memref.assume_alignment %11, 64 : memref<20x64xf16, #hal.descriptor_type> %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref> memref.assume_alignment %12, 64 : memref> %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%7) : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %13, 1 : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_id_z = 
hal.interface.workgroup.id[2] : index %14 = affine.apply affine_map<()[s0, s1, s2] -> (s0 * 128 + s2 * 32 + (s1 floordiv 64) * 32)>()[%workgroup_id_z, %thread_id_x, %thread_id_y] %15 = affine.apply affine_map<()[s0] -> (s0 mod 32)>()[%thread_id_x] %16 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8)>()[%thread_id_x] %17 = arith.addi %14, %15 : index %18 = vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %16] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %19 = vector.insert_strided_slice %18, %cst_10 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %20 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8 + 16)>()[%thread_id_x] %21 = vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %20] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %22 = vector.insert_strided_slice %21, %19 {offsets = [0, 1, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %23 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8 + 32)>()[%thread_id_x] %24 = vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %23] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %25 = vector.insert_strided_slice %24, %22 {offsets = [0, 2, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %26 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8 + 48)>()[%thread_id_x] %27 = vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %26] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %28 = vector.insert_strided_slice %27, %25 {offsets = [0, 3, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %29 = arith.mulf %28, %cst_6 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>, 
#iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>], [#iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>]], __vector_layout_test_anchor_result_0 = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>} : vector<1x4x8xf16> %30 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z] %31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] %32 = vector.transfer_read %9[%workgroup_id_x, %workgroup_id_y, %30, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> %33 = vector.transfer_read %10[%workgroup_id_x, %workgroup_id_y, %30, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> vector.transfer_write %32, %subview_16[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> vector.transfer_write %33, %subview[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> %34:3 = scf.for %arg0 = %c0 to %c992 step %c32 iter_args(%arg1 = %cst_8, %arg2 = %cst_7, %arg3 = %cst_9) -> (vector<1xf32>, vector<1xf32>, vector<2x1x16xf32>) { %533 = arith.addi %arg0, %c32 : index %534 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 32 + s1 floordiv 8)>()[%533, %thread_id_x, %thread_id_y, %thread_id_z] %535 = vector.transfer_read %9[%workgroup_id_x, %workgroup_id_y, %534, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> %536 = 
vector.transfer_read %10[%workgroup_id_x, %workgroup_id_y, %534, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> gpu.barrier %537 = vector.load %subview_16[%c0, %c0, %15, %16] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %538 = vector.insert_strided_slice %537, %cst_10 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %539 = vector.load %subview_16[%c0, %c0, %15, %20] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %540 = vector.insert_strided_slice %539, %538 {offsets = [0, 1, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %541 = vector.load %subview_16[%c0, %c0, %15, %23] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %542 = vector.insert_strided_slice %541, %540 {offsets = [0, 2, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %543 = vector.load %subview_16[%c0, %c0, %15, %26] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %544 = vector.insert_strided_slice %543, %542 {offsets = [0, 3, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %545 = vector.extract_strided_slice %544 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %546 = vector.extract_strided_slice %544 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %547 = vector.extract_strided_slice %544 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %548 = vector.extract_strided_slice %544 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %549 = vector.extract_strided_slice %544 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to 
vector<1x1x4xf16> %550 = vector.extract_strided_slice %544 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %551 = vector.extract_strided_slice %544 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %552 = vector.extract_strided_slice %544 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %553 = vector.extract_strided_slice %29 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %554 = vector.extract_strided_slice %29 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %555 = vector.extract_strided_slice %29 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %556 = vector.extract_strided_slice %29 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %557 = vector.extract_strided_slice %29 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %558 = vector.extract_strided_slice %29 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %559 = vector.extract_strided_slice %29 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %560 = vector.extract_strided_slice %29 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %561 = vector.extract %545[0, 0] : vector<4xf16> from vector<1x1x4xf16> %562 = vector.extract %553[0, 0] : vector<4xf16> from vector<1x1x4xf16> %563 = amdgpu.mfma %561 * %562 + %cst_14 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %564 = vector.extract %546[0, 0] : vector<4xf16> from vector<1x1x4xf16> %565 = vector.extract %554[0, 
0] : vector<4xf16> from vector<1x1x4xf16> %566 = amdgpu.mfma %564 * %565 + %563 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %567 = vector.extract %547[0, 0] : vector<4xf16> from vector<1x1x4xf16> %568 = vector.extract %555[0, 0] : vector<4xf16> from vector<1x1x4xf16> %569 = amdgpu.mfma %567 * %568 + %566 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %570 = vector.extract %548[0, 0] : vector<4xf16> from vector<1x1x4xf16> %571 = vector.extract %556[0, 0] : vector<4xf16> from vector<1x1x4xf16> %572 = amdgpu.mfma %570 * %571 + %569 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %573 = vector.extract %549[0, 0] : vector<4xf16> from vector<1x1x4xf16> %574 = vector.extract %557[0, 0] : vector<4xf16> from vector<1x1x4xf16> %575 = amdgpu.mfma %573 * %574 + %572 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %576 = vector.extract %550[0, 0] : vector<4xf16> from vector<1x1x4xf16> %577 = vector.extract %558[0, 0] : vector<4xf16> from vector<1x1x4xf16> %578 = amdgpu.mfma %576 * %577 + %575 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %579 = vector.extract %551[0, 0] : vector<4xf16> from vector<1x1x4xf16> %580 = vector.extract %559[0, 0] : vector<4xf16> from vector<1x1x4xf16> %581 = amdgpu.mfma %579 * %580 + %578 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %582 = vector.extract %552[0, 0] : vector<4xf16> from vector<1x1x4xf16> %583 = vector.extract %560[0, 0] : vector<4xf16> from vector<1x1x4xf16> %584 = amdgpu.mfma %582 * %583 + %581 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, 
vector<16xf32> %585 = vector.broadcast %584 : vector<16xf32> to vector<1x1x16xf32> %586 = vector.extract %arg1[0] : f32 from vector<1xf32> %587 = vector.extract %584[0] : f32 from vector<16xf32> %588 = vector.insert %587, %cst_7 [0] : f32 into vector<1xf32> %589 = vector.extract %584[1] : f32 from vector<16xf32> %590 = vector.insert %589, %588 [0] : f32 into vector<1xf32> %591 = arith.maximumf %588, %590 : vector<1xf32> %592 = vector.extract %584[2] : f32 from vector<16xf32> %593 = vector.insert %592, %590 [0] : f32 into vector<1xf32> %594 = arith.maximumf %591, %593 : vector<1xf32> %595 = vector.extract %584[3] : f32 from vector<16xf32> %596 = vector.insert %595, %593 [0] : f32 into vector<1xf32> %597 = arith.maximumf %594, %596 : vector<1xf32> %598 = vector.extract %584[4] : f32 from vector<16xf32> %599 = vector.insert %598, %596 [0] : f32 into vector<1xf32> %600 = arith.maximumf %597, %599 : vector<1xf32> %601 = vector.extract %584[5] : f32 from vector<16xf32> %602 = vector.insert %601, %599 [0] : f32 into vector<1xf32> %603 = arith.maximumf %600, %602 : vector<1xf32> %604 = vector.extract %584[6] : f32 from vector<16xf32> %605 = vector.insert %604, %602 [0] : f32 into vector<1xf32> %606 = arith.maximumf %603, %605 : vector<1xf32> %607 = vector.extract %584[7] : f32 from vector<16xf32> %608 = vector.insert %607, %605 [0] : f32 into vector<1xf32> %609 = arith.maximumf %606, %608 : vector<1xf32> %610 = vector.extract %584[8] : f32 from vector<16xf32> %611 = vector.insert %610, %608 [0] : f32 into vector<1xf32> %612 = arith.maximumf %609, %611 : vector<1xf32> %613 = vector.extract %584[9] : f32 from vector<16xf32> %614 = vector.insert %613, %611 [0] : f32 into vector<1xf32> %615 = arith.maximumf %612, %614 : vector<1xf32> %616 = vector.extract %584[10] : f32 from vector<16xf32> %617 = vector.insert %616, %614 [0] : f32 into vector<1xf32> %618 = arith.maximumf %615, %617 : vector<1xf32> %619 = vector.extract %584[11] : f32 from vector<16xf32> %620 = vector.insert 
%619, %617 [0] : f32 into vector<1xf32> %621 = arith.maximumf %618, %620 : vector<1xf32> %622 = vector.extract %584[12] : f32 from vector<16xf32> %623 = vector.insert %622, %620 [0] : f32 into vector<1xf32> %624 = arith.maximumf %621, %623 : vector<1xf32> %625 = vector.extract %584[13] : f32 from vector<16xf32> %626 = vector.insert %625, %623 [0] : f32 into vector<1xf32> %627 = arith.maximumf %624, %626 : vector<1xf32> %628 = vector.extract %584[14] : f32 from vector<16xf32> %629 = vector.insert %628, %626 [0] : f32 into vector<1xf32> %630 = arith.maximumf %627, %629 : vector<1xf32> %631 = vector.extract %584[15] : f32 from vector<16xf32> %632 = vector.insert %631, %629 [0] : f32 into vector<1xf32> %633 = arith.maximumf %630, %632 : vector<1xf32> %634 = vector.bitcast %633 : vector<1xf32> to vector<1xi32> %635 = vector.extract %634[0] : i32 from vector<1xi32> %shuffleResult_19, %valid_20 = gpu.shuffle xor %635, %c32_i32, %c64_i32 : i32 %636 = vector.broadcast %shuffleResult_19 : i32 to vector<1xi32> %637 = vector.bitcast %636 : vector<1xi32> to vector<1xf32> %638 = arith.maximumf %637, %633 : vector<1xf32> %639 = vector.extract %638[0] : f32 from vector<1xf32> %640 = arith.maximumf %639, %586 : f32 %641 = vector.insert %640, %cst_7 [0] : f32 into vector<1xf32> %642 = vector.insert %640, %cst_0 [0, 0, 0] : f32 into vector<1x1x16xf32> %643 = vector.insert %640, %642 [0, 0, 1] : f32 into vector<1x1x16xf32> %644 = vector.insert %640, %643 [0, 0, 2] : f32 into vector<1x1x16xf32> %645 = vector.insert %640, %644 [0, 0, 3] : f32 into vector<1x1x16xf32> %646 = vector.insert %640, %645 [0, 0, 4] : f32 into vector<1x1x16xf32> %647 = vector.insert %640, %646 [0, 0, 5] : f32 into vector<1x1x16xf32> %648 = vector.insert %640, %647 [0, 0, 6] : f32 into vector<1x1x16xf32> %649 = vector.insert %640, %648 [0, 0, 7] : f32 into vector<1x1x16xf32> %650 = vector.insert %640, %649 [0, 0, 8] : f32 into vector<1x1x16xf32> %651 = vector.insert %640, %650 [0, 0, 9] : f32 into 
vector<1x1x16xf32> %652 = vector.insert %640, %651 [0, 0, 10] : f32 into vector<1x1x16xf32> %653 = vector.insert %640, %652 [0, 0, 11] : f32 into vector<1x1x16xf32> %654 = vector.insert %640, %653 [0, 0, 12] : f32 into vector<1x1x16xf32> %655 = vector.insert %640, %654 [0, 0, 13] : f32 into vector<1x1x16xf32> %656 = vector.insert %640, %655 [0, 0, 14] : f32 into vector<1x1x16xf32> %657 = vector.insert %640, %656 [0, 0, 15] : f32 into vector<1x1x16xf32> %658 = arith.subf %585, %657 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : vector<1x1x16xf32> %659 = math.exp2 %658 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : vector<1x1x16xf32> %660 = arith.subf %arg1, %641 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %661 = math.exp2 %660 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %662 = arith.mulf %661, %arg2 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %663 = vector.extract %662[0] : f32 from vector<1xf32> %664 = vector.extract %659[0, 0, 0] : f32 from vector<1x1x16xf32> %665 = vector.insert %664, 
%cst_7 [0] : f32 into vector<1xf32> %666 = vector.extract %659[0, 0, 1] : f32 from vector<1x1x16xf32> %667 = vector.insert %666, %665 [0] : f32 into vector<1xf32> %668 = arith.addf %665, %667 : vector<1xf32> %669 = vector.extract %659[0, 0, 2] : f32 from vector<1x1x16xf32> %670 = vector.insert %669, %667 [0] : f32 into vector<1xf32> %671 = arith.addf %668, %670 : vector<1xf32> %672 = vector.extract %659[0, 0, 3] : f32 from vector<1x1x16xf32> %673 = vector.insert %672, %670 [0] : f32 into vector<1xf32> %674 = arith.addf %671, %673 : vector<1xf32> %675 = vector.extract %659[0, 0, 4] : f32 from vector<1x1x16xf32> %676 = vector.insert %675, %673 [0] : f32 into vector<1xf32> %677 = arith.addf %674, %676 : vector<1xf32> %678 = vector.extract %659[0, 0, 5] : f32 from vector<1x1x16xf32> %679 = vector.insert %678, %676 [0] : f32 into vector<1xf32> %680 = arith.addf %677, %679 : vector<1xf32> %681 = vector.extract %659[0, 0, 6] : f32 from vector<1x1x16xf32> %682 = vector.insert %681, %679 [0] : f32 into vector<1xf32> %683 = arith.addf %680, %682 : vector<1xf32> %684 = vector.extract %659[0, 0, 7] : f32 from vector<1x1x16xf32> %685 = vector.insert %684, %682 [0] : f32 into vector<1xf32> %686 = arith.addf %683, %685 : vector<1xf32> %687 = vector.extract %659[0, 0, 8] : f32 from vector<1x1x16xf32> %688 = vector.insert %687, %685 [0] : f32 into vector<1xf32> %689 = arith.addf %686, %688 : vector<1xf32> %690 = vector.extract %659[0, 0, 9] : f32 from vector<1x1x16xf32> %691 = vector.insert %690, %688 [0] : f32 into vector<1xf32> %692 = arith.addf %689, %691 : vector<1xf32> %693 = vector.extract %659[0, 0, 10] : f32 from vector<1x1x16xf32> %694 = vector.insert %693, %691 [0] : f32 into vector<1xf32> %695 = arith.addf %692, %694 : vector<1xf32> %696 = vector.extract %659[0, 0, 11] : f32 from vector<1x1x16xf32> %697 = vector.insert %696, %694 [0] : f32 into vector<1xf32> %698 = arith.addf %695, %697 : vector<1xf32> %699 = vector.extract %659[0, 0, 12] : f32 from vector<1x1x16xf32> 
%700 = vector.insert %699, %697 [0] : f32 into vector<1xf32> %701 = arith.addf %698, %700 : vector<1xf32> %702 = vector.extract %659[0, 0, 13] : f32 from vector<1x1x16xf32> %703 = vector.insert %702, %700 [0] : f32 into vector<1xf32> %704 = arith.addf %701, %703 : vector<1xf32> %705 = vector.extract %659[0, 0, 14] : f32 from vector<1x1x16xf32> %706 = vector.insert %705, %703 [0] : f32 into vector<1xf32> %707 = arith.addf %704, %706 : vector<1xf32> %708 = vector.extract %659[0, 0, 15] : f32 from vector<1x1x16xf32> %709 = vector.insert %708, %706 [0] : f32 into vector<1xf32> %710 = arith.addf %707, %709 : vector<1xf32> %711 = vector.bitcast %710 : vector<1xf32> to vector<1xi32> %712 = vector.extract %711[0] : i32 from vector<1xi32> %shuffleResult_21, %valid_22 = gpu.shuffle xor %712, %c32_i32, %c64_i32 : i32 %713 = vector.broadcast %shuffleResult_21 : i32 to vector<1xi32> %714 = vector.bitcast %713 : vector<1xi32> to vector<1xf32> %715 = arith.addf %714, %710 : vector<1xf32> %716 = vector.extract %715[0] : f32 from vector<1xf32> %717 = arith.addf %716, %663 : f32 %718 = vector.insert %717, %cst_7 [0] : f32 into vector<1xf32> %719 = vector.extract_strided_slice %659 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %720 = vector.insert_strided_slice %719, %cst_11 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %721 = vector.extract_strided_slice %659 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %722 = vector.insert_strided_slice %721, %720 {offsets = [1, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %723 = vector.extract_strided_slice %659 {offsets = [0, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %724 = vector.insert_strided_slice %723, %722 {offsets = [2, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %725 = 
vector.extract_strided_slice %659 {offsets = [0, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %726 = vector.insert_strided_slice %725, %724 {offsets = [3, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %727 = arith.cmpf ogt, %726, %cst_5 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %728 = arith.cmpf olt, %726, %cst_4 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %729 = arith.select %727, %cst_5, %726 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, vector<4x1x4xf32> %730 = arith.select %728, %cst_4, %729 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, 
vector<4x1x4xf32> %731 = arith.truncf %730 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> to vector<4x1x4xf16> %732 = vector.extract %661[0] : f32 from vector<1xf32> %733 = vector.insert %732, %cst_9 [0, 0, 0] : f32 into vector<2x1x16xf32> %734 = vector.insert %732, %733 [0, 0, 1] : f32 into vector<2x1x16xf32> %735 = vector.insert %732, %734 [0, 0, 2] : f32 into vector<2x1x16xf32> %736 = vector.insert %732, %735 [0, 0, 3] : f32 into vector<2x1x16xf32> %737 = vector.insert %732, %736 [0, 0, 4] : f32 into vector<2x1x16xf32> %738 = vector.insert %732, %737 [0, 0, 5] : f32 into vector<2x1x16xf32> %739 = vector.insert %732, %738 [0, 0, 6] : f32 into vector<2x1x16xf32> %740 = vector.insert %732, %739 [0, 0, 7] : f32 into vector<2x1x16xf32> %741 = vector.insert %732, %740 [0, 0, 8] : f32 into vector<2x1x16xf32> %742 = vector.insert %732, %741 [0, 0, 9] : f32 into vector<2x1x16xf32> %743 = vector.insert %732, %742 [0, 0, 10] : f32 into vector<2x1x16xf32> %744 = vector.insert %732, %743 [0, 0, 11] : f32 into vector<2x1x16xf32> %745 = vector.insert %732, %744 [0, 0, 12] : f32 into vector<2x1x16xf32> %746 = vector.insert %732, %745 [0, 0, 13] : f32 into vector<2x1x16xf32> %747 = vector.insert %732, %746 [0, 0, 14] : f32 into vector<2x1x16xf32> %748 = vector.insert %732, %747 [0, 0, 15] : f32 into vector<2x1x16xf32> %749 = vector.insert %732, %748 [1, 0, 0] : f32 into vector<2x1x16xf32> %750 = vector.insert %732, %749 [1, 0, 1] : f32 into vector<2x1x16xf32> %751 = vector.insert %732, %750 [1, 0, 2] : f32 into vector<2x1x16xf32> %752 = vector.insert %732, %751 [1, 0, 3] : f32 into vector<2x1x16xf32> %753 = vector.insert %732, %752 [1, 0, 4] : f32 into vector<2x1x16xf32> %754 = vector.insert %732, %753 [1, 0, 5] : f32 into vector<2x1x16xf32> %755 = vector.insert %732, %754 [1, 0, 
6] : f32 into vector<2x1x16xf32> %756 = vector.insert %732, %755 [1, 0, 7] : f32 into vector<2x1x16xf32> %757 = vector.insert %732, %756 [1, 0, 8] : f32 into vector<2x1x16xf32> %758 = vector.insert %732, %757 [1, 0, 9] : f32 into vector<2x1x16xf32> %759 = vector.insert %732, %758 [1, 0, 10] : f32 into vector<2x1x16xf32> %760 = vector.insert %732, %759 [1, 0, 11] : f32 into vector<2x1x16xf32> %761 = vector.insert %732, %760 [1, 0, 12] : f32 into vector<2x1x16xf32> %762 = vector.insert %732, %761 [1, 0, 13] : f32 into vector<2x1x16xf32> %763 = vector.insert %732, %762 [1, 0, 14] : f32 into vector<2x1x16xf32> %764 = vector.insert %732, %763 [1, 0, 15] : f32 into vector<2x1x16xf32> %765 = arith.mulf %764, %arg3 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf32> %766 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4)>()[%thread_id_x] %767 = vector.load %subview[%c0, %c0, %766, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %768 = vector.insert_strided_slice %767, %cst_12 {offsets = [0, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %769 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 1)>()[%thread_id_x] %770 = vector.load %subview[%c0, %c0, %769, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %771 = vector.insert_strided_slice %770, %768 {offsets = [0, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %772 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 2)>()[%thread_id_x] %773 = vector.load %subview[%c0, %c0, %772, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 
1]>, #gpu.address_space>, vector<1xf16> %774 = vector.insert_strided_slice %773, %771 {offsets = [0, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %775 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 3)>()[%thread_id_x] %776 = vector.load %subview[%c0, %c0, %775, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %777 = vector.insert_strided_slice %776, %774 {offsets = [0, 0, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %778 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 2) floordiv 8)>()[%thread_id_x] %779 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 8 + 2) floordiv 8)>()[%thread_id_x] %780 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 32 + 8)>()[%thread_id_x] %781 = vector.load %subview[%778, %779, %780, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %782 = vector.insert_strided_slice %781, %777 {offsets = [0, 1, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %783 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32)>()[%thread_id_x] %784 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9) floordiv 32)>()[%thread_id_x] %785 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9)>()[%thread_id_x] %786 = vector.load %subview[%783, %784, %785, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %787 = vector.insert_strided_slice %786, %782 {offsets = [0, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %788 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16)>()[%thread_id_x] %789 = affine.apply 
affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 16 + 5) floordiv 16)>()[%thread_id_x] %790 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 32 + 10)>()[%thread_id_x] %791 = vector.load %subview[%788, %789, %790, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %792 = vector.insert_strided_slice %791, %787 {offsets = [0, 1, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %793 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32)>()[%thread_id_x] %794 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11) floordiv 32)>()[%thread_id_x] %795 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11)>()[%thread_id_x] %796 = vector.load %subview[%793, %794, %795, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %797 = vector.insert_strided_slice %796, %792 {offsets = [0, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %798 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 4) floordiv 8)>()[%thread_id_x] %799 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 8 + 4) floordiv 8)>()[%thread_id_x] %800 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 32 + 16)>()[%thread_id_x] %801 = vector.load %subview[%798, %799, %800, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %802 = vector.insert_strided_slice %801, %797 {offsets = [0, 2, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %803 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 
32)>()[%thread_id_x] %804 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17) floordiv 32)>()[%thread_id_x] %805 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17)>()[%thread_id_x] %806 = vector.load %subview[%803, %804, %805, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %807 = vector.insert_strided_slice %806, %802 {offsets = [0, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %808 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16)>()[%thread_id_x] %809 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 16 + 9) floordiv 16)>()[%thread_id_x] %810 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 32 + 18)>()[%thread_id_x] %811 = vector.load %subview[%808, %809, %810, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %812 = vector.insert_strided_slice %811, %807 {offsets = [0, 2, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %813 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32)>()[%thread_id_x] %814 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32) * 32 + 19) floordiv 32)>()[%thread_id_x] %815 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32) * 32 + 19)>()[%thread_id_x] %816 = vector.load %subview[%813, %814, %815, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %817 = vector.insert_strided_slice %816, %812 {offsets = [0, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %818 = 
affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 6) floordiv 8)>()[%thread_id_x] %819 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 8 + 6) floordiv 8)>()[%thread_id_x] %820 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 32 + 24)>()[%thread_id_x] %821 = vector.load %subview[%818, %819, %820, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %822 = vector.insert_strided_slice %821, %817 {offsets = [0, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %823 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32)>()[%thread_id_x] %824 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25) floordiv 32)>()[%thread_id_x] %825 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25)>()[%thread_id_x] %826 = vector.load %subview[%823, %824, %825, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %827 = vector.insert_strided_slice %826, %822 {offsets = [0, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %828 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16)>()[%thread_id_x] %829 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 16 + 13) floordiv 16)>()[%thread_id_x] %830 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 32 + 26)>()[%thread_id_x] %831 = vector.load %subview[%828, %829, %830, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %832 = vector.insert_strided_slice %831, %827 {offsets = [0, 3, 2], strides = [1]} : 
vector<1xf16> into vector<2x4x4xf16> %833 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32)>()[%thread_id_x] %834 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27) floordiv 32)>()[%thread_id_x] %835 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27)>()[%thread_id_x] %836 = vector.load %subview[%833, %834, %835, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %837 = vector.insert_strided_slice %836, %832 {offsets = [0, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %838 = affine.apply affine_map<()[s0] -> (s0 mod 32 + 32)>()[%thread_id_x] %839 = vector.load %subview[%c0, %c0, %766, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %840 = vector.insert_strided_slice %839, %837 {offsets = [1, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %841 = vector.load %subview[%c0, %c0, %769, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %842 = vector.insert_strided_slice %841, %840 {offsets = [1, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %843 = vector.load %subview[%c0, %c0, %772, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %844 = vector.insert_strided_slice %843, %842 {offsets = [1, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %845 = vector.load %subview[%c0, %c0, %775, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %846 = vector.insert_strided_slice %845, %844 {offsets = [1, 0, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %847 = vector.load %subview[%778, %779, %780, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %848 = 
vector.insert_strided_slice %847, %846 {offsets = [1, 1, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %849 = vector.load %subview[%783, %784, %785, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %850 = vector.insert_strided_slice %849, %848 {offsets = [1, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %851 = vector.load %subview[%788, %789, %790, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %852 = vector.insert_strided_slice %851, %850 {offsets = [1, 1, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %853 = vector.load %subview[%793, %794, %795, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %854 = vector.insert_strided_slice %853, %852 {offsets = [1, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %855 = vector.load %subview[%798, %799, %800, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %856 = vector.insert_strided_slice %855, %854 {offsets = [1, 2, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %857 = vector.load %subview[%803, %804, %805, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %858 = vector.insert_strided_slice %857, %856 {offsets = [1, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %859 = vector.load %subview[%808, %809, %810, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %860 = vector.insert_strided_slice %859, %858 {offsets = [1, 2, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %861 = vector.load %subview[%813, %814, %815, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %862 = vector.insert_strided_slice %861, %860 {offsets = [1, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %863 = vector.load %subview[%818, %819, 
%820, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %864 = vector.insert_strided_slice %863, %862 {offsets = [1, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %865 = vector.load %subview[%823, %824, %825, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %866 = vector.insert_strided_slice %865, %864 {offsets = [1, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %867 = vector.load %subview[%828, %829, %830, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %868 = vector.insert_strided_slice %867, %866 {offsets = [1, 3, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %869 = vector.load %subview[%833, %834, %835, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %870 = vector.insert_strided_slice %869, %868 {offsets = [1, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %871 = vector.extract %765[0, 0] : vector<16xf32> from vector<2x1x16xf32> %872 = vector.extract %870[0, 0] : vector<4xf16> from vector<2x4x4xf16> %873 = vector.extract %731[0, 0] : vector<4xf16> from vector<4x1x4xf16> %874 = amdgpu.mfma %872 * %873 + %871 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %875 = vector.extract %870[0, 1] : vector<4xf16> from vector<2x4x4xf16> %876 = vector.extract %731[1, 0] : vector<4xf16> from vector<4x1x4xf16> %877 = amdgpu.mfma %875 * %876 + %874 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %878 = vector.extract %870[0, 2] : vector<4xf16> from vector<2x4x4xf16> %879 = vector.extract %731[2, 0] : vector<4xf16> from vector<4x1x4xf16> %880 = amdgpu.mfma %878 * %879 + %877 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %881 = 
vector.extract %870[0, 3] : vector<4xf16> from vector<2x4x4xf16> %882 = vector.extract %731[3, 0] : vector<4xf16> from vector<4x1x4xf16> %883 = amdgpu.mfma %881 * %882 + %880 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %884 = vector.insert %883, %cst_9 [0, 0] : vector<16xf32> into vector<2x1x16xf32> %885 = vector.extract %765[1, 0] : vector<16xf32> from vector<2x1x16xf32> %886 = vector.extract %870[1, 0] : vector<4xf16> from vector<2x4x4xf16> %887 = amdgpu.mfma %886 * %873 + %885 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %888 = vector.extract %870[1, 1] : vector<4xf16> from vector<2x4x4xf16> %889 = amdgpu.mfma %888 * %876 + %887 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %890 = vector.extract %870[1, 2] : vector<4xf16> from vector<2x4x4xf16> %891 = amdgpu.mfma %890 * %879 + %889 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %892 = vector.extract %870[1, 3] : vector<4xf16> from vector<2x4x4xf16> %893 = amdgpu.mfma %892 * %882 + %891 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %894 = vector.insert %893, %884 [1, 0] : vector<16xf32> into vector<2x1x16xf32> gpu.barrier vector.transfer_write %535, %subview_16[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> vector.transfer_write %536, %subview[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> scf.yield %641, %718, %894 : vector<1xf32>, vector<1xf32>, vector<2x1x16xf32> } gpu.barrier %35 = vector.load %subview_16[%c0, %c0, %15, %16] : 
memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %36 = vector.insert_strided_slice %35, %cst_10 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %37 = vector.load %subview_16[%c0, %c0, %15, %20] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %38 = vector.insert_strided_slice %37, %36 {offsets = [0, 1, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %39 = vector.load %subview_16[%c0, %c0, %15, %23] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %40 = vector.insert_strided_slice %39, %38 {offsets = [0, 2, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %41 = vector.load %subview_16[%c0, %c0, %15, %26] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %42 = vector.insert_strided_slice %41, %40 {offsets = [0, 3, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %43 = vector.extract_strided_slice %42 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %44 = vector.extract_strided_slice %42 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %45 = vector.extract_strided_slice %42 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %46 = vector.extract_strided_slice %42 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %47 = vector.extract_strided_slice %42 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %48 = vector.extract_strided_slice %42 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %49 = vector.extract_strided_slice %42 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %50 = 
vector.extract_strided_slice %42 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %51 = vector.extract_strided_slice %29 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %52 = vector.extract_strided_slice %29 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %53 = vector.extract_strided_slice %29 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %54 = vector.extract_strided_slice %29 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %55 = vector.extract_strided_slice %29 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %56 = vector.extract_strided_slice %29 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %57 = vector.extract_strided_slice %29 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %58 = vector.extract_strided_slice %29 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %59 = vector.extract %43[0, 0] : vector<4xf16> from vector<1x1x4xf16> %60 = vector.extract %51[0, 0] : vector<4xf16> from vector<1x1x4xf16> %61 = amdgpu.mfma %59 * %60 + %cst_14 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %62 = vector.extract %44[0, 0] : vector<4xf16> from vector<1x1x4xf16> %63 = vector.extract %52[0, 0] : vector<4xf16> from vector<1x1x4xf16> %64 = amdgpu.mfma %62 * %63 + %61 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %65 = vector.extract %45[0, 0] : vector<4xf16> from vector<1x1x4xf16> %66 = vector.extract %53[0, 0] : vector<4xf16> from 
vector<1x1x4xf16> %67 = amdgpu.mfma %65 * %66 + %64 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %68 = vector.extract %46[0, 0] : vector<4xf16> from vector<1x1x4xf16> %69 = vector.extract %54[0, 0] : vector<4xf16> from vector<1x1x4xf16> %70 = amdgpu.mfma %68 * %69 + %67 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %71 = vector.extract %47[0, 0] : vector<4xf16> from vector<1x1x4xf16> %72 = vector.extract %55[0, 0] : vector<4xf16> from vector<1x1x4xf16> %73 = amdgpu.mfma %71 * %72 + %70 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %74 = vector.extract %48[0, 0] : vector<4xf16> from vector<1x1x4xf16> %75 = vector.extract %56[0, 0] : vector<4xf16> from vector<1x1x4xf16> %76 = amdgpu.mfma %74 * %75 + %73 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %77 = vector.extract %49[0, 0] : vector<4xf16> from vector<1x1x4xf16> %78 = vector.extract %57[0, 0] : vector<4xf16> from vector<1x1x4xf16> %79 = amdgpu.mfma %77 * %78 + %76 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %80 = vector.extract %50[0, 0] : vector<4xf16> from vector<1x1x4xf16> %81 = vector.extract %58[0, 0] : vector<4xf16> from vector<1x1x4xf16> %82 = amdgpu.mfma %80 * %81 + %79 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %83 = vector.broadcast %82 : vector<16xf32> to vector<1x1x16xf32> %84 = vector.extract %34#0[0] : f32 from vector<1xf32> %85 = vector.extract %82[0] : f32 from vector<16xf32> %86 = vector.insert %85, %cst_7 [0] : f32 into vector<1xf32> %87 = vector.extract %82[1] : f32 from vector<16xf32> %88 = vector.insert %87, %86 [0] : f32 into vector<1xf32> %89 = 
arith.maximumf %86, %88 : vector<1xf32> %90 = vector.extract %82[2] : f32 from vector<16xf32> %91 = vector.insert %90, %88 [0] : f32 into vector<1xf32> %92 = arith.maximumf %89, %91 : vector<1xf32> %93 = vector.extract %82[3] : f32 from vector<16xf32> %94 = vector.insert %93, %91 [0] : f32 into vector<1xf32> %95 = arith.maximumf %92, %94 : vector<1xf32> %96 = vector.extract %82[4] : f32 from vector<16xf32> %97 = vector.insert %96, %94 [0] : f32 into vector<1xf32> %98 = arith.maximumf %95, %97 : vector<1xf32> %99 = vector.extract %82[5] : f32 from vector<16xf32> %100 = vector.insert %99, %97 [0] : f32 into vector<1xf32> %101 = arith.maximumf %98, %100 : vector<1xf32> %102 = vector.extract %82[6] : f32 from vector<16xf32> %103 = vector.insert %102, %100 [0] : f32 into vector<1xf32> %104 = arith.maximumf %101, %103 : vector<1xf32> %105 = vector.extract %82[7] : f32 from vector<16xf32> %106 = vector.insert %105, %103 [0] : f32 into vector<1xf32> %107 = arith.maximumf %104, %106 : vector<1xf32> %108 = vector.extract %82[8] : f32 from vector<16xf32> %109 = vector.insert %108, %106 [0] : f32 into vector<1xf32> %110 = arith.maximumf %107, %109 : vector<1xf32> %111 = vector.extract %82[9] : f32 from vector<16xf32> %112 = vector.insert %111, %109 [0] : f32 into vector<1xf32> %113 = arith.maximumf %110, %112 : vector<1xf32> %114 = vector.extract %82[10] : f32 from vector<16xf32> %115 = vector.insert %114, %112 [0] : f32 into vector<1xf32> %116 = arith.maximumf %113, %115 : vector<1xf32> %117 = vector.extract %82[11] : f32 from vector<16xf32> %118 = vector.insert %117, %115 [0] : f32 into vector<1xf32> %119 = arith.maximumf %116, %118 : vector<1xf32> %120 = vector.extract %82[12] : f32 from vector<16xf32> %121 = vector.insert %120, %118 [0] : f32 into vector<1xf32> %122 = arith.maximumf %119, %121 : vector<1xf32> %123 = vector.extract %82[13] : f32 from vector<16xf32> %124 = vector.insert %123, %121 [0] : f32 into vector<1xf32> %125 = arith.maximumf %122, %124 : vector<1xf32> 
%126 = vector.extract %82[14] : f32 from vector<16xf32> %127 = vector.insert %126, %124 [0] : f32 into vector<1xf32> %128 = arith.maximumf %125, %127 : vector<1xf32> %129 = vector.extract %82[15] : f32 from vector<16xf32> %130 = vector.insert %129, %127 [0] : f32 into vector<1xf32> %131 = arith.maximumf %128, %130 : vector<1xf32> %132 = vector.bitcast %131 : vector<1xf32> to vector<1xi32> %133 = vector.extract %132[0] : i32 from vector<1xi32> %shuffleResult, %valid = gpu.shuffle xor %133, %c32_i32, %c64_i32 : i32 %134 = vector.broadcast %shuffleResult : i32 to vector<1xi32> %135 = vector.bitcast %134 : vector<1xi32> to vector<1xf32> %136 = arith.maximumf %135, %131 : vector<1xf32> %137 = vector.extract %136[0] : f32 from vector<1xf32> %138 = arith.maximumf %137, %84 : f32 %139 = vector.insert %138, %cst_7 [0] : f32 into vector<1xf32> %140 = vector.insert %138, %cst_0 [0, 0, 0] : f32 into vector<1x1x16xf32> %141 = vector.insert %138, %140 [0, 0, 1] : f32 into vector<1x1x16xf32> %142 = vector.insert %138, %141 [0, 0, 2] : f32 into vector<1x1x16xf32> %143 = vector.insert %138, %142 [0, 0, 3] : f32 into vector<1x1x16xf32> %144 = vector.insert %138, %143 [0, 0, 4] : f32 into vector<1x1x16xf32> %145 = vector.insert %138, %144 [0, 0, 5] : f32 into vector<1x1x16xf32> %146 = vector.insert %138, %145 [0, 0, 6] : f32 into vector<1x1x16xf32> %147 = vector.insert %138, %146 [0, 0, 7] : f32 into vector<1x1x16xf32> %148 = vector.insert %138, %147 [0, 0, 8] : f32 into vector<1x1x16xf32> %149 = vector.insert %138, %148 [0, 0, 9] : f32 into vector<1x1x16xf32> %150 = vector.insert %138, %149 [0, 0, 10] : f32 into vector<1x1x16xf32> %151 = vector.insert %138, %150 [0, 0, 11] : f32 into vector<1x1x16xf32> %152 = vector.insert %138, %151 [0, 0, 12] : f32 into vector<1x1x16xf32> %153 = vector.insert %138, %152 [0, 0, 13] : f32 into vector<1x1x16xf32> %154 = vector.insert %138, %153 [0, 0, 14] : f32 into vector<1x1x16xf32> %155 = vector.insert %138, %154 [0, 0, 15] : f32 into 
vector<1x1x16xf32> %156 = arith.subf %83, %155 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : vector<1x1x16xf32> %157 = math.exp2 %156 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : vector<1x1x16xf32> %158 = arith.subf %34#0, %139 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %159 = math.exp2 %158 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %160 = arith.mulf %159, %34#1 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %161 = vector.extract %160[0] : f32 from vector<1xf32> %162 = vector.extract %157[0, 0, 0] : f32 from vector<1x1x16xf32> %163 = vector.insert %162, %cst_7 [0] : f32 into vector<1xf32> %164 = vector.extract %157[0, 0, 1] : f32 from vector<1x1x16xf32> %165 = vector.insert %164, %163 [0] : f32 into vector<1xf32> %166 = arith.addf %163, %165 : vector<1xf32> %167 = vector.extract %157[0, 0, 2] : f32 from vector<1x1x16xf32> %168 = vector.insert %167, %165 [0] : f32 into vector<1xf32> %169 = arith.addf %166, %168 : vector<1xf32> %170 = vector.extract %157[0, 0, 3] : f32 from 
vector<1x1x16xf32> %171 = vector.insert %170, %168 [0] : f32 into vector<1xf32> %172 = arith.addf %169, %171 : vector<1xf32> %173 = vector.extract %157[0, 0, 4] : f32 from vector<1x1x16xf32> %174 = vector.insert %173, %171 [0] : f32 into vector<1xf32> %175 = arith.addf %172, %174 : vector<1xf32> %176 = vector.extract %157[0, 0, 5] : f32 from vector<1x1x16xf32> %177 = vector.insert %176, %174 [0] : f32 into vector<1xf32> %178 = arith.addf %175, %177 : vector<1xf32> %179 = vector.extract %157[0, 0, 6] : f32 from vector<1x1x16xf32> %180 = vector.insert %179, %177 [0] : f32 into vector<1xf32> %181 = arith.addf %178, %180 : vector<1xf32> %182 = vector.extract %157[0, 0, 7] : f32 from vector<1x1x16xf32> %183 = vector.insert %182, %180 [0] : f32 into vector<1xf32> %184 = arith.addf %181, %183 : vector<1xf32> %185 = vector.extract %157[0, 0, 8] : f32 from vector<1x1x16xf32> %186 = vector.insert %185, %183 [0] : f32 into vector<1xf32> %187 = arith.addf %184, %186 : vector<1xf32> %188 = vector.extract %157[0, 0, 9] : f32 from vector<1x1x16xf32> %189 = vector.insert %188, %186 [0] : f32 into vector<1xf32> %190 = arith.addf %187, %189 : vector<1xf32> %191 = vector.extract %157[0, 0, 10] : f32 from vector<1x1x16xf32> %192 = vector.insert %191, %189 [0] : f32 into vector<1xf32> %193 = arith.addf %190, %192 : vector<1xf32> %194 = vector.extract %157[0, 0, 11] : f32 from vector<1x1x16xf32> %195 = vector.insert %194, %192 [0] : f32 into vector<1xf32> %196 = arith.addf %193, %195 : vector<1xf32> %197 = vector.extract %157[0, 0, 12] : f32 from vector<1x1x16xf32> %198 = vector.insert %197, %195 [0] : f32 into vector<1xf32> %199 = arith.addf %196, %198 : vector<1xf32> %200 = vector.extract %157[0, 0, 13] : f32 from vector<1x1x16xf32> %201 = vector.insert %200, %198 [0] : f32 into vector<1xf32> %202 = arith.addf %199, %201 : vector<1xf32> %203 = vector.extract %157[0, 0, 14] : f32 from vector<1x1x16xf32> %204 = vector.insert %203, %201 [0] : f32 into vector<1xf32> %205 = arith.addf 
%202, %204 : vector<1xf32> %206 = vector.extract %157[0, 0, 15] : f32 from vector<1x1x16xf32> %207 = vector.insert %206, %204 [0] : f32 into vector<1xf32> %208 = arith.addf %205, %207 : vector<1xf32> %209 = vector.bitcast %208 : vector<1xf32> to vector<1xi32> %210 = vector.extract %209[0] : i32 from vector<1xi32> %shuffleResult_17, %valid_18 = gpu.shuffle xor %210, %c32_i32, %c64_i32 : i32 %211 = vector.broadcast %shuffleResult_17 : i32 to vector<1xi32> %212 = vector.bitcast %211 : vector<1xi32> to vector<1xf32> %213 = arith.addf %212, %208 : vector<1xf32> %214 = vector.extract %213[0] : f32 from vector<1xf32> %215 = arith.addf %214, %161 : f32 %216 = vector.extract_strided_slice %157 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %217 = vector.insert_strided_slice %216, %cst_11 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %218 = vector.extract_strided_slice %157 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %219 = vector.insert_strided_slice %218, %217 {offsets = [1, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %220 = vector.extract_strided_slice %157 {offsets = [0, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %221 = vector.insert_strided_slice %220, %219 {offsets = [2, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %222 = vector.extract_strided_slice %157 {offsets = [0, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %223 = vector.insert_strided_slice %222, %221 {offsets = [3, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %224 = arith.cmpf ogt, %223, %cst_5 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, 
LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %225 = arith.cmpf olt, %223, %cst_4 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %226 = arith.select %224, %cst_5, %223 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, vector<4x1x4xf32> %227 = arith.select %225, %cst_4, %226 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, vector<4x1x4xf32> %228 = arith.truncf %227 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> to vector<4x1x4xf16> %229 = vector.extract %159[0] : f32 from vector<1xf32> %230 = vector.insert %229, %cst_9 [0, 0, 0] : f32 into vector<2x1x16xf32> %231 = vector.insert %229, %230 [0, 0, 1] : f32 into 
vector<2x1x16xf32> %232 = vector.insert %229, %231 [0, 0, 2] : f32 into vector<2x1x16xf32> %233 = vector.insert %229, %232 [0, 0, 3] : f32 into vector<2x1x16xf32> %234 = vector.insert %229, %233 [0, 0, 4] : f32 into vector<2x1x16xf32> %235 = vector.insert %229, %234 [0, 0, 5] : f32 into vector<2x1x16xf32> %236 = vector.insert %229, %235 [0, 0, 6] : f32 into vector<2x1x16xf32> %237 = vector.insert %229, %236 [0, 0, 7] : f32 into vector<2x1x16xf32> %238 = vector.insert %229, %237 [0, 0, 8] : f32 into vector<2x1x16xf32> %239 = vector.insert %229, %238 [0, 0, 9] : f32 into vector<2x1x16xf32> %240 = vector.insert %229, %239 [0, 0, 10] : f32 into vector<2x1x16xf32> %241 = vector.insert %229, %240 [0, 0, 11] : f32 into vector<2x1x16xf32> %242 = vector.insert %229, %241 [0, 0, 12] : f32 into vector<2x1x16xf32> %243 = vector.insert %229, %242 [0, 0, 13] : f32 into vector<2x1x16xf32> %244 = vector.insert %229, %243 [0, 0, 14] : f32 into vector<2x1x16xf32> %245 = vector.insert %229, %244 [0, 0, 15] : f32 into vector<2x1x16xf32> %246 = vector.insert %229, %245 [1, 0, 0] : f32 into vector<2x1x16xf32> %247 = vector.insert %229, %246 [1, 0, 1] : f32 into vector<2x1x16xf32> %248 = vector.insert %229, %247 [1, 0, 2] : f32 into vector<2x1x16xf32> %249 = vector.insert %229, %248 [1, 0, 3] : f32 into vector<2x1x16xf32> %250 = vector.insert %229, %249 [1, 0, 4] : f32 into vector<2x1x16xf32> %251 = vector.insert %229, %250 [1, 0, 5] : f32 into vector<2x1x16xf32> %252 = vector.insert %229, %251 [1, 0, 6] : f32 into vector<2x1x16xf32> %253 = vector.insert %229, %252 [1, 0, 7] : f32 into vector<2x1x16xf32> %254 = vector.insert %229, %253 [1, 0, 8] : f32 into vector<2x1x16xf32> %255 = vector.insert %229, %254 [1, 0, 9] : f32 into vector<2x1x16xf32> %256 = vector.insert %229, %255 [1, 0, 10] : f32 into vector<2x1x16xf32> %257 = vector.insert %229, %256 [1, 0, 11] : f32 into vector<2x1x16xf32> %258 = vector.insert %229, %257 [1, 0, 12] : f32 into vector<2x1x16xf32> %259 = vector.insert %229, 
%258 [1, 0, 13] : f32 into vector<2x1x16xf32> %260 = vector.insert %229, %259 [1, 0, 14] : f32 into vector<2x1x16xf32> %261 = vector.insert %229, %260 [1, 0, 15] : f32 into vector<2x1x16xf32> %262 = arith.mulf %261, %34#2 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf32> %263 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4)>()[%thread_id_x] %264 = vector.load %subview[%c0, %c0, %263, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %265 = vector.insert_strided_slice %264, %cst_12 {offsets = [0, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %266 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 1)>()[%thread_id_x] %267 = vector.load %subview[%c0, %c0, %266, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %268 = vector.insert_strided_slice %267, %265 {offsets = [0, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %269 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 2)>()[%thread_id_x] %270 = vector.load %subview[%c0, %c0, %269, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %271 = vector.insert_strided_slice %270, %268 {offsets = [0, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %272 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 3)>()[%thread_id_x] %273 = vector.load %subview[%c0, %c0, %272, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %274 = vector.insert_strided_slice %273, %271 {offsets = [0, 0, 3], strides = [1]} : vector<1xf16> into 
vector<2x4x4xf16> %275 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 2) floordiv 8)>()[%thread_id_x] %276 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 8 + 2) floordiv 8)>()[%thread_id_x] %277 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 32 + 8)>()[%thread_id_x] %278 = vector.load %subview[%275, %276, %277, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %279 = vector.insert_strided_slice %278, %274 {offsets = [0, 1, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %280 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32)>()[%thread_id_x] %281 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9) floordiv 32)>()[%thread_id_x] %282 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9)>()[%thread_id_x] %283 = vector.load %subview[%280, %281, %282, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %284 = vector.insert_strided_slice %283, %279 {offsets = [0, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %285 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16)>()[%thread_id_x] %286 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 16 + 5) floordiv 16)>()[%thread_id_x] %287 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 32 + 10)>()[%thread_id_x] %288 = vector.load %subview[%285, %286, %287, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %289 = vector.insert_strided_slice %288, %284 {offsets = [0, 1, 2], 
strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %290 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32)>()[%thread_id_x] %291 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11) floordiv 32)>()[%thread_id_x] %292 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11)>()[%thread_id_x] %293 = vector.load %subview[%290, %291, %292, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %294 = vector.insert_strided_slice %293, %289 {offsets = [0, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %295 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 4) floordiv 8)>()[%thread_id_x] %296 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 8 + 4) floordiv 8)>()[%thread_id_x] %297 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 32 + 16)>()[%thread_id_x] %298 = vector.load %subview[%295, %296, %297, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %299 = vector.insert_strided_slice %298, %294 {offsets = [0, 2, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %300 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32)>()[%thread_id_x] %301 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17) floordiv 32)>()[%thread_id_x] %302 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17)>()[%thread_id_x] %303 = vector.load %subview[%300, %301, %302, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %304 = 
vector.insert_strided_slice %303, %299 {offsets = [0, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %305 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16)>()[%thread_id_x] %306 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 16 + 9) floordiv 16)>()[%thread_id_x] %307 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 32 + 18)>()[%thread_id_x] %308 = vector.load %subview[%305, %306, %307, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %309 = vector.insert_strided_slice %308, %304 {offsets = [0, 2, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %310 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32)>()[%thread_id_x] %311 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32) * 32 + 19) floordiv 32)>()[%thread_id_x] %312 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32) * 32 + 19)>()[%thread_id_x] %313 = vector.load %subview[%310, %311, %312, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %314 = vector.insert_strided_slice %313, %309 {offsets = [0, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %315 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 6) floordiv 8)>()[%thread_id_x] %316 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 8 + 6) floordiv 8)>()[%thread_id_x] %317 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 32 + 24)>()[%thread_id_x] %318 = vector.load %subview[%315, %316, %317, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, 
#gpu.address_space>, vector<1xf16> %319 = vector.insert_strided_slice %318, %314 {offsets = [0, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %320 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32)>()[%thread_id_x] %321 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25) floordiv 32)>()[%thread_id_x] %322 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25)>()[%thread_id_x] %323 = vector.load %subview[%320, %321, %322, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %324 = vector.insert_strided_slice %323, %319 {offsets = [0, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %325 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16)>()[%thread_id_x] %326 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 16 + 13) floordiv 16)>()[%thread_id_x] %327 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 32 + 26)>()[%thread_id_x] %328 = vector.load %subview[%325, %326, %327, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %329 = vector.insert_strided_slice %328, %324 {offsets = [0, 3, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %330 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32)>()[%thread_id_x] %331 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27) floordiv 32)>()[%thread_id_x] %332 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27)>()[%thread_id_x] %333 = vector.load 
%subview[%330, %331, %332, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %334 = vector.insert_strided_slice %333, %329 {offsets = [0, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %335 = affine.apply affine_map<()[s0] -> (s0 mod 32 + 32)>()[%thread_id_x] %336 = vector.load %subview[%c0, %c0, %263, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %337 = vector.insert_strided_slice %336, %334 {offsets = [1, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %338 = vector.load %subview[%c0, %c0, %266, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %339 = vector.insert_strided_slice %338, %337 {offsets = [1, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %340 = vector.load %subview[%c0, %c0, %269, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %341 = vector.insert_strided_slice %340, %339 {offsets = [1, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %342 = vector.load %subview[%c0, %c0, %272, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %343 = vector.insert_strided_slice %342, %341 {offsets = [1, 0, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %344 = vector.load %subview[%275, %276, %277, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %345 = vector.insert_strided_slice %344, %343 {offsets = [1, 1, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %346 = vector.load %subview[%280, %281, %282, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %347 = vector.insert_strided_slice %346, %345 {offsets = [1, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %348 = vector.load %subview[%285, %286, %287, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, 
#gpu.address_space>, vector<1xf16> %349 = vector.insert_strided_slice %348, %347 {offsets = [1, 1, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %350 = vector.load %subview[%290, %291, %292, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %351 = vector.insert_strided_slice %350, %349 {offsets = [1, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %352 = vector.load %subview[%295, %296, %297, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %353 = vector.insert_strided_slice %352, %351 {offsets = [1, 2, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %354 = vector.load %subview[%300, %301, %302, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %355 = vector.insert_strided_slice %354, %353 {offsets = [1, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %356 = vector.load %subview[%305, %306, %307, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %357 = vector.insert_strided_slice %356, %355 {offsets = [1, 2, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %358 = vector.load %subview[%310, %311, %312, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %359 = vector.insert_strided_slice %358, %357 {offsets = [1, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %360 = vector.load %subview[%315, %316, %317, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %361 = vector.insert_strided_slice %360, %359 {offsets = [1, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %362 = vector.load %subview[%320, %321, %322, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %363 = vector.insert_strided_slice %362, %361 {offsets = [1, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> 
%364 = vector.load %subview[%325, %326, %327, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %365 = vector.insert_strided_slice %364, %363 {offsets = [1, 3, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %366 = vector.load %subview[%330, %331, %332, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %367 = vector.insert_strided_slice %366, %365 {offsets = [1, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %368 = vector.extract %262[0, 0] : vector<16xf32> from vector<2x1x16xf32> %369 = vector.extract %367[0, 0] : vector<4xf16> from vector<2x4x4xf16> %370 = vector.extract %228[0, 0] : vector<4xf16> from vector<4x1x4xf16> %371 = amdgpu.mfma %369 * %370 + %368 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %372 = vector.extract %367[0, 1] : vector<4xf16> from vector<2x4x4xf16> %373 = vector.extract %228[1, 0] : vector<4xf16> from vector<4x1x4xf16> %374 = amdgpu.mfma %372 * %373 + %371 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %375 = vector.extract %367[0, 2] : vector<4xf16> from vector<2x4x4xf16> %376 = vector.extract %228[2, 0] : vector<4xf16> from vector<4x1x4xf16> %377 = amdgpu.mfma %375 * %376 + %374 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %378 = vector.extract %367[0, 3] : vector<4xf16> from vector<2x4x4xf16> %379 = vector.extract %228[3, 0] : vector<4xf16> from vector<4x1x4xf16> %380 = amdgpu.mfma %378 * %379 + %377 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %381 = vector.insert %380, %cst_9 [0, 0] : vector<16xf32> into vector<2x1x16xf32> %382 = vector.extract %262[1, 0] : vector<16xf32> from vector<2x1x16xf32> %383 = vector.extract %367[1, 0] : 
vector<4xf16> from vector<2x4x4xf16> %384 = amdgpu.mfma %383 * %370 + %382 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %385 = vector.extract %367[1, 1] : vector<4xf16> from vector<2x4x4xf16> %386 = amdgpu.mfma %385 * %373 + %384 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %387 = vector.extract %367[1, 2] : vector<4xf16> from vector<2x4x4xf16> %388 = amdgpu.mfma %387 * %376 + %386 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %389 = vector.extract %367[1, 3] : vector<4xf16> from vector<2x4x4xf16> %390 = amdgpu.mfma %389 * %379 + %388 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %391 = vector.insert %390, %381 [1, 0] : vector<16xf32> into vector<2x1x16xf32> %392 = vector.insert %215, %cst_9 [0, 0, 0] : f32 into vector<2x1x16xf32> %393 = vector.insert %215, %392 [0, 0, 1] : f32 into vector<2x1x16xf32> %394 = vector.insert %215, %393 [0, 0, 2] : f32 into vector<2x1x16xf32> %395 = vector.insert %215, %394 [0, 0, 3] : f32 into vector<2x1x16xf32> %396 = vector.insert %215, %395 [0, 0, 4] : f32 into vector<2x1x16xf32> %397 = vector.insert %215, %396 [0, 0, 5] : f32 into vector<2x1x16xf32> %398 = vector.insert %215, %397 [0, 0, 6] : f32 into vector<2x1x16xf32> %399 = vector.insert %215, %398 [0, 0, 7] : f32 into vector<2x1x16xf32> %400 = vector.insert %215, %399 [0, 0, 8] : f32 into vector<2x1x16xf32> %401 = vector.insert %215, %400 [0, 0, 9] : f32 into vector<2x1x16xf32> %402 = vector.insert %215, %401 [0, 0, 10] : f32 into vector<2x1x16xf32> %403 = vector.insert %215, %402 [0, 0, 11] : f32 into vector<2x1x16xf32> %404 = vector.insert %215, %403 [0, 0, 12] : f32 into vector<2x1x16xf32> %405 = vector.insert %215, %404 [0, 0, 13] : f32 into vector<2x1x16xf32> %406 = vector.insert 
%215, %405 [0, 0, 14] : f32 into vector<2x1x16xf32> %407 = vector.insert %215, %406 [0, 0, 15] : f32 into vector<2x1x16xf32> %408 = vector.insert %215, %407 [1, 0, 0] : f32 into vector<2x1x16xf32> %409 = vector.insert %215, %408 [1, 0, 1] : f32 into vector<2x1x16xf32> %410 = vector.insert %215, %409 [1, 0, 2] : f32 into vector<2x1x16xf32> %411 = vector.insert %215, %410 [1, 0, 3] : f32 into vector<2x1x16xf32> %412 = vector.insert %215, %411 [1, 0, 4] : f32 into vector<2x1x16xf32> %413 = vector.insert %215, %412 [1, 0, 5] : f32 into vector<2x1x16xf32> %414 = vector.insert %215, %413 [1, 0, 6] : f32 into vector<2x1x16xf32> %415 = vector.insert %215, %414 [1, 0, 7] : f32 into vector<2x1x16xf32> %416 = vector.insert %215, %415 [1, 0, 8] : f32 into vector<2x1x16xf32> %417 = vector.insert %215, %416 [1, 0, 9] : f32 into vector<2x1x16xf32> %418 = vector.insert %215, %417 [1, 0, 10] : f32 into vector<2x1x16xf32> %419 = vector.insert %215, %418 [1, 0, 11] : f32 into vector<2x1x16xf32> %420 = vector.insert %215, %419 [1, 0, 12] : f32 into vector<2x1x16xf32> %421 = vector.insert %215, %420 [1, 0, 13] : f32 into vector<2x1x16xf32> %422 = vector.insert %215, %421 [1, 0, 14] : f32 into vector<2x1x16xf32> %423 = vector.insert %215, %422 [1, 0, 15] : f32 into vector<2x1x16xf32> %424 = vector.load %11[%workgroup_id_y, %263] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %425 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 8)>()[%thread_id_x] %426 = vector.load %11[%workgroup_id_y, %425] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %427 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 16)>()[%thread_id_x] %428 = vector.load %11[%workgroup_id_y, %427] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %429 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 24)>()[%thread_id_x] %430 = vector.load %11[%workgroup_id_y, %429] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %431 = affine.apply 
affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 32)>()[%thread_id_x] %432 = vector.load %11[%workgroup_id_y, %431] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %433 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 40)>()[%thread_id_x] %434 = vector.load %11[%workgroup_id_y, %433] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %435 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 48)>()[%thread_id_x] %436 = vector.load %11[%workgroup_id_y, %435] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %437 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 56)>()[%thread_id_x] %438 = vector.load %11[%workgroup_id_y, %437] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %439 = vector.extract %424[0] : f16 from vector<4xf16> %440 = vector.insert %439, %cst_13 [0, 0, 0] : f16 into vector<2x1x16xf16> %441 = vector.extract %424[1] : f16 from vector<4xf16> %442 = vector.insert %441, %440 [0, 0, 1] : f16 into vector<2x1x16xf16> %443 = vector.extract %424[2] : f16 from vector<4xf16> %444 = vector.insert %443, %442 [0, 0, 2] : f16 into vector<2x1x16xf16> %445 = vector.extract %424[3] : f16 from vector<4xf16> %446 = vector.insert %445, %444 [0, 0, 3] : f16 into vector<2x1x16xf16> %447 = vector.extract %426[0] : f16 from vector<4xf16> %448 = vector.insert %447, %446 [0, 0, 4] : f16 into vector<2x1x16xf16> %449 = vector.extract %426[1] : f16 from vector<4xf16> %450 = vector.insert %449, %448 [0, 0, 5] : f16 into vector<2x1x16xf16> %451 = vector.extract %426[2] : f16 from vector<4xf16> %452 = vector.insert %451, %450 [0, 0, 6] : f16 into vector<2x1x16xf16> %453 = vector.extract %426[3] : f16 from vector<4xf16> %454 = vector.insert %453, %452 [0, 0, 7] : f16 into vector<2x1x16xf16> %455 = vector.extract %428[0] : f16 from vector<4xf16> %456 = vector.insert %455, %454 [0, 0, 8] : f16 into vector<2x1x16xf16> %457 = vector.extract %428[1] : f16 from vector<4xf16> %458 = vector.insert %457, 
%456 [0, 0, 9] : f16 into vector<2x1x16xf16> %459 = vector.extract %428[2] : f16 from vector<4xf16> %460 = vector.insert %459, %458 [0, 0, 10] : f16 into vector<2x1x16xf16> %461 = vector.extract %428[3] : f16 from vector<4xf16> %462 = vector.insert %461, %460 [0, 0, 11] : f16 into vector<2x1x16xf16> %463 = vector.extract %430[0] : f16 from vector<4xf16> %464 = vector.insert %463, %462 [0, 0, 12] : f16 into vector<2x1x16xf16> %465 = vector.extract %430[1] : f16 from vector<4xf16> %466 = vector.insert %465, %464 [0, 0, 13] : f16 into vector<2x1x16xf16> %467 = vector.extract %430[2] : f16 from vector<4xf16> %468 = vector.insert %467, %466 [0, 0, 14] : f16 into vector<2x1x16xf16> %469 = vector.extract %430[3] : f16 from vector<4xf16> %470 = vector.insert %469, %468 [0, 0, 15] : f16 into vector<2x1x16xf16> %471 = vector.extract %432[0] : f16 from vector<4xf16> %472 = vector.insert %471, %470 [1, 0, 0] : f16 into vector<2x1x16xf16> %473 = vector.extract %432[1] : f16 from vector<4xf16> %474 = vector.insert %473, %472 [1, 0, 1] : f16 into vector<2x1x16xf16> %475 = vector.extract %432[2] : f16 from vector<4xf16> %476 = vector.insert %475, %474 [1, 0, 2] : f16 into vector<2x1x16xf16> %477 = vector.extract %432[3] : f16 from vector<4xf16> %478 = vector.insert %477, %476 [1, 0, 3] : f16 into vector<2x1x16xf16> %479 = vector.extract %434[0] : f16 from vector<4xf16> %480 = vector.insert %479, %478 [1, 0, 4] : f16 into vector<2x1x16xf16> %481 = vector.extract %434[1] : f16 from vector<4xf16> %482 = vector.insert %481, %480 [1, 0, 5] : f16 into vector<2x1x16xf16> %483 = vector.extract %434[2] : f16 from vector<4xf16> %484 = vector.insert %483, %482 [1, 0, 6] : f16 into vector<2x1x16xf16> %485 = vector.extract %434[3] : f16 from vector<4xf16> %486 = vector.insert %485, %484 [1, 0, 7] : f16 into vector<2x1x16xf16> %487 = vector.extract %436[0] : f16 from vector<4xf16> %488 = vector.insert %487, %486 [1, 0, 8] : f16 into vector<2x1x16xf16> %489 = vector.extract %436[1] : f16 from 
vector<4xf16> %490 = vector.insert %489, %488 [1, 0, 9] : f16 into vector<2x1x16xf16> %491 = vector.extract %436[2] : f16 from vector<4xf16> %492 = vector.insert %491, %490 [1, 0, 10] : f16 into vector<2x1x16xf16> %493 = vector.extract %436[3] : f16 from vector<4xf16> %494 = vector.insert %493, %492 [1, 0, 11] : f16 into vector<2x1x16xf16> %495 = vector.extract %438[0] : f16 from vector<4xf16> %496 = vector.insert %495, %494 [1, 0, 12] : f16 into vector<2x1x16xf16> %497 = vector.extract %438[1] : f16 from vector<4xf16> %498 = vector.insert %497, %496 [1, 0, 13] : f16 into vector<2x1x16xf16> %499 = vector.extract %438[2] : f16 from vector<4xf16> %500 = vector.insert %499, %498 [1, 0, 14] : f16 into vector<2x1x16xf16> %501 = vector.extract %438[3] : f16 from vector<4xf16> %502 = vector.insert %501, %500 [1, 0, 15] : f16 into vector<2x1x16xf16> %503 = memref.load %12[] : memref> %504 = arith.divf %cst_3, %423 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>, <[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>, <[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>, <[ BATCHY, LANEX], [1, 32]>>]]} : vector<2x1x16xf32> %505 = arith.mulf %504, %391 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf32> %506 = arith.truncf %505 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : 
vector<2x1x16xf32> to vector<2x1x16xf16> %507 = arith.mulf %506, %502 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %508 = arith.truncf %503 : f32 to f16 %509 = vector.broadcast %508 : f16 to vector<2x1x16xf16> %510 = arith.divf %507, %509 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %511 = math.roundeven %510 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %512 = arith.cmpf ult, %511, %cst_2 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %513 = arith.select %512, %cst_2, %511 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, 
LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xi1>, vector<2x1x16xf16> %514 = arith.cmpf ugt, %513, %cst_1 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %515 = arith.select %514, %cst_1, %513 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xi1>, vector<2x1x16xf16> %516 = arith.fptosi %515 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> to vector<2x1x16xi8> %517 = vector.extract_strided_slice %516 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %518 = vector.extract %517[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %518, %13[%workgroup_id_x, %17, %workgroup_id_y, %263] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %519 = vector.extract_strided_slice %516 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : 
vector<2x1x16xi8> to vector<1x1x4xi8> %520 = vector.extract %519[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %520, %13[%workgroup_id_x, %17, %workgroup_id_y, %425] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %521 = vector.extract_strided_slice %516 {offsets = [0, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %522 = vector.extract %521[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %522, %13[%workgroup_id_x, %17, %workgroup_id_y, %427] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %523 = vector.extract_strided_slice %516 {offsets = [0, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %524 = vector.extract %523[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %524, %13[%workgroup_id_x, %17, %workgroup_id_y, %429] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %525 = vector.extract_strided_slice %516 {offsets = [1, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %526 = vector.extract %525[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %526, %13[%workgroup_id_x, %17, %workgroup_id_y, %431] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %527 = vector.extract_strided_slice %516 {offsets = [1, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %528 = vector.extract %527[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %528, %13[%workgroup_id_x, %17, %workgroup_id_y, %433] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %529 = vector.extract_strided_slice %516 {offsets = [1, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %530 = vector.extract 
%529[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %530, %13[%workgroup_id_x, %17, %workgroup_id_y, %435] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %531 = vector.extract_strided_slice %516 {offsets = [1, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %532 = vector.extract %531[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %532, %13[%workgroup_id_x, %17, %workgroup_id_y, %437] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> return } // -----// IR Dump After LLVMGPULowerExecutableTarget (iree-llvmgpu-lower-executable-target) //----- // func.func @main$async_dispatch_146_attention_2x20x1024x64xf16_generic() attributes {translation_info = #iree_codegen.translation_info} { %c992 = arith.constant 992 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x16xf32> %c0 = arith.constant 0 : index %cst_1 = arith.constant dense<1.270000e+02> : vector<2x1x16xf16> %cst_2 = arith.constant dense<-1.280000e+02> : vector<2x1x16xf16> %cst_3 = arith.constant dense<1.000000e+00> : vector<2x1x16xf32> %cst_4 = arith.constant dense<-6.550400e+04> : vector<4x1x4xf32> %cst_5 = arith.constant dense<6.550400e+04> : vector<4x1x4xf32> %cst_6 = arith.constant dense<1.802980e-01> : vector<1x4x8xf16> %cst_7 = arith.constant dense<0.000000e+00> : vector<1xf32> %cst_8 = arith.constant dense<-3.40282347E+38> : vector<1xf32> %cst_9 = arith.constant dense<0.000000e+00> : vector<2x1x16xf32> %c32 = arith.constant 32 : index %cst_10 = arith.constant dense<0.000000e+00> : vector<1x4x8xf16> %c32_i32 = arith.constant 32 : i32 %c64_i32 = arith.constant 64 : i32 %cst_11 = arith.constant dense<0.000000e+00> : vector<4x1x4xf32> %cst_12 = arith.constant dense<0.000000e+00> : vector<2x4x4xf16> %cst_13 = arith.constant dense<0.000000e+00> : vector<2x1x16xf16> %cst_14 = arith.constant 
dense<0.000000e+00> : vector<16xf32> %thread_id_x = gpu.thread_id x %thread_id_y = gpu.thread_id y %thread_id_z = gpu.thread_id z %alloc = memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space> %subview = memref.subview %alloc[0, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<1x1x32x68xf16, #gpu.address_space> to memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> %alloc_15 = memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space> %subview_16 = memref.subview %alloc_15[0, 0, 0, 0] [1, 1, 32, 64] [1, 1, 1, 1] : memref<1x1x32x68xf16, #gpu.address_space> to memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> %0 = hal.interface.constant.load[0] : i32 %1 = hal.interface.constant.load[1] : i32 %2 = hal.interface.constant.load[2] : i32 %3 = hal.interface.constant.load[3] : i32 %4 = arith.index_castui %0 : i32 to index %5 = arith.index_castui %1 : i32 to index %6 = arith.index_castui %2 : i32 to index %7 = arith.index_castui %3 : i32 to index %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %8, 1 : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %9, 1 : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %10, 1 : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type> %11 
= hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<20x64xf16, #hal.descriptor_type> memref.assume_alignment %11, 64 : memref<20x64xf16, #hal.descriptor_type> %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref> memref.assume_alignment %12, 64 : memref> %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%7) : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type> memref.assume_alignment %13, 1 : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_id_z = hal.interface.workgroup.id[2] : index %14 = affine.apply affine_map<()[s0, s1, s2] -> (s0 * 128 + s2 * 32 + (s1 floordiv 64) * 32)>()[%workgroup_id_z, %thread_id_x, %thread_id_y] %15 = affine.apply affine_map<()[s0] -> (s0 mod 32)>()[%thread_id_x] %16 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8)>()[%thread_id_x] %17 = arith.addi %14, %15 : index %18 = vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %16] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %19 = vector.insert_strided_slice %18, %cst_10 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %20 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8 + 16)>()[%thread_id_x] %21 = vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %20] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %22 = vector.insert_strided_slice %21, %19 {offsets = [0, 1, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %23 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8 + 32)>()[%thread_id_x] %24 = 
vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %23] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %25 = vector.insert_strided_slice %24, %22 {offsets = [0, 2, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %26 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 8 + 48)>()[%thread_id_x] %27 = vector.load %8[%workgroup_id_x, %workgroup_id_y, %17, %26] : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<8xf16> %28 = vector.insert_strided_slice %27, %25 {offsets = [0, 3, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %29 = arith.mulf %28, %cst_6 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>, #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>], [#iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>]], __vector_layout_test_anchor_result_0 = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 32]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 8]>>} : vector<1x4x8xf16> %30 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z] %31 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 8) * 64)>()[%thread_id_x] %32 = vector.transfer_read %9[%workgroup_id_x, %workgroup_id_y, %30, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> %33 = vector.transfer_read %10[%workgroup_id_x, %workgroup_id_y, %30, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> vector.transfer_write %32, %subview_16[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, 
memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> vector.transfer_write %33, %subview[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> %34:3 = scf.for %arg0 = %c0 to %c992 step %c32 iter_args(%arg1 = %cst_8, %arg2 = %cst_7, %arg3 = %cst_9) -> (vector<1xf32>, vector<1xf32>, vector<2x1x16xf32>) { %533 = arith.addi %arg0, %c32 : index %534 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 32 + s1 floordiv 8)>()[%533, %thread_id_x, %thread_id_y, %thread_id_z] %535 = vector.transfer_read %9[%workgroup_id_x, %workgroup_id_y, %534, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> %536 = vector.transfer_read %10[%workgroup_id_x, %workgroup_id_y, %534, %31], %cst {in_bounds = [true, true, true, true]} : memref<2x20x1024x64xf16, strided<[1310720, 65536, 64, 1], offset: ?>, #hal.descriptor_type>, vector<1x1x1x8xf16> gpu.barrier %537 = vector.load %subview_16[%c0, %c0, %15, %16] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %538 = vector.insert_strided_slice %537, %cst_10 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %539 = vector.load %subview_16[%c0, %c0, %15, %20] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %540 = vector.insert_strided_slice %539, %538 {offsets = [0, 1, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %541 = vector.load %subview_16[%c0, %c0, %15, %23] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %542 = vector.insert_strided_slice %541, %540 {offsets = [0, 2, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %543 = vector.load %subview_16[%c0, %c0, %15, %26] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, 
#gpu.address_space>, vector<8xf16> %544 = vector.insert_strided_slice %543, %542 {offsets = [0, 3, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %545 = vector.extract_strided_slice %544 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %546 = vector.extract_strided_slice %544 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %547 = vector.extract_strided_slice %544 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %548 = vector.extract_strided_slice %544 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %549 = vector.extract_strided_slice %544 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %550 = vector.extract_strided_slice %544 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %551 = vector.extract_strided_slice %544 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %552 = vector.extract_strided_slice %544 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %553 = vector.extract_strided_slice %29 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %554 = vector.extract_strided_slice %29 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %555 = vector.extract_strided_slice %29 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %556 = vector.extract_strided_slice %29 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %557 = vector.extract_strided_slice %29 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : 
vector<1x4x8xf16> to vector<1x1x4xf16> %558 = vector.extract_strided_slice %29 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %559 = vector.extract_strided_slice %29 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %560 = vector.extract_strided_slice %29 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %561 = vector.extract %545[0, 0] : vector<4xf16> from vector<1x1x4xf16> %562 = vector.extract %553[0, 0] : vector<4xf16> from vector<1x1x4xf16> %563 = amdgpu.mfma %561 * %562 + %cst_14 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %564 = vector.extract %546[0, 0] : vector<4xf16> from vector<1x1x4xf16> %565 = vector.extract %554[0, 0] : vector<4xf16> from vector<1x1x4xf16> %566 = amdgpu.mfma %564 * %565 + %563 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %567 = vector.extract %547[0, 0] : vector<4xf16> from vector<1x1x4xf16> %568 = vector.extract %555[0, 0] : vector<4xf16> from vector<1x1x4xf16> %569 = amdgpu.mfma %567 * %568 + %566 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %570 = vector.extract %548[0, 0] : vector<4xf16> from vector<1x1x4xf16> %571 = vector.extract %556[0, 0] : vector<4xf16> from vector<1x1x4xf16> %572 = amdgpu.mfma %570 * %571 + %569 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %573 = vector.extract %549[0, 0] : vector<4xf16> from vector<1x1x4xf16> %574 = vector.extract %557[0, 0] : vector<4xf16> from vector<1x1x4xf16> %575 = amdgpu.mfma %573 * %574 + %572 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %576 = vector.extract 
%550[0, 0] : vector<4xf16> from vector<1x1x4xf16> %577 = vector.extract %558[0, 0] : vector<4xf16> from vector<1x1x4xf16> %578 = amdgpu.mfma %576 * %577 + %575 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %579 = vector.extract %551[0, 0] : vector<4xf16> from vector<1x1x4xf16> %580 = vector.extract %559[0, 0] : vector<4xf16> from vector<1x1x4xf16> %581 = amdgpu.mfma %579 * %580 + %578 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %582 = vector.extract %552[0, 0] : vector<4xf16> from vector<1x1x4xf16> %583 = vector.extract %560[0, 0] : vector<4xf16> from vector<1x1x4xf16> %584 = amdgpu.mfma %582 * %583 + %581 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %585 = vector.broadcast %584 : vector<16xf32> to vector<1x1x16xf32> %586 = vector.extract %arg1[0] : f32 from vector<1xf32> %587 = vector.extract %584[0] : f32 from vector<16xf32> %588 = vector.insert %587, %cst_7 [0] : f32 into vector<1xf32> %589 = vector.extract %584[1] : f32 from vector<16xf32> %590 = vector.insert %589, %588 [0] : f32 into vector<1xf32> %591 = arith.maximumf %588, %590 : vector<1xf32> %592 = vector.extract %584[2] : f32 from vector<16xf32> %593 = vector.insert %592, %590 [0] : f32 into vector<1xf32> %594 = arith.maximumf %591, %593 : vector<1xf32> %595 = vector.extract %584[3] : f32 from vector<16xf32> %596 = vector.insert %595, %593 [0] : f32 into vector<1xf32> %597 = arith.maximumf %594, %596 : vector<1xf32> %598 = vector.extract %584[4] : f32 from vector<16xf32> %599 = vector.insert %598, %596 [0] : f32 into vector<1xf32> %600 = arith.maximumf %597, %599 : vector<1xf32> %601 = vector.extract %584[5] : f32 from vector<16xf32> %602 = vector.insert %601, %599 [0] : f32 into vector<1xf32> %603 = arith.maximumf %600, %602 : vector<1xf32> %604 = vector.extract %584[6] : f32 from 
vector<16xf32> %605 = vector.insert %604, %602 [0] : f32 into vector<1xf32> %606 = arith.maximumf %603, %605 : vector<1xf32> %607 = vector.extract %584[7] : f32 from vector<16xf32> %608 = vector.insert %607, %605 [0] : f32 into vector<1xf32> %609 = arith.maximumf %606, %608 : vector<1xf32> %610 = vector.extract %584[8] : f32 from vector<16xf32> %611 = vector.insert %610, %608 [0] : f32 into vector<1xf32> %612 = arith.maximumf %609, %611 : vector<1xf32> %613 = vector.extract %584[9] : f32 from vector<16xf32> %614 = vector.insert %613, %611 [0] : f32 into vector<1xf32> %615 = arith.maximumf %612, %614 : vector<1xf32> %616 = vector.extract %584[10] : f32 from vector<16xf32> %617 = vector.insert %616, %614 [0] : f32 into vector<1xf32> %618 = arith.maximumf %615, %617 : vector<1xf32> %619 = vector.extract %584[11] : f32 from vector<16xf32> %620 = vector.insert %619, %617 [0] : f32 into vector<1xf32> %621 = arith.maximumf %618, %620 : vector<1xf32> %622 = vector.extract %584[12] : f32 from vector<16xf32> %623 = vector.insert %622, %620 [0] : f32 into vector<1xf32> %624 = arith.maximumf %621, %623 : vector<1xf32> %625 = vector.extract %584[13] : f32 from vector<16xf32> %626 = vector.insert %625, %623 [0] : f32 into vector<1xf32> %627 = arith.maximumf %624, %626 : vector<1xf32> %628 = vector.extract %584[14] : f32 from vector<16xf32> %629 = vector.insert %628, %626 [0] : f32 into vector<1xf32> %630 = arith.maximumf %627, %629 : vector<1xf32> %631 = vector.extract %584[15] : f32 from vector<16xf32> %632 = vector.insert %631, %629 [0] : f32 into vector<1xf32> %633 = arith.maximumf %630, %632 : vector<1xf32> %634 = vector.bitcast %633 : vector<1xf32> to vector<1xi32> %635 = vector.extract %634[0] : i32 from vector<1xi32> %shuffleResult_19, %valid_20 = gpu.shuffle xor %635, %c32_i32, %c64_i32 : i32 %636 = vector.broadcast %shuffleResult_19 : i32 to vector<1xi32> %637 = vector.bitcast %636 : vector<1xi32> to vector<1xf32> %638 = arith.maximumf %637, %633 : vector<1xf32> %639 = 
vector.extract %638[0] : f32 from vector<1xf32> %640 = arith.maximumf %639, %586 : f32 %641 = vector.insert %640, %cst_7 [0] : f32 into vector<1xf32> %642 = vector.insert %640, %cst_0 [0, 0, 0] : f32 into vector<1x1x16xf32> %643 = vector.insert %640, %642 [0, 0, 1] : f32 into vector<1x1x16xf32> %644 = vector.insert %640, %643 [0, 0, 2] : f32 into vector<1x1x16xf32> %645 = vector.insert %640, %644 [0, 0, 3] : f32 into vector<1x1x16xf32> %646 = vector.insert %640, %645 [0, 0, 4] : f32 into vector<1x1x16xf32> %647 = vector.insert %640, %646 [0, 0, 5] : f32 into vector<1x1x16xf32> %648 = vector.insert %640, %647 [0, 0, 6] : f32 into vector<1x1x16xf32> %649 = vector.insert %640, %648 [0, 0, 7] : f32 into vector<1x1x16xf32> %650 = vector.insert %640, %649 [0, 0, 8] : f32 into vector<1x1x16xf32> %651 = vector.insert %640, %650 [0, 0, 9] : f32 into vector<1x1x16xf32> %652 = vector.insert %640, %651 [0, 0, 10] : f32 into vector<1x1x16xf32> %653 = vector.insert %640, %652 [0, 0, 11] : f32 into vector<1x1x16xf32> %654 = vector.insert %640, %653 [0, 0, 12] : f32 into vector<1x1x16xf32> %655 = vector.insert %640, %654 [0, 0, 13] : f32 into vector<1x1x16xf32> %656 = vector.insert %640, %655 [0, 0, 14] : f32 into vector<1x1x16xf32> %657 = vector.insert %640, %656 [0, 0, 15] : f32 into vector<1x1x16xf32> %658 = arith.subf %585, %657 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : vector<1x1x16xf32> %659 = math.exp2 %658 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : 
vector<1x1x16xf32> %660 = arith.subf %arg1, %641 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %661 = math.exp2 %660 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %662 = arith.mulf %661, %arg2 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %663 = vector.extract %662[0] : f32 from vector<1xf32> %664 = vector.extract %659[0, 0, 0] : f32 from vector<1x1x16xf32> %665 = vector.insert %664, %cst_7 [0] : f32 into vector<1xf32> %666 = vector.extract %659[0, 0, 1] : f32 from vector<1x1x16xf32> %667 = vector.insert %666, %665 [0] : f32 into vector<1xf32> %668 = arith.addf %665, %667 : vector<1xf32> %669 = vector.extract %659[0, 0, 2] : f32 from vector<1x1x16xf32> %670 = vector.insert %669, %667 [0] : f32 into vector<1xf32> %671 = arith.addf %668, %670 : vector<1xf32> %672 = vector.extract %659[0, 0, 3] : f32 from vector<1x1x16xf32> %673 = vector.insert %672, %670 [0] : f32 into vector<1xf32> %674 = arith.addf %671, %673 : vector<1xf32> %675 = vector.extract %659[0, 0, 4] : f32 from vector<1x1x16xf32> %676 = vector.insert %675, %673 [0] : f32 into vector<1xf32> %677 = arith.addf %674, %676 : vector<1xf32> %678 = vector.extract %659[0, 0, 5] : f32 from vector<1x1x16xf32> %679 = vector.insert %678, %676 [0] : f32 into vector<1xf32> %680 = arith.addf %677, %679 : vector<1xf32> %681 = vector.extract %659[0, 0, 6] : f32 from vector<1x1x16xf32> %682 = vector.insert %681, %679 [0] : f32 into vector<1xf32> %683 = arith.addf %680, %682 : vector<1xf32> %684 = vector.extract %659[0, 0, 7] : f32 from vector<1x1x16xf32> %685 = 
vector.insert %684, %682 [0] : f32 into vector<1xf32> %686 = arith.addf %683, %685 : vector<1xf32> %687 = vector.extract %659[0, 0, 8] : f32 from vector<1x1x16xf32> %688 = vector.insert %687, %685 [0] : f32 into vector<1xf32> %689 = arith.addf %686, %688 : vector<1xf32> %690 = vector.extract %659[0, 0, 9] : f32 from vector<1x1x16xf32> %691 = vector.insert %690, %688 [0] : f32 into vector<1xf32> %692 = arith.addf %689, %691 : vector<1xf32> %693 = vector.extract %659[0, 0, 10] : f32 from vector<1x1x16xf32> %694 = vector.insert %693, %691 [0] : f32 into vector<1xf32> %695 = arith.addf %692, %694 : vector<1xf32> %696 = vector.extract %659[0, 0, 11] : f32 from vector<1x1x16xf32> %697 = vector.insert %696, %694 [0] : f32 into vector<1xf32> %698 = arith.addf %695, %697 : vector<1xf32> %699 = vector.extract %659[0, 0, 12] : f32 from vector<1x1x16xf32> %700 = vector.insert %699, %697 [0] : f32 into vector<1xf32> %701 = arith.addf %698, %700 : vector<1xf32> %702 = vector.extract %659[0, 0, 13] : f32 from vector<1x1x16xf32> %703 = vector.insert %702, %700 [0] : f32 into vector<1xf32> %704 = arith.addf %701, %703 : vector<1xf32> %705 = vector.extract %659[0, 0, 14] : f32 from vector<1x1x16xf32> %706 = vector.insert %705, %703 [0] : f32 into vector<1xf32> %707 = arith.addf %704, %706 : vector<1xf32> %708 = vector.extract %659[0, 0, 15] : f32 from vector<1x1x16xf32> %709 = vector.insert %708, %706 [0] : f32 into vector<1xf32> %710 = arith.addf %707, %709 : vector<1xf32> %711 = vector.bitcast %710 : vector<1xf32> to vector<1xi32> %712 = vector.extract %711[0] : i32 from vector<1xi32> %shuffleResult_21, %valid_22 = gpu.shuffle xor %712, %c32_i32, %c64_i32 : i32 %713 = vector.broadcast %shuffleResult_21 : i32 to vector<1xi32> %714 = vector.bitcast %713 : vector<1xi32> to vector<1xf32> %715 = arith.addf %714, %710 : vector<1xf32> %716 = vector.extract %715[0] : f32 from vector<1xf32> %717 = arith.addf %716, %663 : f32 %718 = vector.insert %717, %cst_7 [0] : f32 into vector<1xf32> 
%719 = vector.extract_strided_slice %659 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %720 = vector.insert_strided_slice %719, %cst_11 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %721 = vector.extract_strided_slice %659 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %722 = vector.insert_strided_slice %721, %720 {offsets = [1, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %723 = vector.extract_strided_slice %659 {offsets = [0, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %724 = vector.insert_strided_slice %723, %722 {offsets = [2, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %725 = vector.extract_strided_slice %659 {offsets = [0, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %726 = vector.insert_strided_slice %725, %724 {offsets = [3, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %727 = arith.cmpf ogt, %726, %cst_5 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %728 = arith.cmpf olt, %726, %cst_4 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %729 = arith.select %727, %cst_5, %726 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, 
<[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, vector<4x1x4xf32> %730 = arith.select %728, %cst_4, %729 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, vector<4x1x4xf32> %731 = arith.truncf %730 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> to vector<4x1x4xf16> %732 = vector.extract %661[0] : f32 from vector<1xf32> %733 = vector.insert %732, %cst_9 [0, 0, 0] : f32 into vector<2x1x16xf32> %734 = vector.insert %732, %733 [0, 0, 1] : f32 into vector<2x1x16xf32> %735 = vector.insert %732, %734 [0, 0, 2] : f32 into vector<2x1x16xf32> %736 = vector.insert %732, %735 [0, 0, 3] : f32 into vector<2x1x16xf32> %737 = vector.insert %732, %736 [0, 0, 4] : f32 into vector<2x1x16xf32> %738 = vector.insert %732, %737 [0, 0, 5] : f32 into vector<2x1x16xf32> %739 = vector.insert %732, %738 [0, 0, 6] : f32 into vector<2x1x16xf32> %740 = vector.insert %732, %739 [0, 0, 7] : f32 into vector<2x1x16xf32> %741 = vector.insert %732, %740 [0, 0, 8] : f32 into vector<2x1x16xf32> %742 = vector.insert %732, %741 [0, 0, 9] : f32 into vector<2x1x16xf32> %743 = vector.insert %732, %742 [0, 0, 10] : f32 into 
vector<2x1x16xf32> %744 = vector.insert %732, %743 [0, 0, 11] : f32 into vector<2x1x16xf32> %745 = vector.insert %732, %744 [0, 0, 12] : f32 into vector<2x1x16xf32> %746 = vector.insert %732, %745 [0, 0, 13] : f32 into vector<2x1x16xf32> %747 = vector.insert %732, %746 [0, 0, 14] : f32 into vector<2x1x16xf32> %748 = vector.insert %732, %747 [0, 0, 15] : f32 into vector<2x1x16xf32> %749 = vector.insert %732, %748 [1, 0, 0] : f32 into vector<2x1x16xf32> %750 = vector.insert %732, %749 [1, 0, 1] : f32 into vector<2x1x16xf32> %751 = vector.insert %732, %750 [1, 0, 2] : f32 into vector<2x1x16xf32> %752 = vector.insert %732, %751 [1, 0, 3] : f32 into vector<2x1x16xf32> %753 = vector.insert %732, %752 [1, 0, 4] : f32 into vector<2x1x16xf32> %754 = vector.insert %732, %753 [1, 0, 5] : f32 into vector<2x1x16xf32> %755 = vector.insert %732, %754 [1, 0, 6] : f32 into vector<2x1x16xf32> %756 = vector.insert %732, %755 [1, 0, 7] : f32 into vector<2x1x16xf32> %757 = vector.insert %732, %756 [1, 0, 8] : f32 into vector<2x1x16xf32> %758 = vector.insert %732, %757 [1, 0, 9] : f32 into vector<2x1x16xf32> %759 = vector.insert %732, %758 [1, 0, 10] : f32 into vector<2x1x16xf32> %760 = vector.insert %732, %759 [1, 0, 11] : f32 into vector<2x1x16xf32> %761 = vector.insert %732, %760 [1, 0, 12] : f32 into vector<2x1x16xf32> %762 = vector.insert %732, %761 [1, 0, 13] : f32 into vector<2x1x16xf32> %763 = vector.insert %732, %762 [1, 0, 14] : f32 into vector<2x1x16xf32> %764 = vector.insert %732, %763 [1, 0, 15] : f32 into vector<2x1x16xf32> %765 = arith.mulf %764, %arg3 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf32> %766 = affine.apply affine_map<()[s0] -> (((s0 
mod 64) floordiv 32) * 4)>()[%thread_id_x] %767 = vector.load %subview[%c0, %c0, %766, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %768 = vector.insert_strided_slice %767, %cst_12 {offsets = [0, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %769 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 1)>()[%thread_id_x] %770 = vector.load %subview[%c0, %c0, %769, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %771 = vector.insert_strided_slice %770, %768 {offsets = [0, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %772 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 2)>()[%thread_id_x] %773 = vector.load %subview[%c0, %c0, %772, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %774 = vector.insert_strided_slice %773, %771 {offsets = [0, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %775 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 3)>()[%thread_id_x] %776 = vector.load %subview[%c0, %c0, %775, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %777 = vector.insert_strided_slice %776, %774 {offsets = [0, 0, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %778 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 2) floordiv 8)>()[%thread_id_x] %779 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 8 + 2) floordiv 8)>()[%thread_id_x] %780 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 32 + 8)>()[%thread_id_x] %781 = vector.load %subview[%778, %779, %780, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %782 = vector.insert_strided_slice %781, %777 {offsets = [0, 1, 0], strides = [1]} : 
vector<1xf16> into vector<2x4x4xf16> %783 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32)>()[%thread_id_x] %784 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9) floordiv 32)>()[%thread_id_x] %785 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9)>()[%thread_id_x] %786 = vector.load %subview[%783, %784, %785, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %787 = vector.insert_strided_slice %786, %782 {offsets = [0, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %788 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16)>()[%thread_id_x] %789 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 16 + 5) floordiv 16)>()[%thread_id_x] %790 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 32 + 10)>()[%thread_id_x] %791 = vector.load %subview[%788, %789, %790, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %792 = vector.insert_strided_slice %791, %787 {offsets = [0, 1, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %793 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32)>()[%thread_id_x] %794 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11) floordiv 32)>()[%thread_id_x] %795 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11)>()[%thread_id_x] %796 = vector.load %subview[%793, %794, %795, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %797 = 
vector.insert_strided_slice %796, %792 {offsets = [0, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %798 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 4) floordiv 8)>()[%thread_id_x] %799 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 8 + 4) floordiv 8)>()[%thread_id_x] %800 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 32 + 16)>()[%thread_id_x] %801 = vector.load %subview[%798, %799, %800, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %802 = vector.insert_strided_slice %801, %797 {offsets = [0, 2, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %803 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32)>()[%thread_id_x] %804 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17) floordiv 32)>()[%thread_id_x] %805 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17)>()[%thread_id_x] %806 = vector.load %subview[%803, %804, %805, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %807 = vector.insert_strided_slice %806, %802 {offsets = [0, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %808 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16)>()[%thread_id_x] %809 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 16 + 9) floordiv 16)>()[%thread_id_x] %810 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 32 + 18)>()[%thread_id_x] %811 = vector.load %subview[%808, %809, %810, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, 
#gpu.address_space>, vector<1xf16> %812 = vector.insert_strided_slice %811, %807 {offsets = [0, 2, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %813 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32)>()[%thread_id_x] %814 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32) * 32 + 19) floordiv 32)>()[%thread_id_x] %815 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32) * 32 + 19)>()[%thread_id_x] %816 = vector.load %subview[%813, %814, %815, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %817 = vector.insert_strided_slice %816, %812 {offsets = [0, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %818 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 6) floordiv 8)>()[%thread_id_x] %819 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 8 + 6) floordiv 8)>()[%thread_id_x] %820 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 32 + 24)>()[%thread_id_x] %821 = vector.load %subview[%818, %819, %820, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %822 = vector.insert_strided_slice %821, %817 {offsets = [0, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %823 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32)>()[%thread_id_x] %824 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25) floordiv 32)>()[%thread_id_x] %825 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25)>()[%thread_id_x] %826 = vector.load %subview[%823, %824, %825, %15] : 
memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %827 = vector.insert_strided_slice %826, %822 {offsets = [0, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %828 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16)>()[%thread_id_x] %829 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 16 + 13) floordiv 16)>()[%thread_id_x] %830 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 32 + 26)>()[%thread_id_x] %831 = vector.load %subview[%828, %829, %830, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %832 = vector.insert_strided_slice %831, %827 {offsets = [0, 3, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %833 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32)>()[%thread_id_x] %834 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27) floordiv 32)>()[%thread_id_x] %835 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27)>()[%thread_id_x] %836 = vector.load %subview[%833, %834, %835, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %837 = vector.insert_strided_slice %836, %832 {offsets = [0, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %838 = affine.apply affine_map<()[s0] -> (s0 mod 32 + 32)>()[%thread_id_x] %839 = vector.load %subview[%c0, %c0, %766, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %840 = vector.insert_strided_slice %839, %837 {offsets = [1, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %841 = vector.load %subview[%c0, %c0, %769, %838] : 
memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %842 = vector.insert_strided_slice %841, %840 {offsets = [1, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %843 = vector.load %subview[%c0, %c0, %772, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %844 = vector.insert_strided_slice %843, %842 {offsets = [1, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %845 = vector.load %subview[%c0, %c0, %775, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %846 = vector.insert_strided_slice %845, %844 {offsets = [1, 0, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %847 = vector.load %subview[%778, %779, %780, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %848 = vector.insert_strided_slice %847, %846 {offsets = [1, 1, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %849 = vector.load %subview[%783, %784, %785, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %850 = vector.insert_strided_slice %849, %848 {offsets = [1, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %851 = vector.load %subview[%788, %789, %790, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %852 = vector.insert_strided_slice %851, %850 {offsets = [1, 1, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %853 = vector.load %subview[%793, %794, %795, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %854 = vector.insert_strided_slice %853, %852 {offsets = [1, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %855 = vector.load %subview[%798, %799, %800, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %856 = vector.insert_strided_slice %855, %854 {offsets = [1, 2, 0], strides 
= [1]} : vector<1xf16> into vector<2x4x4xf16> %857 = vector.load %subview[%803, %804, %805, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %858 = vector.insert_strided_slice %857, %856 {offsets = [1, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %859 = vector.load %subview[%808, %809, %810, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %860 = vector.insert_strided_slice %859, %858 {offsets = [1, 2, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %861 = vector.load %subview[%813, %814, %815, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %862 = vector.insert_strided_slice %861, %860 {offsets = [1, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %863 = vector.load %subview[%818, %819, %820, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %864 = vector.insert_strided_slice %863, %862 {offsets = [1, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %865 = vector.load %subview[%823, %824, %825, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %866 = vector.insert_strided_slice %865, %864 {offsets = [1, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %867 = vector.load %subview[%828, %829, %830, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %868 = vector.insert_strided_slice %867, %866 {offsets = [1, 3, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %869 = vector.load %subview[%833, %834, %835, %838] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %870 = vector.insert_strided_slice %869, %868 {offsets = [1, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %871 = vector.extract %765[0, 0] : vector<16xf32> from vector<2x1x16xf32> %872 = vector.extract %870[0, 0] : 
vector<4xf16> from vector<2x4x4xf16> %873 = vector.extract %731[0, 0] : vector<4xf16> from vector<4x1x4xf16> %874 = amdgpu.mfma %872 * %873 + %871 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %875 = vector.extract %870[0, 1] : vector<4xf16> from vector<2x4x4xf16> %876 = vector.extract %731[1, 0] : vector<4xf16> from vector<4x1x4xf16> %877 = amdgpu.mfma %875 * %876 + %874 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %878 = vector.extract %870[0, 2] : vector<4xf16> from vector<2x4x4xf16> %879 = vector.extract %731[2, 0] : vector<4xf16> from vector<4x1x4xf16> %880 = amdgpu.mfma %878 * %879 + %877 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %881 = vector.extract %870[0, 3] : vector<4xf16> from vector<2x4x4xf16> %882 = vector.extract %731[3, 0] : vector<4xf16> from vector<4x1x4xf16> %883 = amdgpu.mfma %881 * %882 + %880 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %884 = vector.insert %883, %cst_9 [0, 0] : vector<16xf32> into vector<2x1x16xf32> %885 = vector.extract %765[1, 0] : vector<16xf32> from vector<2x1x16xf32> %886 = vector.extract %870[1, 0] : vector<4xf16> from vector<2x4x4xf16> %887 = amdgpu.mfma %886 * %873 + %885 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %888 = vector.extract %870[1, 1] : vector<4xf16> from vector<2x4x4xf16> %889 = amdgpu.mfma %888 * %876 + %887 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %890 = vector.extract %870[1, 2] : vector<4xf16> from vector<2x4x4xf16> %891 = amdgpu.mfma %890 * %879 + %889 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, 
vector<16xf32> %892 = vector.extract %870[1, 3] : vector<4xf16> from vector<2x4x4xf16> %893 = amdgpu.mfma %892 * %882 + %891 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %894 = vector.insert %893, %884 [1, 0] : vector<16xf32> into vector<2x1x16xf32> gpu.barrier vector.transfer_write %535, %subview_16[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> vector.transfer_write %536, %subview[%c0, %c0, %30, %31] {in_bounds = [true, true, true, true]} : vector<1x1x1x8xf16>, memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space> scf.yield %641, %718, %894 : vector<1xf32>, vector<1xf32>, vector<2x1x16xf32> } gpu.barrier %35 = vector.load %subview_16[%c0, %c0, %15, %16] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %36 = vector.insert_strided_slice %35, %cst_10 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %37 = vector.load %subview_16[%c0, %c0, %15, %20] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %38 = vector.insert_strided_slice %37, %36 {offsets = [0, 1, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %39 = vector.load %subview_16[%c0, %c0, %15, %23] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %40 = vector.insert_strided_slice %39, %38 {offsets = [0, 2, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %41 = vector.load %subview_16[%c0, %c0, %15, %26] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<8xf16> %42 = vector.insert_strided_slice %41, %40 {offsets = [0, 3, 0], strides = [1]} : vector<8xf16> into vector<1x4x8xf16> %43 = vector.extract_strided_slice %42 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %44 = 
vector.extract_strided_slice %42 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %45 = vector.extract_strided_slice %42 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %46 = vector.extract_strided_slice %42 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %47 = vector.extract_strided_slice %42 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %48 = vector.extract_strided_slice %42 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %49 = vector.extract_strided_slice %42 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %50 = vector.extract_strided_slice %42 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %51 = vector.extract_strided_slice %29 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %52 = vector.extract_strided_slice %29 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %53 = vector.extract_strided_slice %29 {offsets = [0, 1, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %54 = vector.extract_strided_slice %29 {offsets = [0, 1, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %55 = vector.extract_strided_slice %29 {offsets = [0, 2, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %56 = vector.extract_strided_slice %29 {offsets = [0, 2, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %57 = vector.extract_strided_slice %29 {offsets = [0, 3, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %58 = 
vector.extract_strided_slice %29 {offsets = [0, 3, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x4x8xf16> to vector<1x1x4xf16> %59 = vector.extract %43[0, 0] : vector<4xf16> from vector<1x1x4xf16> %60 = vector.extract %51[0, 0] : vector<4xf16> from vector<1x1x4xf16> %61 = amdgpu.mfma %59 * %60 + %cst_14 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %62 = vector.extract %44[0, 0] : vector<4xf16> from vector<1x1x4xf16> %63 = vector.extract %52[0, 0] : vector<4xf16> from vector<1x1x4xf16> %64 = amdgpu.mfma %62 * %63 + %61 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %65 = vector.extract %45[0, 0] : vector<4xf16> from vector<1x1x4xf16> %66 = vector.extract %53[0, 0] : vector<4xf16> from vector<1x1x4xf16> %67 = amdgpu.mfma %65 * %66 + %64 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %68 = vector.extract %46[0, 0] : vector<4xf16> from vector<1x1x4xf16> %69 = vector.extract %54[0, 0] : vector<4xf16> from vector<1x1x4xf16> %70 = amdgpu.mfma %68 * %69 + %67 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %71 = vector.extract %47[0, 0] : vector<4xf16> from vector<1x1x4xf16> %72 = vector.extract %55[0, 0] : vector<4xf16> from vector<1x1x4xf16> %73 = amdgpu.mfma %71 * %72 + %70 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %74 = vector.extract %48[0, 0] : vector<4xf16> from vector<1x1x4xf16> %75 = vector.extract %56[0, 0] : vector<4xf16> from vector<1x1x4xf16> %76 = amdgpu.mfma %74 * %75 + %73 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %77 = vector.extract %49[0, 0] : vector<4xf16> from vector<1x1x4xf16> %78 = vector.extract %57[0, 0] 
: vector<4xf16> from vector<1x1x4xf16> %79 = amdgpu.mfma %77 * %78 + %76 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %80 = vector.extract %50[0, 0] : vector<4xf16> from vector<1x1x4xf16> %81 = vector.extract %58[0, 0] : vector<4xf16> from vector<1x1x4xf16> %82 = amdgpu.mfma %80 * %81 + %79 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %83 = vector.broadcast %82 : vector<16xf32> to vector<1x1x16xf32> %84 = vector.extract %34#0[0] : f32 from vector<1xf32> %85 = vector.extract %82[0] : f32 from vector<16xf32> %86 = vector.insert %85, %cst_7 [0] : f32 into vector<1xf32> %87 = vector.extract %82[1] : f32 from vector<16xf32> %88 = vector.insert %87, %86 [0] : f32 into vector<1xf32> %89 = arith.maximumf %86, %88 : vector<1xf32> %90 = vector.extract %82[2] : f32 from vector<16xf32> %91 = vector.insert %90, %88 [0] : f32 into vector<1xf32> %92 = arith.maximumf %89, %91 : vector<1xf32> %93 = vector.extract %82[3] : f32 from vector<16xf32> %94 = vector.insert %93, %91 [0] : f32 into vector<1xf32> %95 = arith.maximumf %92, %94 : vector<1xf32> %96 = vector.extract %82[4] : f32 from vector<16xf32> %97 = vector.insert %96, %94 [0] : f32 into vector<1xf32> %98 = arith.maximumf %95, %97 : vector<1xf32> %99 = vector.extract %82[5] : f32 from vector<16xf32> %100 = vector.insert %99, %97 [0] : f32 into vector<1xf32> %101 = arith.maximumf %98, %100 : vector<1xf32> %102 = vector.extract %82[6] : f32 from vector<16xf32> %103 = vector.insert %102, %100 [0] : f32 into vector<1xf32> %104 = arith.maximumf %101, %103 : vector<1xf32> %105 = vector.extract %82[7] : f32 from vector<16xf32> %106 = vector.insert %105, %103 [0] : f32 into vector<1xf32> %107 = arith.maximumf %104, %106 : vector<1xf32> %108 = vector.extract %82[8] : f32 from vector<16xf32> %109 = vector.insert %108, %106 [0] : f32 into vector<1xf32> %110 = arith.maximumf %107, %109 : 
vector<1xf32> %111 = vector.extract %82[9] : f32 from vector<16xf32> %112 = vector.insert %111, %109 [0] : f32 into vector<1xf32> %113 = arith.maximumf %110, %112 : vector<1xf32> %114 = vector.extract %82[10] : f32 from vector<16xf32> %115 = vector.insert %114, %112 [0] : f32 into vector<1xf32> %116 = arith.maximumf %113, %115 : vector<1xf32> %117 = vector.extract %82[11] : f32 from vector<16xf32> %118 = vector.insert %117, %115 [0] : f32 into vector<1xf32> %119 = arith.maximumf %116, %118 : vector<1xf32> %120 = vector.extract %82[12] : f32 from vector<16xf32> %121 = vector.insert %120, %118 [0] : f32 into vector<1xf32> %122 = arith.maximumf %119, %121 : vector<1xf32> %123 = vector.extract %82[13] : f32 from vector<16xf32> %124 = vector.insert %123, %121 [0] : f32 into vector<1xf32> %125 = arith.maximumf %122, %124 : vector<1xf32> %126 = vector.extract %82[14] : f32 from vector<16xf32> %127 = vector.insert %126, %124 [0] : f32 into vector<1xf32> %128 = arith.maximumf %125, %127 : vector<1xf32> %129 = vector.extract %82[15] : f32 from vector<16xf32> %130 = vector.insert %129, %127 [0] : f32 into vector<1xf32> %131 = arith.maximumf %128, %130 : vector<1xf32> %132 = vector.bitcast %131 : vector<1xf32> to vector<1xi32> %133 = vector.extract %132[0] : i32 from vector<1xi32> %shuffleResult, %valid = gpu.shuffle xor %133, %c32_i32, %c64_i32 : i32 %134 = vector.broadcast %shuffleResult : i32 to vector<1xi32> %135 = vector.bitcast %134 : vector<1xi32> to vector<1xf32> %136 = arith.maximumf %135, %131 : vector<1xf32> %137 = vector.extract %136[0] : f32 from vector<1xf32> %138 = arith.maximumf %137, %84 : f32 %139 = vector.insert %138, %cst_7 [0] : f32 into vector<1xf32> %140 = vector.insert %138, %cst_0 [0, 0, 0] : f32 into vector<1x1x16xf32> %141 = vector.insert %138, %140 [0, 0, 1] : f32 into vector<1x1x16xf32> %142 = vector.insert %138, %141 [0, 0, 2] : f32 into vector<1x1x16xf32> %143 = vector.insert %138, %142 [0, 0, 3] : f32 into vector<1x1x16xf32> %144 = vector.insert 
%138, %143 [0, 0, 4] : f32 into vector<1x1x16xf32> %145 = vector.insert %138, %144 [0, 0, 5] : f32 into vector<1x1x16xf32> %146 = vector.insert %138, %145 [0, 0, 6] : f32 into vector<1x1x16xf32> %147 = vector.insert %138, %146 [0, 0, 7] : f32 into vector<1x1x16xf32> %148 = vector.insert %138, %147 [0, 0, 8] : f32 into vector<1x1x16xf32> %149 = vector.insert %138, %148 [0, 0, 9] : f32 into vector<1x1x16xf32> %150 = vector.insert %138, %149 [0, 0, 10] : f32 into vector<1x1x16xf32> %151 = vector.insert %138, %150 [0, 0, 11] : f32 into vector<1x1x16xf32> %152 = vector.insert %138, %151 [0, 0, 12] : f32 into vector<1x1x16xf32> %153 = vector.insert %138, %152 [0, 0, 13] : f32 into vector<1x1x16xf32> %154 = vector.insert %138, %153 [0, 0, 14] : f32 into vector<1x1x16xf32> %155 = vector.insert %138, %154 [0, 0, 15] : f32 into vector<1x1x16xf32> %156 = arith.subf %83, %155 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : vector<1x1x16xf32> %157 = math.exp2 %156 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]>>]]} : vector<1x1x16xf32> %158 = arith.subf %34#0, %139 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %159 = math.exp2 %158 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %160 = 
arith.mulf %159, %34#1 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>>]]} : vector<1xf32> %161 = vector.extract %160[0] : f32 from vector<1xf32> %162 = vector.extract %157[0, 0, 0] : f32 from vector<1x1x16xf32> %163 = vector.insert %162, %cst_7 [0] : f32 into vector<1xf32> %164 = vector.extract %157[0, 0, 1] : f32 from vector<1x1x16xf32> %165 = vector.insert %164, %163 [0] : f32 into vector<1xf32> %166 = arith.addf %163, %165 : vector<1xf32> %167 = vector.extract %157[0, 0, 2] : f32 from vector<1x1x16xf32> %168 = vector.insert %167, %165 [0] : f32 into vector<1xf32> %169 = arith.addf %166, %168 : vector<1xf32> %170 = vector.extract %157[0, 0, 3] : f32 from vector<1x1x16xf32> %171 = vector.insert %170, %168 [0] : f32 into vector<1xf32> %172 = arith.addf %169, %171 : vector<1xf32> %173 = vector.extract %157[0, 0, 4] : f32 from vector<1x1x16xf32> %174 = vector.insert %173, %171 [0] : f32 into vector<1xf32> %175 = arith.addf %172, %174 : vector<1xf32> %176 = vector.extract %157[0, 0, 5] : f32 from vector<1x1x16xf32> %177 = vector.insert %176, %174 [0] : f32 into vector<1xf32> %178 = arith.addf %175, %177 : vector<1xf32> %179 = vector.extract %157[0, 0, 6] : f32 from vector<1x1x16xf32> %180 = vector.insert %179, %177 [0] : f32 into vector<1xf32> %181 = arith.addf %178, %180 : vector<1xf32> %182 = vector.extract %157[0, 0, 7] : f32 from vector<1x1x16xf32> %183 = vector.insert %182, %180 [0] : f32 into vector<1xf32> %184 = arith.addf %181, %183 : vector<1xf32> %185 = vector.extract %157[0, 0, 8] : f32 from vector<1x1x16xf32> %186 = vector.insert %185, %183 [0] : f32 into vector<1xf32> %187 = arith.addf %184, %186 : vector<1xf32> %188 = vector.extract %157[0, 0, 9] : f32 from vector<1x1x16xf32> %189 = vector.insert %188, %186 [0] : f32 into vector<1xf32> %190 = arith.addf %187, %189 : vector<1xf32> %191 = vector.extract 
%157[0, 0, 10] : f32 from vector<1x1x16xf32> %192 = vector.insert %191, %189 [0] : f32 into vector<1xf32> %193 = arith.addf %190, %192 : vector<1xf32> %194 = vector.extract %157[0, 0, 11] : f32 from vector<1x1x16xf32> %195 = vector.insert %194, %192 [0] : f32 into vector<1xf32> %196 = arith.addf %193, %195 : vector<1xf32> %197 = vector.extract %157[0, 0, 12] : f32 from vector<1x1x16xf32> %198 = vector.insert %197, %195 [0] : f32 into vector<1xf32> %199 = arith.addf %196, %198 : vector<1xf32> %200 = vector.extract %157[0, 0, 13] : f32 from vector<1x1x16xf32> %201 = vector.insert %200, %198 [0] : f32 into vector<1xf32> %202 = arith.addf %199, %201 : vector<1xf32> %203 = vector.extract %157[0, 0, 14] : f32 from vector<1x1x16xf32> %204 = vector.insert %203, %201 [0] : f32 into vector<1xf32> %205 = arith.addf %202, %204 : vector<1xf32> %206 = vector.extract %157[0, 0, 15] : f32 from vector<1x1x16xf32> %207 = vector.insert %206, %204 [0] : f32 into vector<1xf32> %208 = arith.addf %205, %207 : vector<1xf32> %209 = vector.bitcast %208 : vector<1xf32> to vector<1xi32> %210 = vector.extract %209[0] : i32 from vector<1xi32> %shuffleResult_17, %valid_18 = gpu.shuffle xor %210, %c32_i32, %c64_i32 : i32 %211 = vector.broadcast %shuffleResult_17 : i32 to vector<1xi32> %212 = vector.bitcast %211 : vector<1xi32> to vector<1xf32> %213 = arith.addf %212, %208 : vector<1xf32> %214 = vector.extract %213[0] : f32 from vector<1xf32> %215 = arith.addf %214, %161 : f32 %216 = vector.extract_strided_slice %157 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %217 = vector.insert_strided_slice %216, %cst_11 {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %218 = vector.extract_strided_slice %157 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %219 = vector.insert_strided_slice %218, %217 {offsets = [1, 0, 0], strides = [1, 1, 1]} : 
vector<1x1x4xf32> into vector<4x1x4xf32> %220 = vector.extract_strided_slice %157 {offsets = [0, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %221 = vector.insert_strided_slice %220, %219 {offsets = [2, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %222 = vector.extract_strided_slice %157 {offsets = [0, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x16xf32> to vector<1x1x4xf32> %223 = vector.insert_strided_slice %222, %221 {offsets = [3, 0, 0], strides = [1, 1, 1]} : vector<1x1x4xf32> into vector<4x1x4xf32> %224 = arith.cmpf ogt, %223, %cst_5 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %225 = arith.cmpf olt, %223, %cst_4 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> %226 = arith.select %224, %cst_5, %223 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, vector<4x1x4xf32> %227 = arith.select %225, %cst_4, %226 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, 
VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xi1>, vector<4x1x4xf32> %228 = arith.truncf %227 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, LANEY, VECTORX], [4, 2, 4]>>]]} : vector<4x1x4xf32> to vector<4x1x4xf16> %229 = vector.extract %159[0] : f32 from vector<1xf32> %230 = vector.insert %229, %cst_9 [0, 0, 0] : f32 into vector<2x1x16xf32> %231 = vector.insert %229, %230 [0, 0, 1] : f32 into vector<2x1x16xf32> %232 = vector.insert %229, %231 [0, 0, 2] : f32 into vector<2x1x16xf32> %233 = vector.insert %229, %232 [0, 0, 3] : f32 into vector<2x1x16xf32> %234 = vector.insert %229, %233 [0, 0, 4] : f32 into vector<2x1x16xf32> %235 = vector.insert %229, %234 [0, 0, 5] : f32 into vector<2x1x16xf32> %236 = vector.insert %229, %235 [0, 0, 6] : f32 into vector<2x1x16xf32> %237 = vector.insert %229, %236 [0, 0, 7] : f32 into vector<2x1x16xf32> %238 = vector.insert %229, %237 [0, 0, 8] : f32 into vector<2x1x16xf32> %239 = vector.insert %229, %238 [0, 0, 9] : f32 into vector<2x1x16xf32> %240 = vector.insert %229, %239 [0, 0, 10] : f32 into vector<2x1x16xf32> %241 = vector.insert %229, %240 [0, 0, 11] : f32 into vector<2x1x16xf32> %242 = vector.insert %229, %241 [0, 0, 12] : f32 into vector<2x1x16xf32> %243 = vector.insert %229, %242 [0, 0, 13] : f32 into vector<2x1x16xf32> %244 = vector.insert %229, %243 [0, 0, 14] : f32 into vector<2x1x16xf32> %245 = vector.insert %229, %244 [0, 0, 15] : f32 into vector<2x1x16xf32> %246 = vector.insert %229, %245 [1, 0, 0] : f32 into vector<2x1x16xf32> %247 = vector.insert %229, %246 [1, 0, 1] : f32 into 
vector<2x1x16xf32> %248 = vector.insert %229, %247 [1, 0, 2] : f32 into vector<2x1x16xf32> %249 = vector.insert %229, %248 [1, 0, 3] : f32 into vector<2x1x16xf32> %250 = vector.insert %229, %249 [1, 0, 4] : f32 into vector<2x1x16xf32> %251 = vector.insert %229, %250 [1, 0, 5] : f32 into vector<2x1x16xf32> %252 = vector.insert %229, %251 [1, 0, 6] : f32 into vector<2x1x16xf32> %253 = vector.insert %229, %252 [1, 0, 7] : f32 into vector<2x1x16xf32> %254 = vector.insert %229, %253 [1, 0, 8] : f32 into vector<2x1x16xf32> %255 = vector.insert %229, %254 [1, 0, 9] : f32 into vector<2x1x16xf32> %256 = vector.insert %229, %255 [1, 0, 10] : f32 into vector<2x1x16xf32> %257 = vector.insert %229, %256 [1, 0, 11] : f32 into vector<2x1x16xf32> %258 = vector.insert %229, %257 [1, 0, 12] : f32 into vector<2x1x16xf32> %259 = vector.insert %229, %258 [1, 0, 13] : f32 into vector<2x1x16xf32> %260 = vector.insert %229, %259 [1, 0, 14] : f32 into vector<2x1x16xf32> %261 = vector.insert %229, %260 [1, 0, 15] : f32 into vector<2x1x16xf32> %262 = arith.mulf %261, %34#2 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf32> %263 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4)>()[%thread_id_x] %264 = vector.load %subview[%c0, %c0, %263, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %265 = vector.insert_strided_slice %264, %cst_12 {offsets = [0, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %266 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 1)>()[%thread_id_x] %267 = vector.load %subview[%c0, %c0, %266, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, 
#gpu.address_space>, vector<1xf16> %268 = vector.insert_strided_slice %267, %265 {offsets = [0, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %269 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 2)>()[%thread_id_x] %270 = vector.load %subview[%c0, %c0, %269, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %271 = vector.insert_strided_slice %270, %268 {offsets = [0, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %272 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 3)>()[%thread_id_x] %273 = vector.load %subview[%c0, %c0, %272, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %274 = vector.insert_strided_slice %273, %271 {offsets = [0, 0, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %275 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 2) floordiv 8)>()[%thread_id_x] %276 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 8 + 2) floordiv 8)>()[%thread_id_x] %277 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 2) floordiv 8) * 32 + 8)>()[%thread_id_x] %278 = vector.load %subview[%275, %276, %277, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %279 = vector.insert_strided_slice %278, %274 {offsets = [0, 1, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %280 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32)>()[%thread_id_x] %281 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9) floordiv 32)>()[%thread_id_x] %282 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 9) floordiv 32) * 32 + 9)>()[%thread_id_x] %283 = vector.load %subview[%280, %281, %282, %15] : 
memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %284 = vector.insert_strided_slice %283, %279 {offsets = [0, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %285 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16)>()[%thread_id_x] %286 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 16 + 5) floordiv 16)>()[%thread_id_x] %287 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 5) floordiv 16) * 32 + 10)>()[%thread_id_x] %288 = vector.load %subview[%285, %286, %287, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %289 = vector.insert_strided_slice %288, %284 {offsets = [0, 1, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %290 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32)>()[%thread_id_x] %291 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11) floordiv 32)>()[%thread_id_x] %292 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 11) floordiv 32) * 32 + 11)>()[%thread_id_x] %293 = vector.load %subview[%290, %291, %292, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %294 = vector.insert_strided_slice %293, %289 {offsets = [0, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %295 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 4) floordiv 8)>()[%thread_id_x] %296 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 8 + 4) floordiv 8)>()[%thread_id_x] %297 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 4) floordiv 8) * 32 + 16)>()[%thread_id_x] %298 = 
vector.load %subview[%295, %296, %297, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %299 = vector.insert_strided_slice %298, %294 {offsets = [0, 2, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %300 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32)>()[%thread_id_x] %301 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17) floordiv 32)>()[%thread_id_x] %302 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 17) floordiv 32) * 32 + 17)>()[%thread_id_x] %303 = vector.load %subview[%300, %301, %302, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %304 = vector.insert_strided_slice %303, %299 {offsets = [0, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %305 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16)>()[%thread_id_x] %306 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 16 + 9) floordiv 16)>()[%thread_id_x] %307 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 9) floordiv 16) * 32 + 18)>()[%thread_id_x] %308 = vector.load %subview[%305, %306, %307, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %309 = vector.insert_strided_slice %308, %304 {offsets = [0, 2, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %310 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32)>()[%thread_id_x] %311 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 19) floordiv 32) * 32 + 19) floordiv 32)>()[%thread_id_x] %312 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) 
floordiv 32) * 4 + 19) floordiv 32) * 32 + 19)>()[%thread_id_x] %313 = vector.load %subview[%310, %311, %312, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %314 = vector.insert_strided_slice %313, %309 {offsets = [0, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %315 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 + 6) floordiv 8)>()[%thread_id_x] %316 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 8 + 6) floordiv 8)>()[%thread_id_x] %317 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - (((s0 mod 64) floordiv 32 + 6) floordiv 8) * 32 + 24)>()[%thread_id_x] %318 = vector.load %subview[%315, %316, %317, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %319 = vector.insert_strided_slice %318, %314 {offsets = [0, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %320 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32)>()[%thread_id_x] %321 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25) floordiv 32)>()[%thread_id_x] %322 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 25) floordiv 32) * 32 + 25)>()[%thread_id_x] %323 = vector.load %subview[%320, %321, %322, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %324 = vector.insert_strided_slice %323, %319 {offsets = [0, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %325 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16)>()[%thread_id_x] %326 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 2 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 16 + 13) floordiv 16)>()[%thread_id_x] %327 = affine.apply affine_map<()[s0] -> (((s0 
mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 2 + 13) floordiv 16) * 32 + 26)>()[%thread_id_x] %328 = vector.load %subview[%325, %326, %327, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %329 = vector.insert_strided_slice %328, %324 {offsets = [0, 3, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %330 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32)>()[%thread_id_x] %331 = affine.apply affine_map<()[s0] -> ((((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27) floordiv 32)>()[%thread_id_x] %332 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 - ((((s0 mod 64) floordiv 32) * 4 + 27) floordiv 32) * 32 + 27)>()[%thread_id_x] %333 = vector.load %subview[%330, %331, %332, %15] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %334 = vector.insert_strided_slice %333, %329 {offsets = [0, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %335 = affine.apply affine_map<()[s0] -> (s0 mod 32 + 32)>()[%thread_id_x] %336 = vector.load %subview[%c0, %c0, %263, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %337 = vector.insert_strided_slice %336, %334 {offsets = [1, 0, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %338 = vector.load %subview[%c0, %c0, %266, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %339 = vector.insert_strided_slice %338, %337 {offsets = [1, 0, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %340 = vector.load %subview[%c0, %c0, %269, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %341 = vector.insert_strided_slice %340, %339 {offsets = [1, 0, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %342 = vector.load %subview[%c0, %c0, %272, %335] : memref<1x1x32x64xf16, 
strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %343 = vector.insert_strided_slice %342, %341 {offsets = [1, 0, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %344 = vector.load %subview[%275, %276, %277, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %345 = vector.insert_strided_slice %344, %343 {offsets = [1, 1, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %346 = vector.load %subview[%280, %281, %282, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %347 = vector.insert_strided_slice %346, %345 {offsets = [1, 1, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %348 = vector.load %subview[%285, %286, %287, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %349 = vector.insert_strided_slice %348, %347 {offsets = [1, 1, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %350 = vector.load %subview[%290, %291, %292, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %351 = vector.insert_strided_slice %350, %349 {offsets = [1, 1, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %352 = vector.load %subview[%295, %296, %297, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %353 = vector.insert_strided_slice %352, %351 {offsets = [1, 2, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %354 = vector.load %subview[%300, %301, %302, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %355 = vector.insert_strided_slice %354, %353 {offsets = [1, 2, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %356 = vector.load %subview[%305, %306, %307, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %357 = vector.insert_strided_slice %356, %355 {offsets = [1, 2, 2], strides = [1]} : 
vector<1xf16> into vector<2x4x4xf16> %358 = vector.load %subview[%310, %311, %312, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %359 = vector.insert_strided_slice %358, %357 {offsets = [1, 2, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %360 = vector.load %subview[%315, %316, %317, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %361 = vector.insert_strided_slice %360, %359 {offsets = [1, 3, 0], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %362 = vector.load %subview[%320, %321, %322, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %363 = vector.insert_strided_slice %362, %361 {offsets = [1, 3, 1], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %364 = vector.load %subview[%325, %326, %327, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %365 = vector.insert_strided_slice %364, %363 {offsets = [1, 3, 2], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %366 = vector.load %subview[%330, %331, %332, %335] : memref<1x1x32x64xf16, strided<[2176, 2176, 68, 1]>, #gpu.address_space>, vector<1xf16> %367 = vector.insert_strided_slice %366, %365 {offsets = [1, 3, 3], strides = [1]} : vector<1xf16> into vector<2x4x4xf16> %368 = vector.extract %262[0, 0] : vector<16xf32> from vector<2x1x16xf32> %369 = vector.extract %367[0, 0] : vector<4xf16> from vector<2x4x4xf16> %370 = vector.extract %228[0, 0] : vector<4xf16> from vector<4x1x4xf16> %371 = amdgpu.mfma %369 * %370 + %368 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %372 = vector.extract %367[0, 1] : vector<4xf16> from vector<2x4x4xf16> %373 = vector.extract %228[1, 0] : vector<4xf16> from vector<4x1x4xf16> %374 = amdgpu.mfma %372 * %373 + %371 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, 
vector<4xf16>, vector<16xf32> %375 = vector.extract %367[0, 2] : vector<4xf16> from vector<2x4x4xf16> %376 = vector.extract %228[2, 0] : vector<4xf16> from vector<4x1x4xf16> %377 = amdgpu.mfma %375 * %376 + %374 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %378 = vector.extract %367[0, 3] : vector<4xf16> from vector<2x4x4xf16> %379 = vector.extract %228[3, 0] : vector<4xf16> from vector<4x1x4xf16> %380 = amdgpu.mfma %378 * %379 + %377 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %381 = vector.insert %380, %cst_9 [0, 0] : vector<16xf32> into vector<2x1x16xf32> %382 = vector.extract %262[1, 0] : vector<16xf32> from vector<2x1x16xf32> %383 = vector.extract %367[1, 0] : vector<4xf16> from vector<2x4x4xf16> %384 = amdgpu.mfma %383 * %370 + %382 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %385 = vector.extract %367[1, 1] : vector<4xf16> from vector<2x4x4xf16> %386 = amdgpu.mfma %385 * %373 + %384 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %387 = vector.extract %367[1, 2] : vector<4xf16> from vector<2x4x4xf16> %388 = amdgpu.mfma %387 * %376 + %386 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %389 = vector.extract %367[1, 3] : vector<4xf16> from vector<2x4x4xf16> %390 = amdgpu.mfma %389 * %379 + %388 {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> %391 = vector.insert %390, %381 [1, 0] : vector<16xf32> into vector<2x1x16xf32> %392 = vector.insert %215, %cst_9 [0, 0, 0] : f32 into vector<2x1x16xf32> %393 = vector.insert %215, %392 [0, 0, 1] : f32 into vector<2x1x16xf32> %394 = vector.insert %215, %393 [0, 0, 2] : f32 into 
vector<2x1x16xf32> %395 = vector.insert %215, %394 [0, 0, 3] : f32 into vector<2x1x16xf32> %396 = vector.insert %215, %395 [0, 0, 4] : f32 into vector<2x1x16xf32> %397 = vector.insert %215, %396 [0, 0, 5] : f32 into vector<2x1x16xf32> %398 = vector.insert %215, %397 [0, 0, 6] : f32 into vector<2x1x16xf32> %399 = vector.insert %215, %398 [0, 0, 7] : f32 into vector<2x1x16xf32> %400 = vector.insert %215, %399 [0, 0, 8] : f32 into vector<2x1x16xf32> %401 = vector.insert %215, %400 [0, 0, 9] : f32 into vector<2x1x16xf32> %402 = vector.insert %215, %401 [0, 0, 10] : f32 into vector<2x1x16xf32> %403 = vector.insert %215, %402 [0, 0, 11] : f32 into vector<2x1x16xf32> %404 = vector.insert %215, %403 [0, 0, 12] : f32 into vector<2x1x16xf32> %405 = vector.insert %215, %404 [0, 0, 13] : f32 into vector<2x1x16xf32> %406 = vector.insert %215, %405 [0, 0, 14] : f32 into vector<2x1x16xf32> %407 = vector.insert %215, %406 [0, 0, 15] : f32 into vector<2x1x16xf32> %408 = vector.insert %215, %407 [1, 0, 0] : f32 into vector<2x1x16xf32> %409 = vector.insert %215, %408 [1, 0, 1] : f32 into vector<2x1x16xf32> %410 = vector.insert %215, %409 [1, 0, 2] : f32 into vector<2x1x16xf32> %411 = vector.insert %215, %410 [1, 0, 3] : f32 into vector<2x1x16xf32> %412 = vector.insert %215, %411 [1, 0, 4] : f32 into vector<2x1x16xf32> %413 = vector.insert %215, %412 [1, 0, 5] : f32 into vector<2x1x16xf32> %414 = vector.insert %215, %413 [1, 0, 6] : f32 into vector<2x1x16xf32> %415 = vector.insert %215, %414 [1, 0, 7] : f32 into vector<2x1x16xf32> %416 = vector.insert %215, %415 [1, 0, 8] : f32 into vector<2x1x16xf32> %417 = vector.insert %215, %416 [1, 0, 9] : f32 into vector<2x1x16xf32> %418 = vector.insert %215, %417 [1, 0, 10] : f32 into vector<2x1x16xf32> %419 = vector.insert %215, %418 [1, 0, 11] : f32 into vector<2x1x16xf32> %420 = vector.insert %215, %419 [1, 0, 12] : f32 into vector<2x1x16xf32> %421 = vector.insert %215, %420 [1, 0, 13] : f32 into vector<2x1x16xf32> %422 = vector.insert %215, 
%421 [1, 0, 14] : f32 into vector<2x1x16xf32> %423 = vector.insert %215, %422 [1, 0, 15] : f32 into vector<2x1x16xf32> %424 = vector.load %11[%workgroup_id_y, %263] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %425 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 8)>()[%thread_id_x] %426 = vector.load %11[%workgroup_id_y, %425] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %427 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 16)>()[%thread_id_x] %428 = vector.load %11[%workgroup_id_y, %427] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %429 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 24)>()[%thread_id_x] %430 = vector.load %11[%workgroup_id_y, %429] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %431 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 32)>()[%thread_id_x] %432 = vector.load %11[%workgroup_id_y, %431] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %433 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 40)>()[%thread_id_x] %434 = vector.load %11[%workgroup_id_y, %433] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %435 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 48)>()[%thread_id_x] %436 = vector.load %11[%workgroup_id_y, %435] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %437 = affine.apply affine_map<()[s0] -> (((s0 mod 64) floordiv 32) * 4 + 56)>()[%thread_id_x] %438 = vector.load %11[%workgroup_id_y, %437] : memref<20x64xf16, #hal.descriptor_type>, vector<4xf16> %439 = vector.extract %424[0] : f16 from vector<4xf16> %440 = vector.insert %439, %cst_13 [0, 0, 0] : f16 into vector<2x1x16xf16> %441 = vector.extract %424[1] : f16 from vector<4xf16> %442 = vector.insert %441, %440 [0, 0, 1] : f16 into vector<2x1x16xf16> %443 = vector.extract %424[2] : f16 from vector<4xf16> %444 = vector.insert %443, %442 [0, 0, 2] : f16 into vector<2x1x16xf16> %445 = 
vector.extract %424[3] : f16 from vector<4xf16> %446 = vector.insert %445, %444 [0, 0, 3] : f16 into vector<2x1x16xf16> %447 = vector.extract %426[0] : f16 from vector<4xf16> %448 = vector.insert %447, %446 [0, 0, 4] : f16 into vector<2x1x16xf16> %449 = vector.extract %426[1] : f16 from vector<4xf16> %450 = vector.insert %449, %448 [0, 0, 5] : f16 into vector<2x1x16xf16> %451 = vector.extract %426[2] : f16 from vector<4xf16> %452 = vector.insert %451, %450 [0, 0, 6] : f16 into vector<2x1x16xf16> %453 = vector.extract %426[3] : f16 from vector<4xf16> %454 = vector.insert %453, %452 [0, 0, 7] : f16 into vector<2x1x16xf16> %455 = vector.extract %428[0] : f16 from vector<4xf16> %456 = vector.insert %455, %454 [0, 0, 8] : f16 into vector<2x1x16xf16> %457 = vector.extract %428[1] : f16 from vector<4xf16> %458 = vector.insert %457, %456 [0, 0, 9] : f16 into vector<2x1x16xf16> %459 = vector.extract %428[2] : f16 from vector<4xf16> %460 = vector.insert %459, %458 [0, 0, 10] : f16 into vector<2x1x16xf16> %461 = vector.extract %428[3] : f16 from vector<4xf16> %462 = vector.insert %461, %460 [0, 0, 11] : f16 into vector<2x1x16xf16> %463 = vector.extract %430[0] : f16 from vector<4xf16> %464 = vector.insert %463, %462 [0, 0, 12] : f16 into vector<2x1x16xf16> %465 = vector.extract %430[1] : f16 from vector<4xf16> %466 = vector.insert %465, %464 [0, 0, 13] : f16 into vector<2x1x16xf16> %467 = vector.extract %430[2] : f16 from vector<4xf16> %468 = vector.insert %467, %466 [0, 0, 14] : f16 into vector<2x1x16xf16> %469 = vector.extract %430[3] : f16 from vector<4xf16> %470 = vector.insert %469, %468 [0, 0, 15] : f16 into vector<2x1x16xf16> %471 = vector.extract %432[0] : f16 from vector<4xf16> %472 = vector.insert %471, %470 [1, 0, 0] : f16 into vector<2x1x16xf16> %473 = vector.extract %432[1] : f16 from vector<4xf16> %474 = vector.insert %473, %472 [1, 0, 1] : f16 into vector<2x1x16xf16> %475 = vector.extract %432[2] : f16 from vector<4xf16> %476 = vector.insert %475, %474 [1, 0, 
2] : f16 into vector<2x1x16xf16> %477 = vector.extract %432[3] : f16 from vector<4xf16> %478 = vector.insert %477, %476 [1, 0, 3] : f16 into vector<2x1x16xf16> %479 = vector.extract %434[0] : f16 from vector<4xf16> %480 = vector.insert %479, %478 [1, 0, 4] : f16 into vector<2x1x16xf16> %481 = vector.extract %434[1] : f16 from vector<4xf16> %482 = vector.insert %481, %480 [1, 0, 5] : f16 into vector<2x1x16xf16> %483 = vector.extract %434[2] : f16 from vector<4xf16> %484 = vector.insert %483, %482 [1, 0, 6] : f16 into vector<2x1x16xf16> %485 = vector.extract %434[3] : f16 from vector<4xf16> %486 = vector.insert %485, %484 [1, 0, 7] : f16 into vector<2x1x16xf16> %487 = vector.extract %436[0] : f16 from vector<4xf16> %488 = vector.insert %487, %486 [1, 0, 8] : f16 into vector<2x1x16xf16> %489 = vector.extract %436[1] : f16 from vector<4xf16> %490 = vector.insert %489, %488 [1, 0, 9] : f16 into vector<2x1x16xf16> %491 = vector.extract %436[2] : f16 from vector<4xf16> %492 = vector.insert %491, %490 [1, 0, 10] : f16 into vector<2x1x16xf16> %493 = vector.extract %436[3] : f16 from vector<4xf16> %494 = vector.insert %493, %492 [1, 0, 11] : f16 into vector<2x1x16xf16> %495 = vector.extract %438[0] : f16 from vector<4xf16> %496 = vector.insert %495, %494 [1, 0, 12] : f16 into vector<2x1x16xf16> %497 = vector.extract %438[1] : f16 from vector<4xf16> %498 = vector.insert %497, %496 [1, 0, 13] : f16 into vector<2x1x16xf16> %499 = vector.extract %438[2] : f16 from vector<4xf16> %500 = vector.insert %499, %498 [1, 0, 14] : f16 into vector<2x1x16xf16> %501 = vector.extract %438[3] : f16 from vector<4xf16> %502 = vector.insert %501, %500 [1, 0, 15] : f16 into vector<2x1x16xf16> %503 = memref.load %12[] : memref> %504 = arith.divf %cst_3, %423 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>, <[ BATCHY, LANEX], [1, 32]>>, #iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>, <[ BATCHY, LANEX], [1, 
32]>>], [#iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>, <[ BATCHY, LANEX], [1, 32]>>]]} : vector<2x1x16xf32> %505 = arith.mulf %504, %391 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf32> %506 = arith.truncf %505 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf32> to vector<2x1x16xf16> %507 = arith.mulf %506, %502 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %508 = arith.truncf %503 : f32 to f16 %509 = vector.broadcast %508 : f16 to vector<2x1x16xf16> %510 = arith.divf %507, %509 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %511 = math.roundeven %510 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 
32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %512 = arith.cmpf ult, %511, %cst_2 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %513 = arith.select %512, %cst_2, %511 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xi1>, vector<2x1x16xf16> %514 = arith.cmpf ugt, %513, %cst_1 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> %515 = arith.select %514, %cst_1, %513 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>, #iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xi1>, 
vector<2x1x16xf16> %516 = arith.fptosi %515 {__vector_layout_fetcher_storage = [[#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>], [#iree_vector_ext.layout<<[ BATCHY, LANEX], [1, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>]]} : vector<2x1x16xf16> to vector<2x1x16xi8> %517 = vector.extract_strided_slice %516 {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %518 = vector.extract %517[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %518, %13[%workgroup_id_x, %17, %workgroup_id_y, %263] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %519 = vector.extract_strided_slice %516 {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %520 = vector.extract %519[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %520, %13[%workgroup_id_x, %17, %workgroup_id_y, %425] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %521 = vector.extract_strided_slice %516 {offsets = [0, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %522 = vector.extract %521[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %522, %13[%workgroup_id_x, %17, %workgroup_id_y, %427] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %523 = vector.extract_strided_slice %516 {offsets = [0, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %524 = vector.extract %523[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %524, %13[%workgroup_id_x, %17, %workgroup_id_y, %429] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %525 = vector.extract_strided_slice %516 {offsets = [1, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : 
vector<2x1x16xi8> to vector<1x1x4xi8> %526 = vector.extract %525[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %526, %13[%workgroup_id_x, %17, %workgroup_id_y, %431] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %527 = vector.extract_strided_slice %516 {offsets = [1, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %528 = vector.extract %527[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %528, %13[%workgroup_id_x, %17, %workgroup_id_y, %433] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %529 = vector.extract_strided_slice %516 {offsets = [1, 0, 8], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %530 = vector.extract %529[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %530, %13[%workgroup_id_x, %17, %workgroup_id_y, %435] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> %531 = vector.extract_strided_slice %516 {offsets = [1, 0, 12], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<2x1x16xi8> to vector<1x1x4xi8> %532 = vector.extract %531[0, 0] : vector<4xi8> from vector<1x1x4xi8> vector.store %532, %13[%workgroup_id_x, %17, %workgroup_id_y, %437] : memref<2x1024x20x64xi8, strided<[1310720, 1280, 64, 1], offset: ?>, #hal.descriptor_type>, vector<4xi8> return }