 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=thread}, canonicalize, cse))" %s | FileCheck %s --check-prefix=THREAD
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=subgroup}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SUBGROUP
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=partial_reduction}, canonicalize, cse))" %s | FileCheck %s --check-prefix=PARTRED
+// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{normalize-loops}, canonicalize, cse))" %s | FileCheck %s --check-prefix=NORM-REDUCTION
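+// The NORM-REDUCTION run sets no tiling-level, so the pass tiles at its
+// default (reduction) level, with the normalize-loops option enabled.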
 
 #config = #iree_gpu.lowering_config<{thread = [2, 16], subgroup = [2, 16]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
@@ -536,3 +539,70 @@ func.func @partial_reduction(%3: tensor<?x?xf32>) -> tensor<?xf32> {
 //       PARTRED:   scf.yield
 //       PARTRED:   linalg.reduce ins(%[[OUT]] : tensor<?x8xf32>)
 //  PARTRED-SAME:                 outs(%[[FULL]] : tensor<?xf32>)
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
+func.func @swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
+  %empty = tensor.empty() : tensor<32x2592xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
+  return %0: tensor<32x2592xf32>
+}
+
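+// The reduction tile size 32 divides the innermost expanded size 288, so each
+// of the 2592/32 = 81 slices maps to a contiguous <32x1x1x32> slice of the
+// expanded tensor, letting the extract_slice swap above the collapse_shape.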
+// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice
+//   NORM-REDUCTION-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   NORM-REDUCTION-DAG:   %[[C81:.+]] = arith.constant 81 : index
+//   NORM-REDUCTION-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//       NORM-REDUCTION:   scf.for %[[ARG1:.+]] = %[[C0]] to %[[C81]] step %[[C1]]
+//       NORM-REDUCTION:     %[[APPLY:.+]] = affine.apply affine_map<(d0) -> (d0 * 32)>(%[[ARG1]])
+//       NORM-REDUCTION:     %[[IDX:.+]]:3 = affine.delinearize_index %[[APPLY]] into (3, 3, 288) : index, index, index
+//       NORM-REDUCTION:     %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[0, %[[IDX]]#0, %[[IDX]]#1, %[[IDX]]#2] [32, 1, 1, 32] [1, 1, 1, 1] : tensor<32x3x3x288xf32> to tensor<32x1x1x32xf32>
+//       NORM-REDUCTION:     %[[COLLAPSE:.+]] = tensor.collapse_shape %[[SLICE]] {{\[}}[0], [1, 2, 3]] : tensor<32x1x1x32xf32> into tensor<32x32xf32>
+//       NORM-REDUCTION:     linalg.copy {{.*}} ins(%[[COLLAPSE]]
+
+// Without loop normalization, no swap happens: the collapse_shape stays outside the loop.
+//                CHECK:   tensor.collapse_shape
+//                CHECK:   scf.for
+//                CHECK:     tensor.extract_slice
+//            CHECK-NOT:     tensor.collapse_shape
+//                CHECK:     linalg.copy
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 30]}>
+func.func @no_swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
+  %empty = tensor.empty() : tensor<32x2592xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
+  return %0: tensor<32x2592xf32>
+}
+
+// No swap happens when the collapsed size is not divisible by the offset multiplier (2592 % 30 != 0).
+// NORM-REDUCTION-LABEL: func.func @no_swap_collapse_shape_with_extract_slice
+//       NORM-REDUCTION:   tensor.collapse_shape
+//       NORM-REDUCTION:   scf.for
+//       NORM-REDUCTION:     tensor.extract_slice
+//   NORM-REDUCTION-NOT:     tensor.collapse_shape
+//       NORM-REDUCTION:     linalg.copy
+
+// -----
+
+#config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
+func.func @no_swap_collapse_shape_with_extract_slice_2(%arg0: tensor<32x2x2x16xf32>) -> tensor<32x64xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x2x2x16xf32> into tensor<32x64xf32>
+  %empty = tensor.empty() : tensor<32x64xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x64xf32>) outs(%empty : tensor<32x64xf32>) -> tensor<32x64xf32>
+  return %0: tensor<32x64xf32>
+}
+
+// No swap happens when the last expanded size is not divisible by the collapse size (16 % 32 != 0).
+// NORM-REDUCTION-LABEL: func.func @no_swap_collapse_shape_with_extract_slice_2
+//       NORM-REDUCTION:   tensor.collapse_shape
+//       NORM-REDUCTION:   scf.for
+//       NORM-REDUCTION:     tensor.extract_slice
+//   NORM-REDUCTION-NOT:     tensor.collapse_shape
+//       NORM-REDUCTION:     linalg.copy