Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMD] Define an extract slice operation #4804

Merged
merged 16 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,7 @@ jobs:
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py \
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/integration-tests.yml.in
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ jobs:
echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
fi
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py \
Expand Down
111 changes: 111 additions & 0 deletions test/Conversion/amd/invalid_extractslice_to_llvm.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// RUN: triton-opt -split-input-file %s --convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics

// Invalid size: the result is only 2 elements wide in dimension 1, which is not
// a multiple of the 16-element tile extent reported in the diagnostic below.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{sizes [256, 2] must be a multiple of shapePerCTATile [256, 16]}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x2xi32, #blocked1>
tt.return
}

// -----

// Invalid zero source dimension: the source is 256x0, so dimension 1 has no
// elements to slice from. (Function renamed from the copy-pasted
// "invalid_size_input" so the name matches what is being tested.)
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_zero_source_dim(%arg0: tensor<256x0xi32, #blocked1> {tt.divisibility = 16 : i32}) {
  // expected-error @+1 {{source tensor dimension size zero at dimension 1}}
  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x0xi32, #blocked1> to tensor<256x16xi32, #blocked1>
  tt.return
}

// -----

// Invalid zero result dimension: the requested slice is 256x0, i.e. empty in
// dimension 1. (Function renamed from the copy-pasted "invalid_size_input" so
// the name matches what is being tested.)
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_zero_result_dim(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
  // expected-error @+1 {{result tensor dimension size zero at dimension 1}}
  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x0xi32, #blocked1>
  tt.return
}

// -----

// Invalid offset, not multiple of shapePerTile: offset 5 in dimension 1 is not
// a multiple of the 16-element tile extent reported in the diagnostic below.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{offset [0, 5] must be a multiple of shapePerCTATile [256, 16]}}
%1 = amdgpu.extract_slice %arg0 [0,5] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}

// -----

// Invalid offset, out of bounds for dimension: starting at 128 in dimension 1,
// a 16-wide slice would run past the end of the 128-element source dimension.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{invalid offset 128 at dimension 1}}
%1 = amdgpu.extract_slice %arg0 [0,128] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}

// -----

// Invalid result layout: the source uses #blocked1 (sizePerThread = [8, 1])
// while the result asks for #blocked2 (sizePerThread = [4, 1]); the slice must
// keep the layout of its source.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_result_layout(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result layout must match source layout}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked2>
tt.return
}

// -----

// Invalid result element type: the source holds i32 elements but the result
// type asks for i64.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_result_element_type(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result element type must match source element type}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi64, #blocked1>
tt.return
}

// -----

// Invalid result rank: a 2D source (256x128) cannot produce a 3D result
// (256x16x2).
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_result_rank(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{result rank must be equal to source rank}}
%1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16x2xi32, #blocked1>
tt.return
}

// -----

// Invalid result shape: the result is 256 wide in dimension 1 while the source
// only has 128 elements there. (Function renamed from the copy-pasted
// "invalid_result_rank" so the name matches what is being tested.)
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_result_shape(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
  // expected-error @+1 {{result shape cannot be larger than input shape at dimension 1}}
  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x256xi32, #blocked1>
  tt.return
}

// -----

// Invalid rank: source and result are both 3D and agree in rank, element type
// and layout attribute, but the op is currently restricted to 2D tensors.
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_rank(%arg0: tensor<256x128x2xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// expected-error @+1 {{currently only 2D tensors are supported}}
%1 = amdgpu.extract_slice %arg0 [0,0,0] : tensor<256x128x2xi32, #blocked1> to tensor<256x16x2xi32, #blocked1>
tt.return
}

// -----

// Invalid non static offset: offsets are written as integer literals in the
// op's syntax, so an SSA value such as %arg1 inside the offset list is rejected
// while parsing (two diagnostics are reported on the same line).
#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}, %arg1: i32) {
// expected-error @+2 {{expected ']'}}
// expected-error @+1 {{expected integer value}}
%2 = amdgpu.extract_slice %arg0 [%arg1, 0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}
14 changes: 14 additions & 0 deletions test/TritonGPU/amd/amd-extractslice-op.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s

#blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
module attributes {"triton_gpu.compute-capability" = 0 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func @basic_insert_slice(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
// CHECK: llvm.func @basic_insert_slice
// CHECK-COUNT-64: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)>
// CHECK: %64 = llvm.mlir.undef : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
// CHECK-COUNT-8: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
%72 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
tt.return
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,12 @@ include "mlir/IR/EnumAttr.td"
include "triton/Dialect/Triton/IR/TritonTypes.td"
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td" // Pure
hmalgewatta marked this conversation as resolved.
Show resolved Hide resolved
include "triton/Dialect/Triton/IR/TritonInterfaces.td"
include "TritonAMDGPUDialect.td"
include "TritonAMDGPUAttrDefs.td"


class TT_AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
Op<TritonAMDGPU_Dialect, mnemonic, !listconcat(traits, [])> {
}
Expand All @@ -44,6 +46,74 @@ class TT_AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
//
def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;

//===----------------------------------------------------------------------===//
// ExtractSliceOp
//===----------------------------------------------------------------------===//

def ExtractSliceOp
    : TT_AMDGPU_Op<"extract_slice", [Pure]> {
  let summary = "extract slice operation";
  let description = [{
    The "extract_slice" operation enables extracting a slice of a tensor in
    registers.

    The "extract_slice" operation supports the following arguments:

    * source: the base tensor on which to create a view tensor
    * offsets: offsets into the base tensor at which to create the view

    The size of the slice is determined by the result type. The verifier
    currently requires 2D tensors, the same element type and layout for source
    and result, and offsets and result sizes that are multiples of the layout's
    shapePerCTATile (clamped to the source shape).

    Example 1:

    ```mlir
    #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8],
        threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
    #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8],
        threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [0, 1]}>
    %1 = triton_gpu.convert_layout %0 : tensor<128x128xf16, #blocked>
                                    -> tensor<128x128xf16, #blocked1>
    // create a slice of base tensor %1 with static offsets
    %2 = amdgpu.extract_slice %1 [0, 0] :
      tensor<128x128xf16, #blocked1> to tensor<128x32xf16, #blocked1>
    ```

    Example 1 shows how "extract_slice" operation may be used. In this example a
    new slice of 128x32 is created. "extract_slice" works on tensors with layout
    where the desired slice has the same layout as the source tensor.
    "%0" cannot be sliced directly as the resulting slice cannot have the same
    layout as "%0". Therefore it needs to be converted to a layout suitable
    for slicing. "#blocked1" layout is appropriate for this as it keeps the
    sizePerThread the same thus keeping coalescing properties the same.
    In order to utilize all threads in a warp, "threadsPerWarp" is set to
    [16,4] for this new layout. This layout conversion carried out before
    using "extract_slice" ensures slicing still uses all threads efficiently. The
    size of the slice is determined by the result type.
  }];

  let arguments = (ins AnyRankedTensor:$source,
                       DenseI64ArrayAttr:$static_offsets);
  let results = (outs AnyRankedTensor:$result);

  let builders = [
      // Build an ExtractSliceOp with static offsets and the same result type
      OpBuilder<(ins "RankedTensorType":$resultType,
          "Value":$source,
          "ArrayRef<int64_t>": $static_offsets)>,
  ];

  let extraClassDeclaration = [{
    std::array<unsigned, 3> getArrayAttrMaxRanks() {
      unsigned rank = getSource().getType().getRank();
      return {rank, rank, rank};
    }
  }];

  let assemblyFormat = [{
    $source $static_offsets attr-dict `:` type($source) `to` type($result)
  }];

  let hasVerifier = 1;
}

def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
let summary = "A placeholder op for instruction scheduling hints within a basic block";
let description = [{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef TRITONAMDGPU_TO_LLVM_PATTERNS_AMDGPU_OP_TO_LLVM_H
#define TRITONAMDGPU_TO_LLVM_PATTERNS_AMDGPU_OP_TO_LLVM_H

#include "mlir/Conversion/LLVMCommon/TypeConverter.h"

namespace mlir::triton::AMD {

/// Adds the ExtractSliceOp-to-LLVM conversion pattern(s) to `patterns`.
///
/// NOTE(review): only the declaration is visible here; judging by the name and
/// parameters, `typeConverter` is the type converter handed to the patterns and
/// `benefit` the priority they are registered with -- confirm against the
/// definition.
void populateExtractSliceOpToLLVMPatterns(
mlir::LLVMTypeConverter &typeConverter, mlir::RewritePatternSet &patterns,
mlir::PatternBenefit benefit);

} // namespace mlir::triton::AMD

#endif
84 changes: 82 additions & 2 deletions third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/OperationSupport.h"

#include "llvm/ADT/TypeSwitch.h"

#include "triton/Conversion/TritonGPUToLLVM/Utility.h"

// clang-format off
#include "Dialect/TritonAMDGPU/IR/Dialect.h"
#include "Dialect/TritonAMDGPU/IR/Dialect.cpp.inc"
Expand All @@ -53,3 +53,83 @@ void mlir::triton::amdgpu::TritonAMDGPUDialect::initialize() {

#define GET_OP_CLASSES
#include "Dialect/TritonAMDGPU/IR/Ops.cpp.inc"

namespace mlir::triton::amdgpu {

/// Verifies the static constraints of `amdgpu.extract_slice`.
///
/// Checks, in this order:
///  * source and result agree in element type, layout and rank;
///  * the tensors are 2D and exactly two static offsets are given;
///  * no source or result dimension has size zero, and the result does not
///    exceed the source in any dimension;
///  * each offset is non-negative and the slice stays inside the source;
///  * result sizes and offsets are multiples of the layout's shapePerCTATile
///    (clamped to the source shape).
///
/// Returns success() when all checks pass; otherwise emits a diagnostic on the
/// op and returns failure.
LogicalResult ExtractSliceOp::verify() {
  auto srcTy = getSource().getType();
  auto srcLayout = srcTy.getEncoding();
  auto srcElementType = getElementTypeOrSelf(srcTy);
  auto resultTy = getResult().getType();
  auto resultLayout = resultTy.getEncoding();
  auto resultElementType = getElementTypeOrSelf(resultTy);

  if (srcElementType != resultElementType) {
    return emitError("result element type must match source element type");
  }
  if (srcLayout != resultLayout) {
    return emitError("result layout must match source layout");
  }
  if (srcTy.getRank() != resultTy.getRank()) {
    return emitError("result rank must be equal to source rank");
  }
  if (srcTy.getRank() != 2) {
    return emitError("currently only 2D tensors are supported");
  }

  auto srcShape = srcTy.getShape();
  auto shapePerCTATile =
      mlir::triton::gpu::getShapePerCTATile(srcLayout, srcShape);
  // Clamp the tile to the tensor so that a tensor smaller than one tile can
  // still be sliced as a whole.
  shapePerCTATile[0] =
      std::min(static_cast<unsigned>(srcShape[0]), shapePerCTATile[0]);
  shapePerCTATile[1] =
      std::min(static_cast<unsigned>(srcShape[1]), shapePerCTATile[1]);

  // ExtractSlice only supports slicing where offsets and sizes are multiples of
  // shapePerCTATile. This condition ensures that slice has the same layout as
  // the original tensor.

  auto offsets = getStaticOffsets();
  if (offsets.size() != 2) {
    return emitError("invalid offset shape ") << offsets;
  }

  SmallVector<int64_t, 2> sizes;
  for (auto i = 0; i < 2; ++i) {
    auto resultDimSize = resultTy.getDimSize(i);
    auto srcDimSize = srcTy.getDimSize(i);
    // The zero-size checks also guarantee that the clamped tile extents used
    // as divisors below are non-zero.
    if (resultDimSize == 0) {
      return emitError("result tensor dimension size zero at dimension ") << i;
    }
    if (srcDimSize == 0) {
      return emitError("source tensor dimension size zero at dimension ") << i;
    }
    if (resultDimSize > srcDimSize) {
      return emitError(
                 "result shape cannot be larger than input shape at dimension ")
             << i;
    }
    // A negative offset must be rejected explicitly: e.g. offset -16 with a
    // 16-wide slice satisfies the upper-bound test, and -16 % 16 == 0 would
    // also satisfy the tile-multiple test below.
    if (offsets[i] < 0 || offsets[i] + resultDimSize > srcDimSize) {
      return emitError("invalid offset ")
             << offsets[i] << " at dimension " << i;
    }
    sizes.push_back(resultDimSize);
  }

  if (sizes[0] % shapePerCTATile[0] != 0 ||
      sizes[1] % shapePerCTATile[1] != 0) {
    return emitError() << "sizes [" << sizes
                       << "] must be a multiple of shapePerCTATile ["
                       << shapePerCTATile << "]";
  }

  if (offsets[0] % shapePerCTATile[0] != 0 ||
      offsets[1] % shapePerCTATile[1] != 0) {
    return emitError() << "offset [" << offsets
                       << "] must be a multiple of shapePerCTATile ["
                       << shapePerCTATile << "]";
  }

  return success();
}
} // namespace mlir::triton::amdgpu
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
add_triton_library(TritonAMDGPUDialectToLLVM
TritonAMDGPUToLLVMPatterns.cpp
ExtractSliceOpToLLVM.cpp

DEPENDS
TritonAMDGPUIR
Expand Down
Loading
Loading