[XPU][OptRed] Define triton_intel_gpu.simd_reduce and use in optimized transposed reduction

Define a SIMD transpose-reduce operation that performs a SIMD reduction while transposing
the implicit SIMD matrix. See the operation's description for further context.

Using this operation in the transposed reduction pass allows us to perform the optimization
without using SLM.

Signed-off-by: victor-eds <victor.perez@codeplay.com>
Co-authored-by: chengjunlu <chengjun.lu@intel.com>
Signed-off-by: Victor Perez <victor.perez@codeplay.com>
victor-eds and chengjunlu committed Dec 3, 2024
1 parent 6588f0d commit 114add9
Showing 10 changed files with 1,486 additions and 161 deletions.
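
As a quick picture before the diffs, here is a minimal use of the new op, mirroring the `test_single` case added below (the function name here is illustrative; layouts and op syntax are taken verbatim from the new tests):

```mlir
#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
  tt.func @simd_reduce_example(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16xf32, #blocked1> {
    // Warp-only reduction along axis 0: shape[0] == threadsPerWarp[0] == 16
    // (the sub-group size), so no shared local memory (SLM) is needed.
    %0 = triton_intel_gpu.simd_reduce add %arg0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
    tt.return %0 : tensor<16xf32, #blocked1>
  }
}
```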
24 changes: 24 additions & 0 deletions test/Conversion/intel/simd-reduce.mlir
@@ -0,0 +1,24 @@
// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s

// Basic 16x16 SIMD reduction.

#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
// CHECK-LABEL: llvm.func spir_kernelcc @test_single(
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct
// CHECK: %[[VAL_17:.*]] = llvm.mlir.poison : vector<16xf32>
// COM: Check we insert all tensor elements in a vector:
// CHECK-COUNT-16: llvm.insertelement
// CHECK: %[[VAL_50:.*]] = llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "{\0A.decl temp_result v_type=G type=f num_elts=128 align=wordx32\0Aadd (M1_NM, 16) temp_result(0, 0)<1> $1(0, 0)<16;8,1> $1(0, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(1, 0)<1> $1(2, 0)<16;8,1> $1(2, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(2, 0)<1> $1(4, 0)<16;8,1> $1(4, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(3, 0)<1> $1(6, 0)<16;8,1> $1(6, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(4, 0)<1> $1(8, 0)<16;8,1> $1(8, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(5, 0)<1> $1(10, 0)<16;8,1> $1(10, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(6, 0)<1> $1(12, 0)<16;8,1> $1(12, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(7, 0)<1> $1(14, 0)<16;8,1> $1(14, 8)<16;8,1>\0Aadd (M1_NM, 16) temp_result(0, 0)<1> temp_result(0, 0)<8;4,1> temp_result(0, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(1, 0)<1> temp_result(2, 0)<8;4,1> temp_result(2, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(2, 0)<1> temp_result(4, 0)<8;4,1> temp_result(4, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(3, 0)<1> temp_result(6, 0)<8;4,1> temp_result(6, 4)<8;4,1>\0Aadd (M1_NM, 16) temp_result(0, 0)<1> temp_result(0, 0)<4;2,1> temp_result(0, 2)<4;2,1>\0Aadd (M1_NM, 16) temp_result(1, 0)<1> temp_result(2, 0)<4;2,1> temp_result(2, 2)<4;2,1>\0Aadd (M1_NM, 16) $0(0, 0)<1> temp_result(0, 0)<2;1,0> temp_result(0, 1)<2;1,0>\0A}", "=rw,rw" %{{.*}} : (vector<16xf32>) -> f32
// COM: Check we obtain a single result, i.e., the SIMD reduction minimizes register usage.
// CHECK: %[[VAL_51:.*]] = llvm.mlir.undef : !llvm.struct<(f32)>
// CHECK: %[[VAL_52:.*]] = llvm.insertvalue %[[VAL_50]], %[[VAL_51]][0] : !llvm.struct<(f32)>
// CHECK: llvm.return %[[VAL_52]] : !llvm.struct<(f32)>
// CHECK: }
tt.func @test_single(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16xf32, #blocked1> {
%0 = triton_intel_gpu.simd_reduce add %arg0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
tt.return %0 : tensor<16xf32, #blocked1>
}
}
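
For contrast, here is a rough sketch (not part of this commit) of the same reduction written with the generic `tt.reduce`. The `#ttg.slice` result layout is the conventional one for `tt.reduce` and is an assumption here. As the new op's description notes, a `tt.reduce` warp reduction produces a uniform result replicated across the sub-group, whereas `triton_intel_gpu.simd_reduce` leaves one distinct element per lane.

```mlir
#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>

// Generic tt.reduce form; the combiner is a region rather than an attribute.
%1 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
^bb0(%a: f32, %b: f32):
  %sum = arith.addf %a, %b : f32
  tt.reduce.return %sum : f32
}) : (tensor<16x16xf32, #blocked>) -> tensor<16xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
```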
289 changes: 289 additions & 0 deletions test/TritonIntelGPU/optimize-reduction-simd.mlir


303 changes: 153 additions & 150 deletions test/TritonIntelGPU/optimize-reduction.mlir


14 changes: 14 additions & 0 deletions test/TritonIntelGPU/tritonintelgpu.mlir
@@ -58,3 +58,17 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
tt.return %res : tensor<16x16xf16>
}
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
tt.func @triton_intel_gpu.simd_reduce(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16xf32, #blocked1> {
// CHECK-LABEL: @triton_intel_gpu.simd_reduce
// CHECK: triton_intel_gpu.simd_reduce add %{{.*}} axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
%0 = triton_intel_gpu.simd_reduce add %arg0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
tt.return %0 : tensor<16xf32, #blocked1>
}
}
@@ -12,6 +12,7 @@
include "triton/Dialect/Triton/IR/TritonTypes.td"
include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td"
include "intel/include/Dialect/TritonGEN/IR/TritonGENAttrDefs.td"
include "intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td"
include "intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -202,4 +203,66 @@ def TTIG_SubGroupTransposeOp
let hasVerifier = 1;
}

def TTIG_SIMDReduceOp : TTIG_Op<"simd_reduce", [Pure, SameOperandsAndResultElementType]> {
let summary = "SIMD reduction.";
let description = [{
The `triton_intel_gpu.simd_reduce` operation performs a SIMD reduction.
Contrary to `tt.reduce`, the result of a warp reduction is non-uniform,
i.e., it is not replicated across the work-items in the sub-group.

The reduction axis must be chosen such that only a warp reduction is
performed, i.e., `sizePerThread[axis]`, `warpsPerCTA[axis]` and
`CTAsPerCGA[axis]` must be 1; and `shape[axis]` and `threadsPerWarp[axis]`
must be equal to the sub-group size.

The output type must be compatible with the performed reduction; ensuring
this is up to the user. As a rule of thumb, the total number of elements
in the output tensor must be smaller than in the input tensor by a factor
of the sub-group size. Users should bear in mind that a tensor like:

```
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
```

would be reduced to:

```
t0 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15
```

Example:
```mlir
#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
triton_intel_gpu.simd_reduce add %0 axis = 0 : tensor<16x16xf32, #blocked> -> tensor<16xf32, #blocked1>
// 3D reduction:
#blocked = #ttg.blocked<{sizePerThread = [1, 16, 1], threadsPerWarp = [16, 1, 1], warpsPerCTA = [1, 1, 2], order = [0, 1, 2]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 1], warpsPerCTA = [1, 2], order = [0, 1]}>
triton_intel_gpu.simd_reduce add %0 axis = 0 : tensor<16x16x2xf32, #blocked> -> tensor<16x2xf32, #blocked1>
```
}];
let arguments = (ins TT_Tensor:$src,
TritonGEN_ReduceKindAttr:$op,
I32Attr:$axis);
let results = (outs TT_Tensor:$res);
let assemblyFormat = [{
$op $src `axis` `=` $axis attr-dict `:` type($src) `->` type($res)
}];
}

#endif