Skip to content

Commit

Permalink
[Codegen] Add interface tensor reshape foldings to TileAndDistribute (i…
Browse files Browse the repository at this point in the history
…ree-org#17758)

This PR adds patterns to TileAndDistributeToWorkgroups that fold reshapes
into interface tensors. If there are reshapes between interface tensors and
their users, TileAndDistributeToWorkgroups can fail, so these patterns
preprocess the input IR into a form that can be distributed. The patterns
can create duplicate interface binding ops, so a CSE pass is added after
every use of TileAndDistributeToWorkgroups in the pass pipelines.

---------

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
  • Loading branch information
Max191 authored Jul 19, 2024
1 parent 4a13331 commit 5ea0b21
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,15 @@ void TileAndDistributeToWorkgroupsPass::runOnOperation() {

auto funcOp = getOperation();

{
RewritePatternSet patterns(context);
populateReshapeToInterfaceTensorPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
funcOp.emitOpError("reshape to interface tensor patterns failed");
return signalPassFailure();
}
}

// TODO(MaheshRavishankar): The logic of lowering workgroup count
// needs to be moved out of this pass. Once this is moved to
// use scf.forall, this logic can be moved to the scf.forall
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2663,3 +2663,68 @@ hal.executable private @set_size_to_tilesize_when_divisible {
// NO-LOOP: %[[RESULT:.+]] = linalg.generic
// NO-LOOP: -> tensor<1x16x128xf16>
// NO-LOOP: flow.dispatch.tensor.store %[[RESULT]], %{{.+}}, offsets = [%[[IDX_Y]], 0, %[[OFFX]]]

// -----

// Test case: a matmul whose LHS comes from a 3-D interface tensor load
// (64x2x256) that is collapsed to 2-D (128x256) before use. The
// tensor.collapse_shape sits between the interface tensor and its user, which
// is the situation the reshape-to-interface-tensor folding patterns target
// before tiling and distribution.
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 16, 0], [16, 8, 0], [0, 0, 2]]>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
#hal.descriptor_set.binding<1, storage_buffer>,
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64">
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @reshape_matmul_tensors {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
hal.executable.export public @reshape_matmul layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @reshape_matmul() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
// Bindings 0 and 1 are the read-only matmul operands; binding 2 is the
// write-only result.
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<64x2x256xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<256x512xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
: !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 2, 256], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:tensor<64x2x256xf32>> -> tensor<64x2x256xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1]
: !flow.dispatch.tensor<readonly:tensor<256x512xf32>> -> tensor<256x512xf32>
// Reshape between the load of binding 0 and the matmul LHS: the two leading
// dimensions (64x2) are collapsed into one (128).
%collapsed = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<64x2x256xf32> into tensor<128x256xf32>
%5 = tensor.empty() : tensor<128x512xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
%7 = linalg.matmul {lowering_config = #config}
ins(%collapsed, %4 : tensor<128x256xf32>, tensor<256x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1]
: tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
return
}
}
}
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 32)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 16)>
// CHECK: hal.executable.export public @reshape_matmul
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device)
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
// CHECK: hal.return %[[C32]], %[[C4]], %[[C1]]
// CHECK: func.func @reshape_matmul()
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %{{.+}}, offsets = [%[[IV0]], 0], sizes = [32, 256]
// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %{{.+}}, offsets = [0, %[[IV1]]], sizes = [256, 16]
// CHECK-DAG: %[[INIT:.+]] = tensor.empty
// CHECK-DAG: %[[FILL:.+]] = linalg.fill
// CHECK-SAME: outs(%[[INIT]] :
// CHECK-DAG: %[[GEMM:.+]] = linalg.matmul
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: flow.dispatch.tensor.store %[[GEMM]]
// CHECK-SAME: offsets = [%[[IV0]], %[[IV1]]], sizes = [32, 16]
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ static llvm::cl::opt<bool> clForceArmStreaming(

static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass());
funcPassManager.addPass(createCSEPass());
funcPassManager.addPass(createConvertToDestinationPassingStylePass());
funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
funcPassManager.addPass(createCanonicalizerPass());
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ tileAndDistributeToWorkgroup(OpPassManager &funcPassManager,
funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
kNumMaxParallelDims,
linalg::DistributionMethod::CyclicNumProcsEqNumIters));
funcPassManager.addPass(createCSEPass());

funcPassManager.addPass(createConvertToDestinationPassingStylePass(
useWARForCooperativeMatrixCodegen));
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ static void addTileAndDistributeToWorkgroupsPasses(
funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
kNumMaxParallelDims,
linalg::DistributionMethod::CyclicNumProcsEqNumIters));
funcPassManager.addPass(createCSEPass());
if (useFuseTensorPadWithConsumerPass) {
funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
}
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/VMVX/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ static llvm::cl::opt<bool> clEnableUKernelsDecomposeLinalgGeneric(

static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass());
funcPassManager.addPass(createCSEPass());
funcPassManager.addPass(createConvertToDestinationPassingStylePass());
funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
funcPassManager.addPass(createCanonicalizerPass());
Expand Down

0 comments on commit 5ea0b21

Please sign in to comment.