Revert "[LLVMGPU] Splitting TensorCoreVectorization to two passes." (i…
Browse files Browse the repository at this point in the history
…ree-org#15225)

It also reverts iree-org@a9d7aa5, which depends on it.

The reverted commit causes test failures:
https://github.com/openxla/iree/actions/runs/6554319504/job/17801458942
hanhanW authored Oct 18, 2023
1 parent a9d7aa5 commit e8f184d
Showing 13 changed files with 221 additions and 119 deletions.
@@ -7,6 +7,7 @@
#include <algorithm>
#include <numeric>

#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Common/GPU/PassDetail.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
@@ -28,6 +29,9 @@

#define DEBUG_TYPE "iree-codegen-gpu-distribute-shared-memory-copy"

using mlir::iree_compiler::IREE::LinalgExt::LinalgVectorizationPattern;
using mlir::iree_compiler::IREE::LinalgExt::VectorizationPatterns;

/// Prints the given `funcOp` after a leading `step` comment header.
void debugPrint(mlir::func::FuncOp funcOp, const char *step) {
LLVM_DEBUG({
@@ -270,17 +274,14 @@ static void populateTilingAndDistribute(RewritePatternSet &patterns,
StringAttr::get(patterns.getContext(), kCopyDistributed)));
}

static void vectorizeDistributedCopies(func::FuncOp funcOp) {
IRRewriter rewriter(funcOp.getContext());
SmallVector<linalg::GenericOp> candidates;
funcOp.walk([&](linalg::GenericOp op) { candidates.push_back(op); });
for (auto op : candidates) {
SmallVector<int64_t> vectorSizes;
SmallVector<bool> scalableVecDims;
scalableVecDims.resize(vectorSizes.size());
(void)linalg::vectorize(rewriter, op, vectorSizes, scalableVecDims,
/*vectorizeGatherAccesses=*/true);
};
static void populateVectorizationPatterns(RewritePatternSet &patterns) {
VectorizationPatterns<linalg::GenericOp>::insert(
patterns, IREE::LinalgExt::LinalgVectorizationOptions(),
IREE::LinalgExt::LinalgTransformationFilter(
{StringAttr::get(patterns.getContext(),
getCopyToWorkgroupMemoryMarker()),
StringAttr::get(patterns.getContext(), kCopyDistributed)},
std::nullopt));
}

/// Return a flattened Id Value by combining the 3D gpu thread IDs.
@@ -435,7 +436,12 @@ class GPUDistributeSharedMemoryCopyPass
debugPrint(funcOp, "After step 2: thread distribution");

// Step 3. Vectorize the distributed copies.
vectorizeDistributedCopies(funcOp);
RewritePatternSet vectorizationPatterns(context);
populateVectorizationPatterns(vectorizationPatterns);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(vectorizationPatterns)))) {
return signalPassFailure();
}
debugPrint(funcOp, "After step 3: vectorization");

// Step 4. Finally unroll all the loops created.
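The hunks above swap a direct `linalg::vectorize` walk for marker-filtered rewrite patterns, so only the copies tagged during distribution get vectorized. As a rough illustration, here is a minimal sketch of how such a filter gates vectorization; the helper is hypothetical and assumes the LinalgExt pattern and filter APIs shown above, not code from this commit.

static void addMarkerFilteredVectorization(RewritePatternSet &patterns,
                                           ArrayRef<StringRef> markers) {
  // Hypothetical helper: only linalg.generic ops whose
  // "__internal_linalg_transform__" marker attribute matches one of
  // `markers` are rewritten; all other ops are left untouched.
  SmallVector<StringAttr> matchMarkers;
  for (StringRef marker : markers)
    matchMarkers.push_back(StringAttr::get(patterns.getContext(), marker));
  IREE::LinalgExt::VectorizationPatterns<linalg::GenericOp>::insert(
      patterns, IREE::LinalgExt::LinalgVectorizationOptions(),
      IREE::LinalgExt::LinalgTransformationFilter(
          matchMarkers, /*replacement=*/std::nullopt));
}

The pass then hands the populated set to applyPatternsAndFoldGreedily and signals pass failure on error, exactly as Step 3 above does.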
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -56,7 +56,7 @@ iree_compiler_cc_library(
"LLVMGPUCastAddressSpaceFunction.cpp",
"LLVMGPULowerExecutableTarget.cpp",
"LLVMGPUPackSharedMemoryAlloc.cpp",
"LLVMGPUTensorCorePreparation.cpp",
"LLVMGPUTensorCoreVectorization.cpp",
"LLVMGPUTensorPad.cpp",
"LLVMGPUTileAndDistribute.cpp",
"LLVMGPUVectorLowering.cpp",
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -55,7 +55,7 @@ iree_cc_library(
"LLVMGPUCastAddressSpaceFunction.cpp"
"LLVMGPULowerExecutableTarget.cpp"
"LLVMGPUPackSharedMemoryAlloc.cpp"
"LLVMGPUTensorCorePreparation.cpp"
"LLVMGPUTensorCoreVectorization.cpp"
"LLVMGPUTensorPad.cpp"
"LLVMGPUTileAndDistribute.cpp"
"LLVMGPUVectorLowering.cpp"
@@ -4,6 +4,8 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
#include "iree/compiler/Codegen/Dialect/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
@@ -20,15 +22,31 @@
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

#define DEBUG_TYPE "iree-codegen-gpu-tensorcore-preparation"
#define DEBUG_TYPE "iree-codegen-gpu-tensorcore-vectorization"

using mlir::iree_compiler::IREE::LinalgExt::LinalgVectorizationPattern;
using mlir::iree_compiler::IREE::LinalgExt::VectorizationPatterns;

namespace mlir {
namespace iree_compiler {

//====---------------------------------------------------------------------===//
// Patterns for preparation
// Patterns for vectorization
//====---------------------------------------------------------------------===//

static void populateVectorizationPatterns(RewritePatternSet &patterns) {
IREE::LinalgExt::LinalgTransformationFilter f(
StringAttr::get(patterns.getContext(), getVectorizeMarker()));
IREE::LinalgExt::LinalgVectorizationOptions vectorizationOptions;
VectorizationPatterns<linalg::FillOp, linalg::GenericOp>::insert(
patterns, vectorizationOptions, f);
patterns.add<LinalgVectorizationPattern>(
patterns.getContext(), vectorizationOptions,
f.addOpFilter<linalg::ContractionOpInterface>());
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
vector::populateVectorReductionToContractPatterns(patterns);
}

static void populateVectorUnrollPatterns(RewritePatternSet &patterns,
bool useMmaSyncShape) {
auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
@@ -49,24 +67,47 @@ static void populateVectorUnrollPatterns(RewritePatternSet &patterns,
}

namespace {
struct LLVMGPUTensorCorePreparationPass
: public LLVMGPUTensorCorePreparationBase<
LLVMGPUTensorCorePreparationPass> {
LLVMGPUTensorCorePreparationPass(GPUTensorCoreType tensorCoreType)
struct LLVMGPUTensorCoreVectorizationPass
: public LLVMGPUTensorCoreVectorizationBase<
LLVMGPUTensorCoreVectorizationPass> {
LLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType)
: tensorCoreType(tensorCoreType) {}
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<vector::VectorDialect>();
}
void runOnOperation() override {
auto funcOp = getOperation();
LLVM_DEBUG({
llvm::dbgs() << "LLVMGPUTensorCorePreparationPass runOnOperation():\n";
llvm::dbgs() << "LLVMGPUTensorCoreVectorizationPass runOnOperation():\n";
funcOp->dump();
});

MLIRContext *context = &getContext();
{
// Step 1. Merge transpose into transfer_read ops.
// Step 1(a). Vectorize (linalg to vector).
RewritePatternSet vectorizationPatterns(context);
populateVectorizationPatterns(vectorizationPatterns);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(vectorizationPatterns)))) {
return signalPassFailure();
}
LLVM_DEBUG({
llvm::dbgs() << "\nAfter populateVectorizationPatterns:\n";
funcOp->dump();
});

// Step 1(b). Fold arithmetic extensions into vector contraction ops.
// Linalg to vector conversion introduces arithmetic extensions on the
// operands of vector contraction ops for mixed precision computation.
// This pattern folds the arithmetic extensions into the vector.contract.
RewritePatternSet foldArithExtPatterns(context);
vector::populateFoldArithExtensionPatterns(foldArithExtPatterns);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(foldArithExtPatterns)))) {
return signalPassFailure();
}

// Step 2. Fold consumer add ops into the contraction op itself.
RewritePatternSet canonicalizationPatterns(context);
vector::ContractionOp::getCanonicalizationPatterns(
canonicalizationPatterns, context);
@@ -82,7 +123,7 @@ struct LLVMGPUTensorCorePreparationPass
funcOp->dump();
});

// Step 2. Prepare vector operations to be lowered to native tensor core
// Step 3. Prepare vector operations to be lowered to native tensor core
// operations (nvgpu.mmasync, nvgpu.ldmatrix).
if (tensorCoreType == GPUTensorCoreType::MMA_SYNC) {
RewritePatternSet vectorContractPatterns(funcOp.getContext());
@@ -103,7 +144,7 @@ struct LLVMGPUTensorCorePreparationPass
});

bool useMmaSyncShape = tensorCoreType == GPUTensorCoreType::MMA_SYNC;
// Step 3. Break and unroll warp tile size to native math and load sizes.
// Step 4. Break and unroll warp tile size to native math and load sizes.
RewritePatternSet vectorUnrollPatterns(context);
populateVectorUnrollPatterns(vectorUnrollPatterns, useMmaSyncShape);
if (failed(applyPatternsAndFoldGreedily(
@@ -123,8 +164,8 @@ struct LLVMGPUTensorCorePreparationPass
} // namespace

std::unique_ptr<OperationPass<func::FuncOp>>
createLLVMGPUTensorCorePreparationPass(GPUTensorCoreType tensorCoreType) {
return std::make_unique<LLVMGPUTensorCorePreparationPass>(tensorCoreType);
createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType) {
return std::make_unique<LLVMGPUTensorCoreVectorizationPass>(tensorCoreType);
}

} // namespace iree_compiler
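Step 1(b) above deserves a short illustration. The sketch below is a hypothetical, self-contained helper, assuming upstream MLIR's vector::populateFoldArithExtensionPatterns as used in the hunk; the comment shows the IR-level effect.

// Fold arith extension ops on vector.contract operands into the
// contraction itself. Schematically:
//   %l = arith.extf %lhs : vector<16x16xf16> to vector<16x16xf32>
//   %r = arith.extf %rhs : vector<16x16xf16> to vector<16x16xf32>
//   %d = vector.contract ... %l, %r, %acc
// becomes a mixed-precision contract consuming %lhs and %rhs directly,
// which is the form the later tensor core lowering expects.
static LogicalResult foldExtensionsIntoContract(func::FuncOp funcOp) {
  RewritePatternSet patterns(funcOp.getContext());
  vector::populateFoldArithExtensionPatterns(patterns);
  return applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}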
48 changes: 18 additions & 30 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -236,24 +236,9 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm,
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts());

// Linalg -> vector
{
GenericVectorizationPassOptions options;
options.enableCleanup = false;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
}
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMGPUTensorCorePreparationPass());
createLLVMGPUTensorCoreVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
@@ -262,6 +247,15 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts());

// Vector -> MMA ops
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
@@ -306,22 +300,9 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &pm,
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());

// Linalg -> vector
{
GenericVectorizationPassOptions options;
options.enableCleanup = false;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
}
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMGPUTensorCorePreparationPass(GPUTensorCoreType::MMA_SYNC));
createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType::MMA_SYNC));
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
@@ -330,6 +311,13 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());

// Vector -> MMA ops
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
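The substantive change in both pipelines is ordering: Linalg-to-vector conversion and tensor core preparation move back in front of shared-memory-copy distribution. Below is a condensed, hypothetical sketch of the restored order, keeping only the pass-creation calls named in the hunks and eliding canonicalization, CSE, and the mma.sync variant.

static void addRestoredTensorCoreOrder(OpPassManager &nestedModulePM) {
  // 1. Linalg -> vector, plus preparation for tensor core ops.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUTensorCoreVectorizationPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createHoistRedundantVectorTransfersPass());
  // 2. Only afterwards: rewrite memref.copy to linalg and distribute the
  //    shared memory copies across workgroup threads.
  nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUDistributeSharedMemoryCopy());
}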
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -89,7 +89,7 @@ enum class GPUTensorCoreType {

/// Convert Linalg ops to Vector and prepare conversion to GPU MMA ops.
std::unique_ptr<OperationPass<func::FuncOp>>
createLLVMGPUTensorCorePreparationPass(
createLLVMGPUTensorCoreVectorizationPass(
GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);

/// Pass to pad out tensors up to static dimensions.
8 changes: 4 additions & 4 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
@@ -60,10 +60,10 @@ def LLVMGPUPackSharedMemoryAlloc :
let constructor = "mlir::iree_compiler::createLLVMGPUPackSharedMemoryAlloc()";
}

def LLVMGPUTensorCorePreparation :
Pass<"iree-llvmgpu-tensorcore-preparation", "func::FuncOp"> {
let summary = "Pass to transform vector.contract to a form that can be lowered to GPU MMA ops";
let constructor = "mlir::iree_compiler::createLLVMGPUTensorCorePreparationPass()";
def LLVMGPUTensorCoreVectorization :
Pass<"iree-llvmgpu-tensorcore-vectorization", "func::FuncOp"> {
let summary = "Pass to convert linalg into Vector and transform it to a form that can be lowered to GPU MMA ops";
let constructor = "mlir::iree_compiler::createLLVMGPUTensorCoreVectorizationPass()";
}

def LLVMGPUTensorPad :
@@ -47,7 +47,7 @@ iree_lit_test_suite(
"pack_pipeline_test.mlir",
"pack_shared_memory_alloc.mlir",
"tensor_pad.mlir",
"tensorcore_preparation.mlir",
"tensorcore_vectorization.mlir",
"transform_dialect_hoist_allocs.mlir",
"transform_dialect_vector_distribution.mlir",
"transform_dialect_bufferize.mlir",
@@ -43,7 +43,7 @@ iree_lit_test_suite(
"set_transform_strategy_matmul.mlir"
"set_transform_strategy_pad.mlir"
"tensor_pad.mlir"
"tensorcore_preparation.mlir"
"tensorcore_vectorization.mlir"
"transform_dialect_bufferize.mlir"
"transform_dialect_eliminate_gpu_barriers.mlir"
"transform_dialect_hoist_allocs.mlir"

This file was deleted.

