Revert "[LLVMGPU] Splitting TensorCoreVectorization to two passes." (i…
Browse files Browse the repository at this point in the history
…ree-org#15225)

It also reverts iree-org@a9d7aa5, which depends on it.

The reverted commit causes test failures:
https://github.com/openxla/iree/actions/runs/6554319504/job/17801458942
hanhanW authored Oct 18, 2023
1 parent a9d7aa5 commit e8f184d
Showing 13 changed files with 221 additions and 119 deletions.
@@ -7,6 +7,7 @@
#include <algorithm>
#include <numeric>

#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Common/GPU/PassDetail.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
@@ -28,6 +29,9 @@

#define DEBUG_TYPE "iree-codegen-gpu-distribute-shared-memory-copy"

using mlir::iree_compiler::IREE::LinalgExt::LinalgVectorizationPattern;
using mlir::iree_compiler::IREE::LinalgExt::VectorizationPatterns;

/// Prints the given `funcOp` after a leading `step` comment header.
void debugPrint(mlir::func::FuncOp funcOp, const char *step) {
LLVM_DEBUG({
@@ -270,17 +274,14 @@ static void populateTilingAndDistribute(RewritePatternSet &patterns,
StringAttr::get(patterns.getContext(), kCopyDistributed)));
}

static void vectorizeDistributedCopies(func::FuncOp funcOp) {
IRRewriter rewriter(funcOp.getContext());
SmallVector<linalg::GenericOp> candidates;
funcOp.walk([&](linalg::GenericOp op) { candidates.push_back(op); });
for (auto op : candidates) {
SmallVector<int64_t> vectorSizes;
SmallVector<bool> scalableVecDims;
scalableVecDims.resize(vectorSizes.size());
(void)linalg::vectorize(rewriter, op, vectorSizes, scalableVecDims,
/*vectorizeGatherAccesses=*/true);
};
static void populateVectorizationPatterns(RewritePatternSet &patterns) {
VectorizationPatterns<linalg::GenericOp>::insert(
patterns, IREE::LinalgExt::LinalgVectorizationOptions(),
IREE::LinalgExt::LinalgTransformationFilter(
{StringAttr::get(patterns.getContext(),
getCopyToWorkgroupMemoryMarker()),
StringAttr::get(patterns.getContext(), kCopyDistributed)},
std::nullopt));
}

/// Return a flattened Id Value by combining the 3D gpu thread IDs.
@@ -435,7 +436,12 @@ class GPUDistributeSharedMemoryCopyPass
debugPrint(funcOp, "After step 2: thread distribution");

// Step 3. Vectorize the distributed copies.
vectorizeDistributedCopies(funcOp);
RewritePatternSet vectorizationPatterns(context);
populateVectorizationPatterns(vectorizationPatterns);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(vectorizationPatterns)))) {
return signalPassFailure();
}
debugPrint(funcOp, "After step 3: vectorization");

// Step 4. Finally unroll all the loops created.
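The hunks above swap a direct `linalg::vectorize` walk for marker-filtered rewrite patterns, so only the copies tagged during distribution get vectorized. As a rough illustration, here is a minimal sketch of how such a filter gates vectorization; the helper is hypothetical and assumes the LinalgExt pattern and filter APIs shown above, not code from this commit.

static void addMarkerFilteredVectorization(RewritePatternSet &patterns,
                                           ArrayRef<StringRef> markers) {
  // Hypothetical helper: only linalg.generic ops whose
  // "__internal_linalg_transform__" marker attribute matches one of
  // `markers` are rewritten; all other ops are left untouched.
  SmallVector<StringAttr> matchMarkers;
  for (StringRef marker : markers)
    matchMarkers.push_back(StringAttr::get(patterns.getContext(), marker));
  IREE::LinalgExt::VectorizationPatterns<linalg::GenericOp>::insert(
      patterns, IREE::LinalgExt::LinalgVectorizationOptions(),
      IREE::LinalgExt::LinalgTransformationFilter(
          matchMarkers, /*replacement=*/std::nullopt));
}

The pass then hands the populated set to applyPatternsAndFoldGreedily and signals pass failure on error, exactly as Step 3 above does.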
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -56,7 +56,7 @@ iree_compiler_cc_library(
"LLVMGPUCastAddressSpaceFunction.cpp",
"LLVMGPULowerExecutableTarget.cpp",
"LLVMGPUPackSharedMemoryAlloc.cpp",
"LLVMGPUTensorCorePreparation.cpp",
"LLVMGPUTensorCoreVectorization.cpp",
"LLVMGPUTensorPad.cpp",
"LLVMGPUTileAndDistribute.cpp",
"LLVMGPUVectorLowering.cpp",
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -55,7 +55,7 @@ iree_cc_library(
"LLVMGPUCastAddressSpaceFunction.cpp"
"LLVMGPULowerExecutableTarget.cpp"
"LLVMGPUPackSharedMemoryAlloc.cpp"
"LLVMGPUTensorCorePreparation.cpp"
"LLVMGPUTensorCoreVectorization.cpp"
"LLVMGPUTensorPad.cpp"
"LLVMGPUTileAndDistribute.cpp"
"LLVMGPUVectorLowering.cpp"
@@ -4,6 +4,8 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree-dialects/Dialect/LinalgExt/Passes/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
#include "iree/compiler/Codegen/Dialect/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
@@ -20,15 +22,31 @@
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

#define DEBUG_TYPE "iree-codegen-gpu-tensorcore-preparation"
#define DEBUG_TYPE "iree-codegen-gpu-tensorcore-vectorization"

using mlir::iree_compiler::IREE::LinalgExt::LinalgVectorizationPattern;
using mlir::iree_compiler::IREE::LinalgExt::VectorizationPatterns;

namespace mlir {
namespace iree_compiler {

//====---------------------------------------------------------------------===//
// Patterns for preparation
// Patterns for vectorization
//====---------------------------------------------------------------------===//

static void populateVectorizationPatterns(RewritePatternSet &patterns) {
IREE::LinalgExt::LinalgTransformationFilter f(
StringAttr::get(patterns.getContext(), getVectorizeMarker()));
IREE::LinalgExt::LinalgVectorizationOptions vectorizationOptions;
VectorizationPatterns<linalg::FillOp, linalg::GenericOp>::insert(
patterns, vectorizationOptions, f);
patterns.add<LinalgVectorizationPattern>(
patterns.getContext(), vectorizationOptions,
f.addOpFilter<linalg::ContractionOpInterface>());
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
vector::populateVectorReductionToContractPatterns(patterns);
}

static void populateVectorUnrollPatterns(RewritePatternSet &patterns,
bool useMmaSyncShape) {
auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
@@ -49,24 +67,47 @@ static void populateVectorUnrollPatterns(RewritePatternSet &patterns,
}

namespace {
struct LLVMGPUTensorCorePreparationPass
: public LLVMGPUTensorCorePreparationBase<
LLVMGPUTensorCorePreparationPass> {
LLVMGPUTensorCorePreparationPass(GPUTensorCoreType tensorCoreType)
struct LLVMGPUTensorCoreVectorizationPass
: public LLVMGPUTensorCoreVectorizationBase<
LLVMGPUTensorCoreVectorizationPass> {
LLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType)
: tensorCoreType(tensorCoreType) {}
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<vector::VectorDialect>();
}
void runOnOperation() override {
auto funcOp = getOperation();
LLVM_DEBUG({
llvm::dbgs() << "LLVMGPUTensorCorePreparationPass runOnOperation():\n";
llvm::dbgs() << "LLVMGPUTensorCoreVectorizationPass runOnOperation():\n";
funcOp->dump();
});

MLIRContext *context = &getContext();
{
// Step 1. Merge transpose into transfer_read ops.
// Step 1(a). Vectorize (linalg to vector).
RewritePatternSet vectorizationPatterns(context);
populateVectorizationPatterns(vectorizationPatterns);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(vectorizationPatterns)))) {
return signalPassFailure();
}
LLVM_DEBUG({
llvm::dbgs() << "\nAfter populateVectorizationPatterns:\n";
funcOp->dump();
});

// Step 1(b). Fold arithmetic extensions into vector contraction ops.
// Linalg to vector conversion introduces arithmetic extensions on the
// operands of vector contraction ops for mixed precision computation.
// This pattern folds the arithmetic extensions into the vector.contract.
RewritePatternSet foldArithExtPatterns(context);
vector::populateFoldArithExtensionPatterns(foldArithExtPatterns);
if (failed(applyPatternsAndFoldGreedily(
funcOp, std::move(foldArithExtPatterns)))) {
return signalPassFailure();
}

// Step 2. Fold consumer add ops into the contraction op itself.
RewritePatternSet canonicalizationPatterns(context);
vector::ContractionOp::getCanonicalizationPatterns(
canonicalizationPatterns, context);
@@ -82,7 +123,7 @@ struct LLVMGPUTensorCorePreparationPass
funcOp->dump();
});

// Step 2. Prepare vector operations to be lowered to native tensor core
// Step 3. Prepare vector operations to be lowered to native tensor core
// operations (nvgpu.mmasync, nvgpu.ldmatrix).
if (tensorCoreType == GPUTensorCoreType::MMA_SYNC) {
RewritePatternSet vectorContractPatterns(funcOp.getContext());
@@ -103,7 +144,7 @@ struct LLVMGPUTensorCorePreparationPass
});

bool useMmaSyncShape = tensorCoreType == GPUTensorCoreType::MMA_SYNC;
// Step 3. Break and unroll warp tile size to native math and load sizes.
// Step 4. Break and unroll warp tile size to native math and load sizes.
RewritePatternSet vectorUnrollPatterns(context);
populateVectorUnrollPatterns(vectorUnrollPatterns, useMmaSyncShape);
if (failed(applyPatternsAndFoldGreedily(
@@ -123,8 +164,8 @@ struct LLVMGPUTensorCorePreparationPass
} // namespace

std::unique_ptr<OperationPass<func::FuncOp>>
createLLVMGPUTensorCorePreparationPass(GPUTensorCoreType tensorCoreType) {
return std::make_unique<LLVMGPUTensorCorePreparationPass>(tensorCoreType);
createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType) {
return std::make_unique<LLVMGPUTensorCoreVectorizationPass>(tensorCoreType);
}

} // namespace iree_compiler
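Step 1(b) above deserves a short illustration. The sketch below is a hypothetical, self-contained helper, assuming upstream MLIR's vector::populateFoldArithExtensionPatterns as used in the hunk; the comment shows the IR-level effect.

// Fold arith extension ops on vector.contract operands into the
// contraction itself. Schematically:
//   %l = arith.extf %lhs : vector<16x16xf16> to vector<16x16xf32>
//   %r = arith.extf %rhs : vector<16x16xf16> to vector<16x16xf32>
//   %d = vector.contract ... %l, %r, %acc
// becomes a mixed-precision contract consuming %lhs and %rhs directly,
// which is the form the later tensor core lowering expects.
static LogicalResult foldExtensionsIntoContract(func::FuncOp funcOp) {
  RewritePatternSet patterns(funcOp.getContext());
  vector::populateFoldArithExtensionPatterns(patterns);
  return applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}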
48 changes: 18 additions & 30 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -236,24 +236,9 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm,
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts());

// Linalg -> vector
{
GenericVectorizationPassOptions options;
options.enableCleanup = false;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
}
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMGPUTensorCorePreparationPass());
createLLVMGPUTensorCoreVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
@@ -262,6 +247,15 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUReduceSharedMemoryBankConflicts());

// Vector -> MMA ops
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
@@ -306,22 +300,9 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &pm,
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());

// Linalg -> vector
{
GenericVectorizationPassOptions options;
options.enableCleanup = false;
nestedModulePM.addNestedPass<func::FuncOp>(
createGenericVectorizationPass(options));
}
nestedModulePM.addNestedPass<func::FuncOp>(
createLLVMGPUTensorCorePreparationPass(GPUTensorCoreType::MMA_SYNC));
createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType::MMA_SYNC));
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
@@ -330,6 +311,13 @@
nestedModulePM.addNestedPass<func::FuncOp>(
createHoistRedundantVectorTransfersPass());

// Distribute shared memory copies.
nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createGPUDistributeSharedMemoryCopy());
nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());

// Vector -> MMA ops
nestedModulePM.addNestedPass<func::FuncOp>(
memref::createFoldMemRefAliasOpsPass());
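The substantive change in both pipelines is ordering: Linalg-to-vector conversion and tensor core preparation move back in front of shared-memory-copy distribution. Below is a condensed, hypothetical sketch of the restored order, keeping only the pass-creation calls named in the hunks and eliding canonicalization, CSE, and the mma.sync variant.

static void addRestoredTensorCoreOrder(OpPassManager &nestedModulePM) {
  // 1. Linalg -> vector, plus preparation for tensor core ops.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUTensorCoreVectorizationPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createHoistRedundantVectorTransfersPass());
  // 2. Only afterwards: rewrite memref.copy to linalg and distribute the
  //    shared memory copies across workgroup threads.
  nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUDistributeSharedMemoryCopy());
}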
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -89,7 +89,7 @@ enum class GPUTensorCoreType {

/// Convert Linalg ops to Vector and prepare conversion to GPU MMA ops.
std::unique_ptr<OperationPass<func::FuncOp>>
createLLVMGPUTensorCorePreparationPass(
createLLVMGPUTensorCoreVectorizationPass(
GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);

/// Pass to pad out tensors up to static dimensions.
8 changes: 4 additions & 4 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
@@ -60,10 +60,10 @@ def LLVMGPUPackSharedMemoryAlloc :
let constructor = "mlir::iree_compiler::createLLVMGPUPackSharedMemoryAlloc()";
}

def LLVMGPUTensorCorePreparation :
Pass<"iree-llvmgpu-tensorcore-preparation", "func::FuncOp"> {
let summary = "Pass to transform vector.contract to a form that can be lowered to GPU MMA ops";
let constructor = "mlir::iree_compiler::createLLVMGPUTensorCorePreparationPass()";
def LLVMGPUTensorCoreVectorization :
Pass<"iree-llvmgpu-tensorcore-vectorization", "func::FuncOp"> {
let summary = "Pass to convert linalg into Vector and transform it to a form that can be lowered to GPU MMA ops";
let constructor = "mlir::iree_compiler::createLLVMGPUTensorCoreVectorizationPass()";
}

def LLVMGPUTensorPad :
@@ -47,7 +47,7 @@ iree_lit_test_suite(
"pack_pipeline_test.mlir",
"pack_shared_memory_alloc.mlir",
"tensor_pad.mlir",
"tensorcore_preparation.mlir",
"tensorcore_vectorization.mlir",
"transform_dialect_hoist_allocs.mlir",
"transform_dialect_vector_distribution.mlir",
"transform_dialect_bufferize.mlir",
@@ -43,7 +43,7 @@ iree_lit_test_suite(
"set_transform_strategy_matmul.mlir"
"set_transform_strategy_pad.mlir"
"tensor_pad.mlir"
"tensorcore_preparation.mlir"
"tensorcore_vectorization.mlir"
"transform_dialect_bufferize.mlir"
"transform_dialect_eliminate_gpu_barriers.mlir"
"transform_dialect_hoist_allocs.mlir"

This file was deleted.

