From 8b8342596bb399f0c699e980d87b17875905d66e Mon Sep 17 00:00:00 2001
From: Max191 <44243577+Max191@users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:00:00 -0700
Subject: [PATCH] [Codegen] Add vector transfer + slice foldings in GenericVectorization (#17613)

Vectorizing a `linalg.copy` op can result in a sequence of
```
%extract = tensor.extract_slice %source
%read = vector.transfer_read %extract
%write = vector.transfer_write %read, %dest
%insert = tensor.insert_slice %write into %dest
```
This sequence is folded by the transfer_write folder into
```
%extract = tensor.extract_slice %source
%insert = tensor.insert_slice %extract into %dest
```
To preserve the vector transfers, this PR adds folding patterns for vector transfer ops acting on insert/extract slice ops. These patterns fold the insert_slice into the transfer_write and the extract_slice into the transfer_read, so the vector transfers are not folded away.

This is turned off for the vector distribution pipeline because it causes distribution to fail in some cases.

Also removes `Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir`; this completes a TODO to drop that test once the undesired extra buffers had been eliminated.

---------

Signed-off-by: Max Dawkins
---
 .../Codegen/Common/GenericVectorization.cpp | 13 +-
 .../src/iree/compiler/Codegen/Common/Passes.h | 2 +
 .../iree/compiler/Codegen/Common/Passes.td | 4 +-
 .../Common/test/generic_vectorization.mlir | 113 ++++++++++++------
 .../LLVMCPU/test/pipeline_pad_tests.mlir | 4 +-
 .../vectorize_with_masking_and_hoist.mlir | 6 +-
 .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 7 +-
 .../compiler/Codegen/LLVMGPU/test/BUILD.bazel | 1 -
 .../Codegen/LLVMGPU/test/CMakeLists.txt | 1 -
 .../test/ROCDL/pipeline_tile_and_fuse.mlir | 2 -
 .../LLVMGPU/test/conv_pipeline_test_rocm.mlir | 61 ----------
 11 files changed, 100 insertions(+), 114 deletions(-)
 delete mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir

diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp index e36a9c789092..0a2bea0a9910 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp @@ -322,6 +322,8 @@ class GenericVectorizationPass this->generateContract.setValue(options.generateContract); this->foldCastIntoContract.setValue(options.foldCastIntoContract); this->maxVectorSize.setValue(options.maxVectorSize); + this->earlySubsetTransferFolding.setValue( + options.earlySubsetTransferFolding); } void getDependentDialects(DialectRegistry &registry) const override { @@ -384,8 +386,17 @@ void GenericVectorizationPass::runOnOperation() { }; { - // Canonicalize mask related ops before we lower them. + // Canonicalize mask related ops before we lower them. Also run patterns + // for vector transfers on tensor subset ops, since they can be folded if + // not handled here. RewritePatternSet maskCanonPatterns(funcOp.getContext()); + if (earlySubsetTransferFolding) { + // It is important to add these vector transfer on tensor subset patterns + // in the first greedy pattern rewrite, since transfer foldings can remove + // vectorized reads and writes by folding them into tensor ops.
+ tensor::populateFoldTensorSubsetIntoVectorTransferPatterns( + maskCanonPatterns); + } vector::CreateMaskOp::getCanonicalizationPatterns(maskCanonPatterns, funcOp.getContext()); vector::ConstantMaskOp::getCanonicalizationPatterns(maskCanonPatterns, diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h index 2880477d0a2b..621fb35d2e62 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.h +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h @@ -174,6 +174,8 @@ struct GenericVectorizationPassOptions { bool foldCastIntoContract = false; // Max vector size allowed to avoid creating large vectors. int64_t maxVectorSize = std::numeric_limits::max(); + // Enable early folding of tensor subset ops into vector transfer ops. + bool earlySubsetTransferFolding = true; }; /// Creates a pass to perform vectorization on LinAlg and tensor ops. std::unique_ptr> diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td index ed182941c372..0f313c55a8d5 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td @@ -288,7 +288,9 @@ def GenericVectorization : "Enable folding casting ops into vector.contract.">, Option<"maxVectorSize", "max-vector-size", "int64_t", /*default=*/"2147483647", - "Max vector size allowed to avoid creating large vectors."> + "Max vector size allowed to avoid creating large vectors.">, + Option<"earlySubsetTransferFolding", "early-subset-transfer-folding", "bool",/*default=*/"true", + "Enable early folding of tensor subset ops into vector transfer ops."> ]; let constructor = "mlir::iree_compiler::createGenericVectorizationPass()"; diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir index 3f0947d43f91..924fe5b5f950 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir @@ -64,12 +64,12 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1 // CHECK-LABEL: func.func @single_static_pack_infer_vector_size // CHECK: tensor.pack -// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)> -// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)> -// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)> -// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> -// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)> +// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 2)> +// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 51, 4)> +// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 101, d0 * 2)> +// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1) -> (d1 * -16 + 201, d0 * 16)> // CHECK-MASK-LABEL: func.func @single_static_pack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8 @@ -79,9 +79,8 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1 // CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]] // 
CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]] -// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]] // CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8> // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1] // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor @@ -130,12 +129,12 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor, %arg1: t // CHECK-LABEL: func.func @single_dynamic_pack_infer_vector_size // CHECK: tensor.pack -// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> -// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> -// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> +// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> +// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> +// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> +// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> // CHECK-MASK-LABEL: func.func @single_dynamic_pack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK: %[[C0:.+]] = arith.constant 0 : i8 @@ -145,9 +144,8 @@ func.func @single_dynamic_pack_infer_vector_size(%arg0: tensor, %arg1: t // CHECK-MASK: %[[WRITE_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK: %[[READ_SZ0:.+]] = affine.min #[[$MAP3]] // CHECK-MASK: %[[READ_SZ1:.+]] = affine.min #[[$MAP5]] -// CHECK-MASK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[READ_SZ0]], %[[READ_SZ1]]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[READ_SZ0]], %[[READ_SZ1]] : vector<8x32xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SLICE]][%{{.+}}], %[[C0]], %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%{{.+}}], %[[C0]], %[[READ_MASK]] // CHECK-MASK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<8x32xi8> to vector<4x2x2x16xi8> // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[CAST]], [2, 0, 3, 1] // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[WRITE_SZ0]], %[[WRITE_SZ1]]) : tensor @@ -204,13 +202,13 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor) -> tensor } return %3 : tensor<32x?x64x16x2xbf16> } -// CHECK-MASK: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> -// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)> -// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)> -// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)> -// CHECK-MASK: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> -// CHECK-MASK: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)> -// CHECK-MASK: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 
16)> +// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0 ceildiv 16)> +// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (-d0 + 64, 6)> +// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0, d1) -> (d1 * -2 + 128, d0 * 2)> +// CHECK-MASK-DAG: #[[$MAP4:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -16 + s0, d0 * 16)> +// CHECK-MASK-DAG: #[[$MAP5:.+]] = affine_map<(d0) -> (d0 * 16)> +// CHECK-MASK-DAG: #[[$MAP6:.+]] = affine_map<(d0) -> (d0 * 2)> // CHECK-MASK-LABEL: func.func @generic_pack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK-DAG: %[[C0_BF16:.+]] = arith.constant 0.000000e+00 : bf16 @@ -229,9 +227,8 @@ func.func @generic_pack_infer_vector_size(%arg0: tensor) -> tensor // CHECK-MASK-DAG: %[[SRC_SZ0:.+]] = affine.min #[[$MAP4]] // CHECK-MASK-DAG: %[[SRC_SZ2:.+]] = affine.min #[[$MAP3]] // CHECK-MASK-DAG: %[[ITER_SLICE:.+]] = tensor.extract_slice %[[GENERIC_EMPTY]] -// CHECK-MASK-DAG: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]][{{.+}}] [%[[SRC_SZ0]], 2, %[[SRC_SZ2]]] // CHECK-MASK-DAG: %[[READ_MASK:.+]] = vector.create_mask %[[SRC_SZ0]], %[[C2]], %[[SRC_SZ2]] : vector<64x2x12xi1> -// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}} %[[READ_MASK]] +// CHECK-MASK: %[[GENERIC_READ:.+]] = vector.transfer_read %[[SRC]]{{.+}} %[[READ_MASK]] // CHECK-MASK-DAG: %[[WRITE_MASK:.+]] = vector.create_mask %[[C2]], %[[SRC_SZ2]], %[[SRC_SZ0]] : vector<2x12x64xi1> // CHECK-MASK: %[[TRUNC:.+]] = arith.truncf %[[GENERIC_READ]] // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[TRUNC]], [1, 2, 0] @@ -278,10 +275,10 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor, } return %0 : tensor } -// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> -// CHECK-MASK: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> -// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> +// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> +// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> +// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> // CHECK-MASK-LABEL: func.func @single_dynamic_unpack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -292,9 +289,8 @@ func.func @single_dynamic_unpack_infer_vector_size(%arg0: tensor, // CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]] // CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]] -// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]] // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3] // CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32> // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor @@ -338,10 +334,10 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor, %arg1: } return %0 : tensor } -// CHECK-MASK: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)> -// CHECK-MASK: #[[$MAP1:.+]] = 
affine_map<(d0)[s0] -> (-d0 + s0, 32)> -// CHECK-MASK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK-MASK: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> +// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)> +// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)> +// CHECK-MASK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK-MASK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 16)> // CHECK-MASK-LABEL: func.func @generic_unpack_infer_vector_size // CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -352,9 +348,8 @@ func.func @generic_unpack_infer_vector_size(%arg0: tensor, %arg1: // CHECK-MASK-DAG: %[[DEST_SZ0:.+]] = affine.min #[[$MAP0]] // CHECK-MASK-DAG: %[[DEST_SZ1:.+]] = affine.min #[[$MAP1]] // CHECK-MASK-DAG: %[[SRC_SZ1:.+]] = affine.apply #[[$MAP3]] -// CHECK-MASK: %[[SRC_SLICE:.+]] = tensor.extract_slice %[[SRC]] // CHECK-MASK: %[[READ_MASK:.+]] = vector.create_mask %[[C1]], %[[SRC_SZ1]], %[[C16]], %[[C16]] : vector<1x2x16x16xi1> -// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC_SLICE]]{{.+}}, %[[READ_MASK]] +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]]{{.+}}, %[[READ_MASK]] // CHECK-MASK: %[[TRANSP:.+]] = vector.transpose %[[READ]], [0, 2, 1, 3] // CHECK-MASK: %[[SHAPE_CAST:.+]] = vector.shape_cast %[[TRANSP]] : vector<1x16x2x16xf32> to vector<16x32xf32> // CHECK-MASK: %[[EMPTY:.+]] = tensor.empty(%[[DEST_SZ0]], %[[DEST_SZ1]]) : tensor @@ -404,4 +399,46 @@ func.func @dynamic_fill_with_scalable_tiling_infer_vector_size(%arg0: tensor<1x6 // CHECK-MASK: scf.for // CHECK-MASK: scf.for // CHECK-MASK: scf.for -// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32> +// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x67x120x176xf32> + +// ----- + +#map = affine_map<(d0)[s0] -> (-d0 + s0, 16)> +#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 32)> +func.func @tiled_linalg_copy(%arg0: tensor, %arg1: tensor) -> tensor { + %c32 = arith.constant 32 : index + %c16 = arith.constant 16 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %dim = tensor.dim %arg1, %c0 : tensor + %dim_0 = tensor.dim %arg1, %c1 : tensor + %0 = scf.for %arg3 = %c0 to %dim step %c16 iter_args(%arg4 = %arg1) -> (tensor) { + %1 = scf.for %arg5 = %c0 to %dim_0 step %c32 iter_args(%arg6 = %arg4) -> (tensor) { + %2 = affine.min #map(%arg3)[%dim] + %3 = affine.min #map1(%arg5)[%dim_0] + %extracted_slice_0 = tensor.extract_slice %arg0[%arg3, %arg5] [%2, %3] [1, 1] : tensor to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg3, %arg5] [%2, %3] [1, 1] : tensor to tensor + %copy = linalg.copy ins(%extracted_slice_0 : tensor) outs(%extracted_slice_1 : tensor) -> tensor + %inserted_slice = tensor.insert_slice %copy into %arg6[%arg3, %arg5] [%2, %3] [1, 1] : tensor into tensor + scf.yield %inserted_slice : tensor + } + scf.yield %1 : tensor + } + return %0 : tensor +} +// CHECK-MASK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 16)> +// CHECK-MASK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 32)> +// CHECK-MASK-LABEL: func.func @tiled_linalg_copy +// CHECK-MASK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor, %[[DST:[a-zA-Z0-9]+]] +// CHECK-MASK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-MASK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-MASK-DAG: %[[C16:.+]] = 
arith.constant 16 : index +// CHECK-MASK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK-MASK: scf.for %[[IV0:.+]] = %[[C0]] +// CHECK-MASK: scf.for %[[IV1:.+]] = %[[C0]] {{.*}} iter_args(%[[ITER_ARG:.+]] = {{.*}}) +// CHECK-MASK-DAG: %[[DST_SZ0:.+]] = affine.min #[[$MAP0]] +// CHECK-MASK-DAG: %[[DST_SZ1:.+]] = affine.min #[[$MAP1]] +// CHECK-MASK: %[[DST_SLICE:.+]] = tensor.extract_slice %[[DST]][%[[IV0]], %[[IV1]]] [%[[DST_SZ0]], %[[DST_SZ1]]] [1, 1] : tensor to tensor +// CHECK-MASK: %[[MASK:.+]] = vector.create_mask %[[DST_SZ0]], %[[DST_SZ1]] : vector<16x32xi1> +// CHECK-MASK: %[[READ:.+]] = vector.transfer_read %[[SRC]][%[[IV0]], %[[IV1]]],{{.*}} %[[MASK]]{{.*}} : tensor, vector<16x32xf32> +// CHECK-MASK: vector.transfer_write %[[READ]], %[[DST_SLICE]]{{.+}}, %[[MASK]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir index 045193a29cea..27898749a907 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir @@ -33,13 +33,11 @@ module { // CHECK: scf.for // CHECK: scf.for // CHECK: scf.for -// CHECK: %[[OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SUBVIEW]] // CHECK: %[[RESULT_VEC:.+]] = scf.if %{{.+}} -> (vector<4xf32>) { // CHECK: %[[VEC_LOAD:.+]] = vector.load %[[INPUT_SUBVIEW]] // CHECK: scf.yield %[[VEC_LOAD]] // CHECK: } -// CHECK: %[[DROP_UNIT_OUTPUT_SLICE:.+]] = memref.subview %[[OUTPUT_SLICE]] -// CHECK: vector.store %[[RESULT_VEC]], %[[DROP_UNIT_OUTPUT_SLICE]] +// CHECK: vector.store %[[RESULT_VEC]], %[[OUTPUT_SUBVIEW]] // ----- #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir index b6450d346fcc..5f854dc25d60 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir @@ -20,7 +20,6 @@ // CHECK: scf.for {{.*}} iter_args(%[[OUT_TENSOR:.*]] = {{.*}}) -> (tensor<1024x1024xf32>) { // CHECK-NEXT: scf.for {{.*}} iter_args(%[[OUT_TENSOR_1:.*]] = %[[OUT_TENSOR]]) -> (tensor<1024x1024xf32>) { // CHECK-NEXT: %[[OUT_SLICE:.*]] = tensor.extract_slice %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32> to tensor<8x?xf32> -// CHECK-NEXT: %[[OUT_SLICE_1:.*]] = tensor.extract_slice %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> to tensor<8x?xf32> // CHECK-NEXT: %[[OUT_VEC:.*]] = vector.transfer_read %[[OUT_TENSOR_1]]{{.*}} : tensor<1024x1024xf32>, vector<8x[16]xf32> // CHECK-NEXT: %[[INNER_LOOP:.*]] = scf.for {{.*}} iter_args(%[[RES:.*]] = %[[OUT_VEC]]) -> (vector<8x[16]xf32>) { // CHECK-NEXT: %[[LHS:.*]] = vector.transfer_read {{.*}} : tensor<1024x1024xf32>, vector<8x1xf32> @@ -30,9 +29,8 @@ // CHECK-SAME: %[[LHS]], %[[RHS]], %[[RES]] : vector<8x1xf32>, vector<1x[16]xf32> into vector<8x[16]xf32> // CHECK-NEXT: scf.yield %[[CONTRACT]] : vector<8x[16]xf32> // CHECK-NEXT: } -// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE_1]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32> -// CHECK-NEXT: 
%[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> into tensor<8x?xf32> -// CHECK-NEXT: tensor.insert_slice %[[INSERT_SLICE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32> +// CHECK-NEXT: %[[OUT_WRITE:.*]] = vector.transfer_write %[[INNER_LOOP]], %[[OUT_SLICE]]{{.*}} {{.*}} : vector<8x[16]xf32>, tensor<8x?xf32> +// CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32> func.func @pipeline() { %c1 = arith.constant 1 : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 153896714bb0..f204d27f2e1d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -239,13 +239,15 @@ static void tileAndBufferize(OpPassManager &funcPassManager) { addBufferizePasses(funcPassManager); } -static void addGPUVectorizationPasses(OpPassManager &funcPassManager) { +static void addGPUVectorizationPasses(OpPassManager &funcPassManager, + bool earlySubsetTransferFolding = true) { funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass()); GenericVectorizationPassOptions options; options.vectorizePadding = true; options.vectorizeGatherAccesses = true; options.enableCleanup = false; options.foldCastIntoContract = true; + options.earlySubsetTransferFolding = earlySubsetTransferFolding; funcPassManager.addPass(createGenericVectorizationPass(options)); funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); funcPassManager.addPass(createCanonicalizerPass()); @@ -763,7 +765,8 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass()); // Linalg -> Vector - addGPUVectorizationPasses(funcPassManager); + addGPUVectorizationPasses(funcPassManager, + /*earlySubsetTransferFolding=*/false); // Allocate tensors for copies to shared memory. 
funcPassManager.addPass(createGPUVectorAllocPass()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 1757b5ce48f4..e75bfc66c986 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -24,7 +24,6 @@ iree_lit_test_suite( "attention.mlir", "attention_mfma.mlir", "conv_pipeline_test_cuda.mlir", - "conv_pipeline_test_rocm.mlir", "convert_to_nvvm.mlir", "convert_to_rocdl.mlir", "create_async_groups.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 2ff84aa75ea2..6d87f02af258 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -24,7 +24,6 @@ iree_lit_test_suite( "config_matvec.mlir" "config_winograd.mlir" "conv_pipeline_test_cuda.mlir" - "conv_pipeline_test_rocm.mlir" "convert_to_nvvm.mlir" "convert_to_rocdl.mlir" "create_async_groups.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index 95463a872aa2..d5ad390fa821 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -48,7 +48,6 @@ hal.executable public @main { // CHECK-DAG: memref.alloc() : memref<64x4xf16, #gpu.address_space> // CHECK-DAG: memref.alloc() : memref<64x4xf16, #gpu.address_space> // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1280 step %c4 {{.*}} -> (vector<8x4xf32>) -// CHECK: gpu.barrier // CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<2xf16> // CHECK: vector.transfer_write %[[LHS_RD]], %[[LHS_ALLOC:[A-Za-z0-9]+]] // CHECK: gpu.barrier @@ -109,7 +108,6 @@ hal.executable public @main { // CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> // CHECK-DAG: memref.alloc() : memref<64x32xf16, #gpu.address_space> // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>) -// CHECK: gpu.barrier // CHECK: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<8xf16> // CHECK: vector.transfer_write %[[LHS_RD]] // CHECK: gpu.barrier diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir deleted file mode 100644 index fbc4faa1b2b6..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir +++ /dev/null @@ -1,61 +0,0 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \ -// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \ -// RUN: %s | FileCheck %s - -#layout = #hal.pipeline.layout, - <1, storage_buffer, ReadOnly>, - <2, storage_buffer, ReadOnly>, - <3, storage_buffer> - ]> - ]> -hal.executable private @conv_nchw_dispatch_1 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16 ordinal(0) layout(#layout) attributes { - hal.interface.bindings = [ - #hal.interface.binding<0, 0>, - #hal.interface.binding<0, 1>, 
- #hal.interface.binding<0, 2>, - #hal.interface.binding<0, 3> - ], - translation_info = #iree_codegen.translation_info} { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16() { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 320, 130, 130], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x320x130x130xf16> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<320x320x3x3xf16> - %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [320], strides = [1] : !flow.dispatch.tensor> -> tensor<320xf16> - %7 = tensor.empty() : tensor<2x320x64x64xf16> - %8 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f16) outs(%7 : tensor<2x320x64x64xf16>) -> tensor<2x320x64x64xf16> - %9 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<2> : vector<2xi64>} ins(%4, %5 : tensor<2x320x130x130xf16>, tensor<320x320x3x3xf16>) outs(%8 : tensor<2x320x64x64xf16>) -> tensor<2x320x64x64xf16> - %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<2x320x64x64xf16>, tensor<320xf16>) outs(%7 : tensor<2x320x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config} { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %11 = arith.addf %in, %in_0 : f16 - linalg.yield %11 : f16 - } -> tensor<2x320x64x64xf16> - flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [2, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<2x320x64x64xf16> -> !flow.dispatch.tensor> - return - } - } - } -} - -// TODO: This test reflects a bug related to how the convolution is bufferized -// for the LLVMGPUVectorize pipeline, meaning these local memory allocations are -// not desired. This test should be dropped once the extra buffers have been -// eliminated. - -// CHECK-LABEL: func @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16 -// CHECK-COUNT-3: memref.alloc() : memref<1x1x1x4xf16, #gpu.address_space> -// CHECK-COUNT-3: memref.copy %{{.*}}, %{{.*}} : memref<1x1x1x4xf16, #gpu.address_space> to memref<{{.*}} #hal.descriptor_type>
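
A minimal before/after sketch of the folding enabled by the new `early-subset-transfer-folding` option, following the op sequence described in the commit message. The shapes, the indices `%i`/`%j`, and the padding value `%pad` are illustrative assumptions and are not taken from the tests in this patch:
```
// Assumed context: %source and %dest are tensor<?x?xf32> values, and
// %i, %j are the induction variables of the tiled copy loops.
%c0 = arith.constant 0 : index
%pad = arith.constant 0.0 : f32

// Before folding: the transfers act on slices, so the transfer_write folder
// can later collapse the whole sequence back into extract/insert_slice ops.
%src_slice = tensor.extract_slice %source[%i, %j] [16, 32] [1, 1]
    : tensor<?x?xf32> to tensor<16x32xf32>
%dst_slice = tensor.extract_slice %dest[%i, %j] [16, 32] [1, 1]
    : tensor<?x?xf32> to tensor<16x32xf32>
%read = vector.transfer_read %src_slice[%c0, %c0], %pad
    : tensor<16x32xf32>, vector<16x32xf32>
%write = vector.transfer_write %read, %dst_slice[%c0, %c0]
    : vector<16x32xf32>, tensor<16x32xf32>
%insert = tensor.insert_slice %write into %dest[%i, %j] [16, 32] [1, 1]
    : tensor<16x32xf32> into tensor<?x?xf32>

// After the early subset transfer folding: the slice offsets move into the
// transfer indices, so the transfers act directly on the full tensors and
// the vectorized copy survives later foldings.
%read_folded = vector.transfer_read %source[%i, %j], %pad
    : tensor<?x?xf32>, vector<16x32xf32>
%written = vector.transfer_write %read_folded, %dest[%i, %j]
    : vector<16x32xf32>, tensor<?x?xf32>
```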