
Conversation

@charithaintc
Contributor

This PR adds tests covering some useful corner cases:

  1. More tests for `vector.shape_cast` distribution.
  2. Tests for the `MoveFuncBodyToWarpOp` pattern, which could not be exercised in isolation before (see the wiring sketch below).
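The isolated coverage in item 2 comes from exposing `populateXeGPUMoveFuncBodyToWarpOpPatterns` and adding a test-only pass that applies just that pattern. A rough sketch of how the new entry point can be wired into such a pass is below; the pass class name is hypothetical, and only the populate function and the greedy driver call come from this PR:

```cpp
// Sketch of a minimal pass that runs only the MoveFuncBodyToWarpOp pattern.
// The pass name is hypothetical; the populate* entry point is the one added
// in this PR (declared in XeGPU/Transforms/Transforms.h).
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace {
struct StandaloneMoveFuncBodyPass
    : public mlir::PassWrapper<StandaloneMoveFuncBodyPass,
                               mlir::OperationPass<mlir::gpu::GPUModuleOp>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StandaloneMoveFuncBodyPass)

  void getDependentDialects(mlir::DialectRegistry &registry) const override {
    registry.insert<mlir::xegpu::XeGPUDialect, mlir::gpu::GPUDialect>();
  }

  void runOnOperation() override {
    mlir::RewritePatternSet patterns(&getContext());
    // Wraps each gpu.func body into a gpu.warp_execute_on_lane_0 region.
    mlir::xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
    // Best-effort greedy application, mirroring the test pass added here.
    (void)mlir::applyPatternsGreedily(getOperation(), std::move(patterns));
  }
};
} // namespace
```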
@llvmbot
Member

llvmbot commented Oct 8, 2025

@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-gpu

Author: Charitha Saumya (charithaintc)

Changes

This PR adds tests covering some useful corner cases:

  1. More tests for `vector.shape_cast` distribution.
  2. Tests for the `MoveFuncBodyToWarpOp` pattern, which could not be exercised in isolation before.

Full diff: https://github.com/llvm/llvm-project/pull/162543.diff

5 Files Affected:

  • (modified) mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h (+4)
  • (modified) mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp (+7-3)
  • (added) mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir (+63)
  • (modified) mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir (+81-2)
  • (modified) mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp (+32)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index b74c15e5b7ac1..a480195eebd00 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -64,6 +64,10 @@ void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU SIMT distribution into `patterns`.
 void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
+/// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.
+void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU workgroup to subgroup distribution into
+/// `patterns`.
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
 
 /// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index f1dbc5ddb2022..26770b3c003ea 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -195,8 +195,7 @@ static bool requireTranspose(const xegpu::LayoutAttr layout,
 ///   }
 ///   return %0
 /// }
-struct MoveFuncBodyToWarpExecuteOnLane0
-    : public OpRewritePattern<gpu::GPUFuncOp> {
+struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
   using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                 PatternRewriter &rewriter) const override {
@@ -1447,6 +1446,11 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
       /*pattern benefit=*/highPatternBenefit);
 }
 
+void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<MoveFuncBodyToWarpOp>(patterns.getContext());
+}
+
 void XeGPUSubgroupDistributePass::runOnOperation() {
   // Step 1: Attach layouts to op operands.
   // TODO: Following assumptions are made:
@@ -1473,7 +1477,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   //    gpu.warp_execute_on_lane_0 operation.
   {
     RewritePatternSet patterns(&getContext());
-    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
 
     if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
       signalPassFailure();
diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
new file mode 100644
index 0000000000000..d289d73e863c7
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
@@ -0,0 +1,63 @@
+// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
+
+gpu.module @test {
+gpu.func @empty() {
+  gpu.return
+}
+}
+
+// CHECK-LABEL: gpu.func @empty() {
+// CHECK-NEXT: gpu.return
+// CHECK-NEXT: }
+
+// -----
+gpu.module @test {
+gpu.func @gemm(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %2 = xegpu.load_nd %0[%c0, %c0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %3 = xegpu.load_nd %1[%c0, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = xegpu.dpas %2, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+}
+
+// CHECK-LABEL: gpu.func @gemm(
+// CHECK: %[[ARG0:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<16x16xf16>,
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref<8x16xf32>) {
+// CHECK: %[[LANEID:.*]] = gpu.lane_id
+// CHECK-NEXT: gpu.warp_execute_on_lane_0(%[[LANEID]])[16]
+// CHECK-SAME: args(%[[ARG0]], %[[ARG1]], %[[ARG2]] : memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) {
+// CHECK: ^bb0(%[[ARG3:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG4:[a-zA-Z0-9]+]]: memref<16x16xf16>,
+// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: memref<8x16xf32>):
+// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG3]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG4]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T1]][{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[T2]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+// CHECK-NEXT: %[[T5:.*]] = xegpu.dpas %[[T3]], %[[T4]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK-NEXT: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG5]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T5]], %[[T6]][%{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: gpu.return
+
+// -----
+gpu.module @test {
+gpu.func @already_in_warp_op() {
+  %laneid = gpu.lane_id
+  gpu.warp_execute_on_lane_0(%laneid)[16] {
+    "some_op"() : () -> ()
+    gpu.yield
+  }
+  gpu.return
+}
+}
+
+// CHECK-LABEL: gpu.func @already_in_warp_op() {
+// CHECK: %[[LANEID:.*]] = gpu.lane_id
+// CHECK: gpu.warp_execute_on_lane_0(%[[LANEID]])[16] {
+// CHECK: "some_op"() : () -> ()
+// CHECK: }
+// CHECK: gpu.return
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 40b66d18cc47f..f233dff609f2b 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -530,7 +530,7 @@ gpu.module @xevm_module{
 // CHECK-NEXT: }
 // CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32>
 gpu.module @xevm_module{
-  gpu.func @vector_transpose(%arg0: memref<2x16xf32>, %laneid: index) {
+  gpu.func @vector_transpose(%laneid: index) {
     %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
       %cst = "some_op"()
         {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
@@ -556,7 +556,7 @@ gpu.module @xevm_module{
 // CHECK: }
 // CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16>
 gpu.module @xevm_module{
-  gpu.func @vector_bitcast(%arg0: memref<4x16xi16>, %laneid: index) {
+  gpu.func @vector_bitcast(%laneid: index) {
     %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
       %cst = "some_op"()
         {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
@@ -573,3 +573,82 @@ gpu.module @xevm_module{
     gpu.return
   }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing
+// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) {
+// CHECK: gpu.yield %{{.*}} : vector<1x16xf32>, vector<16xf32>
+// CHECK: }
+// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1xf32> to vector<1x1xf32>
+gpu.module @xevm_module {
+  gpu.func @vector_shapecast_rank_increasing(%laneid: index) {
+    %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
+      %cst = "some_op"()
+        {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
+        : () -> (vector<16xf32>)
+      %cast = vector.shape_cast %cst
+        {
+          layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
+          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+        }
+        : vector<16xf32> to vector<1x16xf32>
+      gpu.yield %cast : vector<1x16xf32>
+    }
+    "some_user_op"(%r) : (vector<1x1xf32>) -> ()
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_shapecast_rank_reducing(
+// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1xf32>, vector<1x1xf32>) {
+// CHECK: gpu.yield %{{.*}} : vector<16xf32>, vector<1x16xf32>
+// CHECK: }
+// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1x1xf32> to vector<1xf32>
gpu.module @xevm_module {
+  gpu.func @vector_shapecast_rank_reducing(%laneid: index) {
+    %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
+      %cst = "some_op"()
+        {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+        : () -> (vector<1x16xf32>)
+      %cast = vector.shape_cast %cst
+        {
+          layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+          layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
+        }
+        : vector<1x16xf32> to vector<16xf32>
+      gpu.yield %cast : vector<16xf32>
+    }
+    "some_user_op"(%r) : (vector<1xf32>) -> ()
+    gpu.return
+  }
+}
+
+// -----
+// NOTE: Layouts are still valid, but distribution still requires a slice layout for the operand.
+//
+// CHECK-LABEL: gpu.func @vector_shapecast_unsupported
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>) {
+// CHECK: %[[T1:.*]] = vector.shape_cast %{{.*}} : vector<16xf32> to vector<1x16xf32>
+// CHECK: gpu.yield %[[T1]] : vector<1x16xf32>
+// CHECK: }
+// CHECK: "some_user_op"(%[[W]]) : (vector<1x1xf32>) -> ()
+// CHECK: gpu.return
+gpu.module @xevm_module {
+  gpu.func @vector_shapecast_unsupported(%laneid: index) {
+    %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
+      %cst = "some_op"()
+        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> }
+        : () -> (vector<16xf32>)
+      %cast = vector.shape_cast %cst
+        {
+          layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+        }
+        : vector<16xf32> to vector<1x16xf32>
+      gpu.yield %cast : vector<1x16xf32>
+    }
+    "some_user_op"(%r) : (vector<1x1xf32>) -> ()
+    gpu.return
+  }
+}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 6ba7a004b7d31..1869a9af34493 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
@@ -247,6 +248,36 @@ struct TestXeGPUSGDistribute
   }
 };
 
+struct TestXeGPUMoveFuncBodyToWarpOp
+    : public PassWrapper<TestXeGPUMoveFuncBodyToWarpOp,
+                         OperationPass<gpu::GPUModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUMoveFuncBodyToWarpOp)
+
+  StringRef getArgument() const final {
+    return "test-xegpu-move-func-to-warp-op";
+  }
+
+  StringRef getDescription() const final {
+    return "Test the implementation of XeGPU move gpu function body to "
+           "WarpExecuteOnLane0 op.";
+  }
+
+  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+    registry.insert<xegpu::XeGPUDialect>();
+    registry.insert<gpu::GPUDialect>();
+  }
+
+  TestXeGPUMoveFuncBodyToWarpOp() = default;
+  TestXeGPUMoveFuncBodyToWarpOp(const TestXeGPUMoveFuncBodyToWarpOp &pass) =
+      default;
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+  }
+};
+
 struct TestXeGPULayoutInterface
     : public PassWrapper<TestXeGPULayoutInterface,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -312,6 +343,7 @@ void registerTestXeGPULowerings() {
   PassRegistration<TestXeGPUUnrollingPatterns>();
   PassRegistration<TestXeGPULayoutInterface>();
   PassRegistration<TestXeGPUSGDistribute>();
+  PassRegistration<TestXeGPUMoveFuncBodyToWarpOp>();
 }
 } // namespace test
 } // namespace mlir
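For context on how the new entry point composes with the existing one: a downstream pipeline could, in principle, chain the two public populate functions from `Transforms.h`. The pass below is a hypothetical sketch only; the real `XeGPUSubgroupDistributePass` additionally attaches layouts to op operands before distribution, which is omitted here.

```cpp
// Hypothetical sketch: chain the func-body hoisting patterns with the XeGPU
// SIMT distribution patterns. Not the actual pass pipeline; the real
// XeGPUSubgroupDistributePass performs a layout-assignment step first.
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace {
struct SketchXeGPUSimtPass
    : public mlir::PassWrapper<SketchXeGPUSimtPass,
                               mlir::OperationPass<mlir::gpu::GPUModuleOp>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SketchXeGPUSimtPass)

  void runOnOperation() override {
    // Stage 1: hoist each gpu.func body into gpu.warp_execute_on_lane_0.
    {
      mlir::RewritePatternSet patterns(&getContext());
      mlir::xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
      if (mlir::failed(
              mlir::applyPatternsGreedily(getOperation(), std::move(patterns))))
        return signalPassFailure();
    }
    // Stage 2: apply the XeGPU SIMT distribution patterns.
    {
      mlir::RewritePatternSet patterns(&getContext());
      mlir::xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
      if (mlir::failed(
              mlir::applyPatternsGreedily(getOperation(), std::move(patterns))))
        return signalPassFailure();
    }
  }
};
} // namespace
```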
Contributor

@Jianhui-Li Jianhui-Li left a comment


LGTM

Contributor

@adam-smnk adam-smnk left a comment


Thanks for increasing coverage 👍

@charithaintc charithaintc merged commit bd6da1f into llvm:main Oct 10, 2025
5 of 9 checks passed
DharuniRAcharya pushed a commit to DharuniRAcharya/llvm-project that referenced this pull request Oct 13, 2025
…2543) This PR adds some tests for covering some useful corner cases. 1. more tests for `vector.shape_cast` distribution. 2. testing for `MoveFuncBodyToWarpOp` pattern that was not possible before.
akadutta pushed a commit to akadutta/llvm-project that referenced this pull request Oct 14, 2025
…2543) This PR adds some tests for covering some useful corner cases. 1. more tests for `vector.shape_cast` distribution. 2. testing for `MoveFuncBodyToWarpOp` pattern that was not possible before.
