@@ -98,10 +98,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
9898 }
9999
100100 if (ci.clusterSize >= 32 ) {
101- // auto permArg = builder.getInt32(15);
102- // auto rowMask = builder.getInt32("0xa");
103- // auto bankMask = builder.getInt32("0xf");
104- // auto boundCtrl = builder.getBoolAttr(false);
105101 auto permArg = b.getIntegerAttr (b.getIntegerType (32 ), 15 );
106102 Value dppResult = b.create <amdgpu::DPPOp>(
107103 loc, result.getType (), result, result, amdgpu::DPPPerm::row_bcast_15,
@@ -111,10 +107,6 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
111107 }
112108
113109 if (ci.clusterSize == 64 ) {
114- // auto permArg = builder.getInt32(31);
115- // auto rowMask = builder.getInt32("0xc");
116- // auto bankMask = builder.getInt32("0xf");
117- // auto boundCtrl = builder.getBoolAttr(false);
118110 auto permArg = b.getIntegerAttr (b.getIntegerType (32 ), 31 );
119111 Value dppResult = b.create <amdgpu::DPPOp>(
120112 loc, result.getType (), result, result, amdgpu::DPPPerm::row_bcast_31,
@@ -123,9 +115,9 @@ Value createSubgroupDPPReduction(OpBuilder &b, Location loc, Value input,
123115 result, dppResult);
124116 }
125117
126- // // read lane 63 with the final result.
127- // auto lane = b.getIntegerAttr(b.getIntegerType(32) , 63);
128- // result = b.create<ROCDL::ReadLaneOp >(loc, input.getType(), result, lane );
118+ auto int32Type = IntegerType::get (b. getContext (), 32 );
119+ Value lane63 = b.create <LLVM::ConstantOp>(loc, int32Type , 63 );
120+ result = b.create <ROCDL::ReadlaneOp >(loc, input.getType (), result, lane63 );
129121 assert (result.getType () == input.getType ());
130122 return result;
131123}
@@ -170,16 +162,16 @@ struct ConvertGPUToAMDGPUPass
170162 void runOnOperation () override {
171163 RewritePatternSet patterns (&getContext ());
172164 int subgroupSizeInt = static_cast <int >(subgroupSize);
173- populateSubgroupReduceLoweringPatterns (patterns, subgroupSizeInt,
165+ populateAMDGPUOptimizedSubgroupReducePatterns (patterns, subgroupSizeInt,
174166 PatternBenefit (1 ));
175167 walkAndApplyPatterns (getOperation (), std::move (patterns));
176168 }
177169};
178170} // namespace
179171
180- void mlir::populateSubgroupReduceLoweringPatterns (RewritePatternSet &patterns,
172+ void mlir::populateAMDGPUOptimizedSubgroupReducePatterns (RewritePatternSet &patterns,
181173 unsigned subgroupSize,
182174 PatternBenefit benefit) {
183175 patterns.add <ScalarSubgroupReduceToShuffles>(
184176 patterns.getContext (), subgroupSize, /* matchClustered=*/ true , benefit);
185- }
177+ }
0 commit comments