Skip to content

Commit a5676a3

Browse files
committed
StructurizeCFG: Set Undef for non-predecessors in setPhiValues()
During structurization process, we may place non-predecessor blocks between the predecessors of a block in the structurized CFG. Take the typical while-break case as an example: ``` /---A(v=...) | / \ ^ B C | \ /| \---L | \ / E (r = phi (v:C)...) ``` After structurization, the CFG would be look like: ``` /---A | |\ | | C | |/ | F1 ^ |\ | | B | |/ | F2 | |\ | | L \ |/ \--F3 | E ``` We can see that block B is placed between the predecessors(C/L) of E. During phi reconstruction, to achieve the same sematics as before, we are reconstructing the PHIs as: F1: v1 = phi (v:C), (undef:A) F3: r = phi (v1:F2), ... But this is also saying that `v1` would be live through B, which is not quite necessary. The idea in the change is to say the incoming value from B is Undef for the PHI in E. With this change, the reconstructed PHI would be: F1: v1 = phi (v:C), (undef:A) F2: v2 = phi (v1:F1), (undef:B) F3: r = phi (v2:F2), ... Reviewed by: sameerds Differential Revision: https://reviews.llvm.org/D132450
1 parent 40e9284 commit a5676a3

File tree

6 files changed

+252
-186
lines changed

6 files changed

+252
-186
lines changed

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 90 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "llvm/ADT/SCCIterator.h"
1313
#include "llvm/ADT/STLExtras.h"
1414
#include "llvm/ADT/SmallPtrSet.h"
15+
#include "llvm/ADT/SmallSet.h"
1516
#include "llvm/ADT/SmallVector.h"
1617
#include "llvm/Analysis/InstructionSimplify.h"
1718
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
@@ -246,6 +247,7 @@ class StructurizeCFG {
246247

247248
SmallVector<RegionNode *, 8> Order;
248249
BBSet Visited;
250+
BBSet FlowSet;
249251

250252
SmallVector<WeakVH, 8> AffectedPhis;
251253
BBPhiMap DeletedPhis;
@@ -278,6 +280,9 @@ class StructurizeCFG {
278280

279281
void addPhiValues(BasicBlock *From, BasicBlock *To);
280282

283+
void findUndefBlocks(BasicBlock *PHIBlock,
284+
const SmallSet<BasicBlock *, 8> &Incomings,
285+
SmallVector<BasicBlock *> &UndefBlks) const;
281286
void setPhiValues();
282287

283288
void simplifyAffectedPhis();
@@ -632,6 +637,67 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
632637
AddedPhis[To].push_back(From);
633638
}
634639

640+
/// When we are reconstructing a PHI inside \p PHIBlock with incoming values
641+
/// from predecessors \p Incomings, we have a chance to mark the available value
642+
/// from some blocks as undefined. The function will find out all such blocks
643+
/// and return in \p UndefBlks.
644+
void StructurizeCFG::findUndefBlocks(
645+
BasicBlock *PHIBlock, const SmallSet<BasicBlock *, 8> &Incomings,
646+
SmallVector<BasicBlock *> &UndefBlks) const {
647+
// We may get a post-structured CFG like below:
648+
//
649+
// | P1
650+
// |/
651+
// F1
652+
// |\
653+
// | N
654+
// |/
655+
// F2
656+
// |\
657+
// | P2
658+
// |/
659+
// F3
660+
// |\
661+
// B
662+
//
663+
// B is the block that has a PHI being reconstructed. P1/P2 are predecessors
664+
// of B before structurization. F1/F2/F3 are flow blocks inserted during
665+
// structurization process. Block N is not a predecessor of B before
666+
// structurization, but are placed between the predecessors(P1/P2) of B after
667+
// structurization. This usually means that threads went to N never take the
668+
// path N->F2->F3->B. For example, the threads take the branch F1->N may
669+
// always take the branch F2->P2. So, when we are reconstructing a PHI
670+
// originally in B, we can safely say the incoming value from N is undefined.
671+
SmallSet<BasicBlock *, 8> VisitedBlock;
672+
SmallVector<BasicBlock *, 8> Stack;
673+
if (PHIBlock == ParentRegion->getExit()) {
674+
for (auto P : predecessors(PHIBlock)) {
675+
if (ParentRegion->contains(P))
676+
Stack.push_back(P);
677+
}
678+
} else {
679+
append_range(Stack, predecessors(PHIBlock));
680+
}
681+
682+
// Do a backward traversal over the CFG, and stop further searching if
683+
// the block is not a Flow. If a block is neither flow block nor the
684+
// incoming predecessor, then the incoming value from the block is
685+
// undefined value for the PHI being reconstructed.
686+
while (!Stack.empty()) {
687+
BasicBlock *Current = Stack.pop_back_val();
688+
if (VisitedBlock.contains(Current))
689+
continue;
690+
691+
VisitedBlock.insert(Current);
692+
if (FlowSet.contains(Current)) {
693+
for (auto P : predecessors(Current))
694+
Stack.push_back(P);
695+
} else if (!Incomings.contains(Current)) {
696+
UndefBlks.push_back(Current);
697+
}
698+
}
699+
}
700+
635701
/// Add the real PHI value as soon as everything is set up
636702
void StructurizeCFG::setPhiValues() {
637703
SmallVector<PHINode *, 8> InsertedPhis;
@@ -643,6 +709,8 @@ void StructurizeCFG::setPhiValues() {
643709
if (!DeletedPhis.count(To))
644710
continue;
645711

712+
SmallVector<BasicBlock *> UndefBlks;
713+
bool CachedUndefs = false;
646714
PhiMap &Map = DeletedPhis[To];
647715
for (const auto &PI : Map) {
648716
PHINode *Phi = PI.first;
@@ -651,15 +719,30 @@ void StructurizeCFG::setPhiValues() {
651719
Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
652720
Updater.AddAvailableValue(To, Undef);
653721

654-
NearestCommonDominator Dominator(DT);
655-
Dominator.addBlock(To);
722+
SmallSet<BasicBlock *, 8> Incomings;
723+
SmallVector<BasicBlock *> ConstantPreds;
656724
for (const auto &VI : PI.second) {
725+
Incomings.insert(VI.first);
657726
Updater.AddAvailableValue(VI.first, VI.second);
658-
Dominator.addAndRememberBlock(VI.first);
727+
if (isa<Constant>(VI.second))
728+
ConstantPreds.push_back(VI.first);
659729
}
660730

661-
if (!Dominator.resultIsRememberedBlock())
662-
Updater.AddAvailableValue(Dominator.result(), Undef);
731+
if (!CachedUndefs) {
732+
findUndefBlocks(To, Incomings, UndefBlks);
733+
CachedUndefs = true;
734+
}
735+
736+
for (auto UB : UndefBlks) {
737+
// If this undef block is dominated by any predecessor(before
738+
// structurization) of reconstructed PHI with constant incoming value,
739+
// don't mark the available value as undefined. Setting undef to such
740+
// block will stop us from getting optimal phi insertion.
741+
if (any_of(ConstantPreds,
742+
[&](BasicBlock *CP) { return DT->dominates(CP, UB); }))
743+
continue;
744+
Updater.AddAvailableValue(UB, Undef);
745+
}
663746

664747
for (BasicBlock *FI : From)
665748
Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
@@ -759,6 +842,7 @@ BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
759842
Order.back()->getEntry();
760843
BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
761844
Func, Insert);
845+
FlowSet.insert(Flow);
762846
DT->addNewBlock(Flow, Dominator);
763847
ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
764848
return Flow;
@@ -1103,6 +1187,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
11031187
Loops.clear();
11041188
LoopPreds.clear();
11051189
LoopConds.clear();
1190+
FlowSet.clear();
11061191

11071192
return true;
11081193
}

llvm/test/CodeGen/AMDGPU/multilevel-break.ll

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,34 +9,32 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
99
; OPT-NEXT: main_body:
1010
; OPT-NEXT: br label [[LOOP_OUTER:%.*]]
1111
; OPT: LOOP.outer:
12-
; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
13-
; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ]
12+
; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
13+
; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP3:%.*]], [[FLOW1]] ]
1414
; OPT-NEXT: br label [[LOOP:%.*]]
1515
; OPT: LOOP:
16-
; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
17-
; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ]
18-
; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ]
16+
; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
17+
; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP3]], [[FLOW]] ]
1918
; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]]
20-
; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
21-
; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
22-
; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
23-
; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
19+
; OPT-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
20+
; OPT-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
21+
; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
22+
; OPT-NEXT: br i1 [[TMP1]], label [[ENDIF:%.*]], label [[FLOW]]
2423
; OPT: Flow:
25-
; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
26-
; OPT-NEXT: [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ]
27-
; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
28-
; OPT-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
29-
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
30-
; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]])
31-
; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]])
32-
; OPT-NEXT: [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]])
33-
; OPT-NEXT: br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]]
24+
; OPT-NEXT: [[TMP3]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ undef, [[LOOP]] ]
25+
; OPT-NEXT: [[TMP4:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
26+
; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
27+
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
28+
; OPT-NEXT: [[TMP6]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN]])
29+
; OPT-NEXT: [[TMP7:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP6]])
30+
; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP4]], i64 [[PHI_BROKEN2]])
31+
; OPT-NEXT: br i1 [[TMP7]], label [[FLOW1]], label [[LOOP]]
3432
; OPT: Flow1:
35-
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
36-
; OPT-NEXT: [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]])
37-
; OPT-NEXT: br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]]
33+
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
34+
; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]])
35+
; OPT-NEXT: br i1 [[TMP9]], label [[IF:%.*]], label [[LOOP_OUTER]]
3836
; OPT: IF:
39-
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]])
37+
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
4038
; OPT-NEXT: ret void
4139
; OPT: ENDIF:
4240
; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
@@ -156,7 +154,7 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
156154
; OPT-NEXT: [[CMP2]] = icmp sge i32 [[TMP]], [[LOAD2]]
157155
; OPT-NEXT: br label [[FLOW3]]
158156
; OPT: Flow5:
159-
; OPT-NEXT: [[TMP9]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE0]] ], [ [[TMP6]], [[LEAFBLOCK]] ]
157+
; OPT-NEXT: [[TMP9]] = phi i32 [ [[LSR_IV_NEXT]], [[CASE0]] ], [ undef, [[LEAFBLOCK]] ]
160158
; OPT-NEXT: [[TMP10]] = phi i1 [ [[CMP1]], [[CASE0]] ], [ [[TMP7]], [[LEAFBLOCK]] ]
161159
; OPT-NEXT: br label [[FLOW4]]
162160
; OPT: bb9:

0 commit comments

Comments
 (0)