Skip to content

Commit d475030

Browse files
committed
[SCEV] Apply loop guards to divisibility tests
Extend applyLoopGuards() to take into account conditions/assumes proving some value %v to be divisible by D by rewriting %v to (%v / D) * D. This lets the loop unroller and the loop vectorizer identify more loops as not requiring remainder loops. Differential Revision: https://reviews.llvm.org/D95521
1 parent 80f5395 commit d475030

File tree

6 files changed

+344
-22
lines changed

6 files changed

+344
-22
lines changed

llvm/include/llvm/Analysis/ScalarEvolution.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1177,6 +1177,9 @@ class ScalarEvolution {
11771177
/// sharpen it.
11781178
void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags);
11791179

1180+
/// Try to apply information from loop guards for \p L to \p Expr.
1181+
const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
1182+
11801183
private:
11811184
/// A CallbackVH to arrange for ScalarEvolution to be notified whenever a
11821185
/// Value is deleted.
@@ -2021,9 +2024,6 @@ class ScalarEvolution {
20212024
/// Assign A and B to LHS and RHS, respectively.
20222025
bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
20232026

2024-
/// Try to apply information from loop guards for \p L to \p Expr.
2025-
const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
2026-
20272027
/// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
20282028
/// `UniqueSCEVs`.
20292029
///

llvm/lib/Analysis/ScalarEvolution.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6887,7 +6887,8 @@ ScalarEvolution::getSmallConstantTripMultiple(const Loop *L,
68876887
// Attempt to factor more general cases. Returns the greatest power of
68886888
// two divisor. If overflow happens, the trip count expression is still
68896889
// divisible by the greatest power of 2 divisor returned.
6890-
return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
6890+
return 1U << std::min((uint32_t)31,
6891+
GetMinTrailingZeros(applyLoopGuards(TCExpr, L)));
68916892

68926893
ConstantInt *Result = TC->getValue();
68936894

@@ -13259,6 +13260,27 @@ class SCEVLoopGuardRewriter : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> {
1325913260
const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
1326013261
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
1326113262
const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) {
13263+
// If we have LHS == 0, check if LHS is computing a property of some unknown
13264+
// SCEV %v; if so, rewrite %v to express that property explicitly.
13265+
const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
13266+
if (Predicate == CmpInst::ICMP_EQ && RHSC &&
13267+
RHSC->getValue()->isNullValue()) {
13268+
// If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
13269+
// explicitly express that.
13270+
const SCEV *URemLHS = nullptr;
13271+
const SCEV *URemRHS = nullptr;
13272+
if (matchURem(LHS, URemLHS, URemRHS)) {
13273+
if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
13274+
Value *V = LHSUnknown->getValue();
13275+
auto Multiple =
13276+
getMulExpr(getUDivExpr(URemLHS, URemRHS), URemRHS,
13277+
(SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
13278+
RewriteMap[V] = Multiple;
13279+
return;
13280+
}
13281+
}
13282+
}
13283+
1326213284
if (!isa<SCEVUnknown>(LHS)) {
1326313285
std::swap(LHS, RHS);
1326413286
Predicate = CmpInst::getSwappedPredicate(Predicate);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5573,7 +5573,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
55735573
const SCEV *ExitCount = SE->getAddExpr(
55745574
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
55755575
const SCEV *Rem = SE->getURemExpr(
5576-
ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5576+
SE->applyLoopGuards(ExitCount, TheLoop),
5577+
SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
55775578
if (Rem->isZero()) {
55785579
// Accept MaxVF if we do not have a tail.
55795580
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");

llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ define void @test_trip_multiple_4(i32 %num) {
99
; CHECK: Loop %for.body: backedge-taken count is (-1 + %num)
1010
; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2
1111
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num)
12-
; CHECK: Loop %for.body: Trip multiple is 1
12+
; CHECK: Loop %for.body: Trip multiple is 4
1313
;
1414
entry:
1515
%u = urem i32 %num, 4
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-count=2 | FileCheck %s
3+
4+
; Make sure the loop is unrolled without a remainder loop when a guard on %p
5+
; and an assume on %q prove the trip count's least significant bit is zero.
6+
7+
define dso_local void @assumeDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
8+
; CHECK-LABEL: @assumeDivisibleTC(
9+
; CHECK-NEXT: entry:
10+
; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 1
11+
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
12+
; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
13+
; CHECK: guarded:
14+
; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
15+
; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
16+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
17+
; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
18+
; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
19+
; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
20+
; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
21+
; CHECK: for.body.preheader:
22+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
23+
; CHECK: for.body:
24+
; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
25+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
26+
; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
27+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP0]], 3
28+
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
29+
; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
30+
; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
31+
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
32+
; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
33+
; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP1]], 3
34+
; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
35+
; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
36+
; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
37+
; CHECK-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[INC_1]], [[N]]
38+
; CHECK-NEXT: br i1 [[CMP1_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]], [[LOOP0:!llvm.loop !.*]]
39+
; CHECK: exit.loopexit:
40+
; CHECK-NEXT: br label [[EXIT]]
41+
; CHECK: exit:
42+
; CHECK-NEXT: ret void
43+
;
44+
entry:
45+
%and = and i32 %p, 1
46+
%cmp = icmp eq i32 %and, 0
47+
br i1 %cmp, label %guarded, label %exit
48+
49+
guarded:
50+
%rem = urem i32 %q, 2
51+
%cmp2 = icmp eq i32 %rem, 0
52+
tail call void @llvm.assume(i1 %cmp2)
53+
%gt = icmp sgt i32 %p, %q
54+
%n = select i1 %gt, i32 %p, i32 %q
55+
%cmp110 = icmp sgt i32 %n, 0
56+
br i1 %cmp110, label %for.body, label %exit
57+
58+
for.body:
59+
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
60+
%arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
61+
%0 = load i8, i8* %arrayidx, align 1
62+
%add = add i8 %0, 3
63+
%arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
64+
store i8 %add, i8* %arrayidx4, align 1
65+
%inc = add nuw nsw i32 %i.011, 1
66+
%cmp1 = icmp slt i32 %inc, %n
67+
br i1 %cmp1, label %for.body, label %exit
68+
69+
exit:
70+
ret void
71+
}
72+
73+
; Make sure the loop is unrolled with a remainder loop when the trip count
74+
; is not provably divisible by the unroll factor (%p & 6 == 0 allows odd %p).
75+
76+
define dso_local void @cannotProveDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
77+
; CHECK-LABEL: @cannotProveDivisibleTC(
78+
; CHECK-NEXT: entry:
79+
; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 6
80+
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
81+
; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
82+
; CHECK: guarded:
83+
; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
84+
; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
85+
; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
86+
; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
87+
; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
88+
; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
89+
; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
90+
; CHECK: for.body.preheader:
91+
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
92+
; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[N]], 1
93+
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 1
94+
; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
95+
; CHECK: for.body.preheader.new:
96+
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[N]], [[XTRAITER]]
97+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
98+
; CHECK: for.body:
99+
; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
100+
; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[FOR_BODY]] ]
101+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
102+
; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
103+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP2]], 3
104+
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
105+
; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
106+
; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
107+
; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
108+
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
109+
; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
110+
; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP3]], 3
111+
; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
112+
; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
113+
; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
114+
; CHECK-NEXT: [[NITER_NSUB_1]] = sub i32 [[NITER_NSUB]], 1
115+
; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp ne i32 [[NITER_NSUB_1]], 0
116+
; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], [[LOOP2:!llvm.loop !.*]]
117+
; CHECK: exit.loopexit.unr-lcssa.loopexit:
118+
; CHECK-NEXT: [[I_011_UNR_PH:%.*]] = phi i32 [ [[INC_1]], [[FOR_BODY]] ]
119+
; CHECK-NEXT: br label [[EXIT_LOOPEXIT_UNR_LCSSA]]
120+
; CHECK: exit.loopexit.unr-lcssa:
121+
; CHECK-NEXT: [[I_011_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[I_011_UNR_PH]], [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
122+
; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
123+
; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY_EPIL_PREHEADER:%.*]], label [[EXIT_LOOPEXIT:%.*]]
124+
; CHECK: for.body.epil.preheader:
125+
; CHECK-NEXT: br label [[FOR_BODY_EPIL:%.*]]
126+
; CHECK: for.body.epil:
127+
; CHECK-NEXT: [[I_011_EPIL:%.*]] = phi i32 [ [[I_011_UNR]], [[FOR_BODY_EPIL_PREHEADER]] ]
128+
; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011_EPIL]]
129+
; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_EPIL]], align 1
130+
; CHECK-NEXT: [[ADD_EPIL:%.*]] = add i8 [[TMP4]], 3
131+
; CHECK-NEXT: [[ARRAYIDX4_EPIL:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011_EPIL]]
132+
; CHECK-NEXT: store i8 [[ADD_EPIL]], i8* [[ARRAYIDX4_EPIL]], align 1
133+
; CHECK-NEXT: [[INC_EPIL:%.*]] = add nuw nsw i32 [[I_011_EPIL]], 1
134+
; CHECK-NEXT: [[CMP1_EPIL:%.*]] = icmp slt i32 [[INC_EPIL]], [[N]]
135+
; CHECK-NEXT: br label [[EXIT_LOOPEXIT_EPILOG_LCSSA:%.*]]
136+
; CHECK: exit.loopexit.epilog-lcssa:
137+
; CHECK-NEXT: br label [[EXIT_LOOPEXIT]]
138+
; CHECK: exit.loopexit:
139+
; CHECK-NEXT: br label [[EXIT]]
140+
; CHECK: exit:
141+
; CHECK-NEXT: ret void
142+
;
143+
entry:
144+
%and = and i32 %p, 6
145+
%cmp = icmp eq i32 %and, 0
146+
br i1 %cmp, label %guarded, label %exit
147+
148+
guarded:
149+
%rem = urem i32 %q, 2
150+
%cmp2 = icmp eq i32 %rem, 0
151+
tail call void @llvm.assume(i1 %cmp2)
152+
%gt = icmp sgt i32 %p, %q
153+
%n = select i1 %gt, i32 %p, i32 %q
154+
%cmp110 = icmp sgt i32 %n, 0
155+
br i1 %cmp110, label %for.body, label %exit
156+
157+
for.body:
158+
%i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
159+
%arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
160+
%0 = load i8, i8* %arrayidx, align 1
161+
%add = add i8 %0, 3
162+
%arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
163+
store i8 %add, i8* %arrayidx4, align 1
164+
%inc = add nuw nsw i32 %i.011, 1
165+
%cmp1 = icmp slt i32 %inc, %n
166+
br i1 %cmp1, label %for.body, label %exit
167+
168+
exit:
169+
ret void
170+
}
171+
172+
declare void @llvm.assume(i1 noundef) nofree nosync nounwind willreturn

0 commit comments

Comments
 (0)