llvm
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
Lines changed: 3 additions & 3 deletions
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
Lines changed: 23 additions & 1 deletion
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
Lines changed: 1 addition & 1 deletion
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
Lines changed: 172 additions & 0 deletions
@@ -1177,6 +1177,9 @@ class ScalarEvolution {
  /// sharpen it.
  void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags);
 
+ /// Try to apply information from loop guards for \p L to \p Expr.
+ const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
+
 private:
  /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a
  /// Value is deleted.
@@ -2021,9 +2024,6 @@ class ScalarEvolution {
  /// Assign A and B to LHS and RHS, respectively.
  bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
 
- /// Try to apply information from loop guards for \p L to \p Expr.
- const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
-
  /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
  /// `UniqueSCEVs`.
  ///
 
@@ -6887,7 +6887,8 @@ ScalarEvolution::getSmallConstantTripMultiple(const Loop *L,
  // Attempt to factor more general cases. Returns the greatest power of
  // two divisor. If overflow happens, the trip count expression is still
  // divisible by the greatest power of 2 divisor returned.
- return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
+ return 1U << std::min((uint32_t)31,
+ GetMinTrailingZeros(applyLoopGuards(TCExpr, L)));
 
  ConstantInt *Result = TC->getValue();
 
@@ -13259,6 +13260,27 @@ class SCEVLoopGuardRewriter : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> {
 const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
  auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
  const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) {
+ // If we have LHS == 0, check if LHS is computing a property of some unknown
+ // SCEV %v which we can rewrite to express that property explicitly.
+ const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+ if (Predicate == CmpInst::ICMP_EQ && RHSC &&
+ RHSC->getValue()->isNullValue()) {
+ // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
+ // explicitly express that.
+ const SCEV *URemLHS = nullptr;
+ const SCEV *URemRHS = nullptr;
+ if (matchURem(LHS, URemLHS, URemRHS)) {
+ if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
+ Value *V = LHSUnknown->getValue();
+ auto Multiple =
+ getMulExpr(getUDivExpr(URemLHS, URemRHS), URemRHS,
+ (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
+ RewriteMap[V] = Multiple;
+ return;
+ }
+ }
+ }
+
  if (!isa<SCEVUnknown>(LHS)) {
  std::swap(LHS, RHS);
  Predicate = CmpInst::getSwappedPredicate(Predicate);
 
@@ -5573,7 +5573,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  const SCEV *ExitCount = SE->getAddExpr(
  BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
  const SCEV *Rem = SE->getURemExpr(
- ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+ SE->applyLoopGuards(ExitCount, TheLoop),
+ SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
  if (Rem->isZero()) {
  // Accept MaxVF if we do not have a tail.
  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
 
@@ -9,7 +9,7 @@ define void @test_trip_multiple_4(i32 %num) {
 ; CHECK: Loop %for.body: backedge-taken count is (-1 + %num)
 ; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2
 ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num)
-; CHECK: Loop %for.body: Trip multiple is 1
+; CHECK: Loop %for.body: Trip multiple is 4
 ;
 entry:
  %u = urem i32 %num, 4
 
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-count=2 | FileCheck %s
+
+; Make sure the loop is unrolled without a remainder loop based on an assumption
+; that the least significant bit is known to be zero.
+
+define dso_local void @assumeDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
+; CHECK-LABEL: @assumeDivisibleTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP0]], 3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
+; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP1]], 3
+; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
+; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[INC_1]], [[N]]
+; CHECK-NEXT: br i1 [[CMP1_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]], [[LOOP0:!llvm.loop !.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %and = and i32 %p, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %guarded, label %exit
+
+guarded:
+ %rem = urem i32 %q, 2
+ %cmp2 = icmp eq i32 %rem, 0
+ tail call void @llvm.assume(i1 %cmp2)
+ %gt = icmp sgt i32 %p, %q
+ %n = select i1 %gt, i32 %p, i32 %q
+ %cmp110 = icmp sgt i32 %n, 0
+ br i1 %cmp110, label %for.body, label %exit
+
+for.body:
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
+ %arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = add i8 %0, 3
+ %arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
+ store i8 %add, i8* %arrayidx4, align 1
+ %inc = add nuw nsw i32 %i.011, 1
+ %cmp1 = icmp slt i32 %inc, %n
+ br i1 %cmp1, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+; Make sure the loop is unrolled with a remainder loop when the trip-count
+; is not provably divisible by the unroll factor.
+
+define dso_local void @cannotProveDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
+; CHECK-LABEL: @cannotProveDivisibleTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 6
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[N]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 1
+; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
+; CHECK: for.body.preheader.new:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[N]], [[XTRAITER]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP2]], 3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
+; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP3]], 3
+; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
+; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT: [[NITER_NSUB_1]] = sub i32 [[NITER_NSUB]], 1
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp ne i32 [[NITER_NSUB_1]], 0
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], [[LOOP2:!llvm.loop !.*]]
+; CHECK: exit.loopexit.unr-lcssa.loopexit:
+; CHECK-NEXT: [[I_011_UNR_PH:%.*]] = phi i32 [ [[INC_1]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT_UNR_LCSSA]]
+; CHECK: exit.loopexit.unr-lcssa:
+; CHECK-NEXT: [[I_011_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[I_011_UNR_PH]], [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY_EPIL_PREHEADER:%.*]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK: for.body.epil.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_EPIL:%.*]]
+; CHECK: for.body.epil:
+; CHECK-NEXT: [[I_011_EPIL:%.*]] = phi i32 [ [[I_011_UNR]], [[FOR_BODY_EPIL_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011_EPIL]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_EPIL]], align 1
+; CHECK-NEXT: [[ADD_EPIL:%.*]] = add i8 [[TMP4]], 3
+; CHECK-NEXT: [[ARRAYIDX4_EPIL:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011_EPIL]]
+; CHECK-NEXT: store i8 [[ADD_EPIL]], i8* [[ARRAYIDX4_EPIL]], align 1
+; CHECK-NEXT: [[INC_EPIL:%.*]] = add nuw nsw i32 [[I_011_EPIL]], 1
+; CHECK-NEXT: [[CMP1_EPIL:%.*]] = icmp slt i32 [[INC_EPIL]], [[N]]
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT_EPILOG_LCSSA:%.*]]
+; CHECK: exit.loopexit.epilog-lcssa:
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %and = and i32 %p, 6
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %guarded, label %exit
+
+guarded:
+ %rem = urem i32 %q, 2
+ %cmp2 = icmp eq i32 %rem, 0
+ tail call void @llvm.assume(i1 %cmp2)
+ %gt = icmp sgt i32 %p, %q
+ %n = select i1 %gt, i32 %p, i32 %q
+ %cmp110 = icmp sgt i32 %n, 0
+ br i1 %cmp110, label %for.body, label %exit
+
+for.body:
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
+ %arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = add i8 %0, 3
+ %arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
+ store i8 %add, i8* %arrayidx4, align 1
+ %inc = add nuw nsw i32 %i.011, 1
+ %cmp1 = icmp slt i32 %inc, %n
+ br i1 %cmp1, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+declare void @llvm.assume(i1 noundef) nofree nosync nounwind willreturn
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | | `@@ -9,7 +9,7 @@ define void @test_trip_multiple_4(i32 %num) {` |
| 9 | 9 | `; CHECK: Loop %for.body: backedge-taken count is (-1 + %num)` |
| 10 | 10 | `; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2` |
| 11 | 11 | `; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num)` |
| 12 | | `-; CHECK: Loop %for.body: Trip multiple is 1` |
| | 12 | `+; CHECK: Loop %for.body: Trip multiple is 4` |
| 13 | 13 | `;` |
| 14 | 14 | `entry:` |
| 15 | 15 | `%u = urem i32 %num, 4` |