
Conversation

Contributor

@fhahn fhahn commented Jul 16, 2025

Building on top of #148817, introduce
a new abstract LastActiveLane opcode that gets lowered to
Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1).

When folding the tail, update all extracts for uses outside the loop to
extract the value of the last active lane.

See also #148603
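To make the lowering above concrete, the sketch below spells out the last-active-lane computation using plain IRBuilder calls. This is an illustration only: the PR itself emits the equivalent sequence through VPlan recipes (see the VPlanPredicator.cpp hunk in the patch below), and the ZeroIsPoison=false argument reflects the fix discussed later in this thread for all-true header masks, not the code as originally posted.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Given a loop header mask (a vector of i1), compute the index of the last
// active lane: negate the mask, count the leading active lanes by taking
// cttz.elts of the negation, then subtract 1.
static Value *emitLastActiveLaneIndex(IRBuilder<> &B, Value *HeaderMask) {
  Value *NotMask = B.CreateNot(HeaderMask);             // Not(Mask)
  Value *FirstInactive = B.CreateCountTrailingZeroElems(
      B.getInt64Ty(), NotMask,
      /*ZeroIsPoison=*/false);                          // FirstActiveLane(NotMask)
  return B.CreateSub(FirstInactive, B.getInt64(1));     // Sub(result, 1)
}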

Member

llvmbot commented Jul 16, 2025

@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

Building on top of #148817, use
ExtractLane + FirstActiveLane to support vectorizing external users when
tail-folding.

Currently marked as WIP as there is a regression when
-prefer-predicate-over-epilogue=predicate-else-scalar-epilogue is used:
we bail out while building VPlans, so we cannot recover and switch to
non-tail-folding.

Ideally we would have built both VPlans
(#148882).

See also #148603

Depends on #148817 (included in
the PR).


Patch is 134.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149042.diff

17 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (-18)
  • (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+20-11)
  • (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+4)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+2)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp (+47-5)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+45-3)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+4-4)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (+7)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll (+56-6)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll (+43-9)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll (+95-38)
  • (modified) llvm/test/Transforms/LoopVectorize/X86/small-size.ll (+71-10)
  • (modified) llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll (+73-15)
  • (modified) llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll (+15-8)
  • (modified) llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll (+334-32)
  • (modified) llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll (+108-88)
  • (modified) llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll (+25-22)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 969d225c6ef2e..b3b5f2aa39540 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1929,24 +1929,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const { for (const auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); - // TODO: handle non-reduction outside users when tail is folded by masking. - for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop or - // are the live-out of a reduction. - if (ReductionLiveOuts.count(AE)) - continue; - for (User *U : AE->users()) { - Instruction *UI = cast<Instruction>(U); - if (TheLoop->contains(UI)) - continue; - LLVM_DEBUG( - dbgs() - << "LV: Cannot fold tail by masking, loop has an outside user for " - << *UI << "\n"); - return false; - } - } - for (const auto &Entry : getInductionVars()) { PHINode *OrigPhi = Entry.first; for (User *U : OrigPhi->users()) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ceeabd65cced3..dbd97cdad607f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8446,7 +8446,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, /// exit block. The penultimate value of recurrences is fed to their LCSSA phi /// users in the original exit block using the VPIRInstruction wrapping to the /// LCSSA phi. -static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) { +static bool addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) { + using namespace llvm::VPlanPatternMatch; + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); auto *ScalarPHVPBB = Plan.getScalarPreheader(); auto *MiddleVPBB = Plan.getMiddleBlock(); @@ -8465,6 +8467,15 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) { assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && "Cannot handle loops with uncountable early exits"); + // TODO: Support ExtractLane of last-active-lane with first-order + // recurrences. + + if (any_of(FOR->users(), [FOR](VPUser *U) { + return match(U, m_VPInstruction<VPInstruction::ExtractLane>( + m_VPValue(), m_Specific(FOR))); + })) + return false; + // This is the second phase of vectorizing first-order recurrences, creating // extract for users outside the loop. An overview of the transformation is // described below. Suppose we have the following loop with some use after @@ -8536,10 +8547,10 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) { // Extract the penultimate value of the recurrence and use it as operand for // the VPIRInstruction modeling the phi. for (VPUser *U : FOR->users()) { - using namespace llvm::VPlanPatternMatch; if (!match(U, m_VPInstruction<VPInstruction::ExtractLastElement>( m_Specific(FOR)))) continue; + // For VF vscale x 1, if vscale = 1, we are unable to extract the // penultimate value of the recurrence. Instead we rely on the existing // extract of the last element from the result of @@ -8547,13 +8558,14 @@ static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range) { // TODO: Consider vscale_range info and UF. 
if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne, Range)) - return; + return true; VPValue *PenultimateElement = MiddleBuilder.createNaryOp( VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()}, {}, "vector.recur.extract.for.phi"); cast<VPInstruction>(U)->replaceAllUsesWith(PenultimateElement); } } + return true; } VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( @@ -8758,7 +8770,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( R->setOperand(1, WideIV->getStepValue()); } - addExitUsersForFirstOrderRecurrences(*Plan, Range); + if (!addExitUsersForFirstOrderRecurrences(*Plan, Range)) + return nullptr; DenseMap<VPValue *, VPValue *> IVEndValues; addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); @@ -9170,7 +9183,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult); if (match(U, m_VPInstruction<VPInstruction::ExtractLastElement>( - m_VPValue()))) + m_VPValue())) || + match(U, m_VPInstruction<VPInstruction::ExtractLane>(m_VPValue(), + m_VPValue()))) cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult); } @@ -10022,12 +10037,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); - if (LVL.hasUncountableEarlyExit() && UserIC != 1) { - UserIC = 1; - reportVectorizationInfo("Interleaving not supported for loops " - "with uncountable early exits", - "InterleaveEarlyExitDisabled", ORE, L); - } // Plan how to best vectorize. LVP.plan(UserVF, UserIC); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 703cfe969577d..a81dc0bb0bef6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1012,6 +1012,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, ReductionStartVector, // Creates a step vector starting from 0 to VF with a step of 1. StepVector, + /// Extracts a single lane (first operand) from a set of vector operands. + /// The lane specifies an index into a vector formed by combining all vector + /// operands (all operands after the first one). 
+ ExtractLane, }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index b27a7ffeed208..a0f5f10beb9fa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -109,6 +109,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: return SetResultTyFromOp(); + case VPInstruction::ExtractLane: + return inferScalarType(R->getOperand(1)); case VPInstruction::FirstActiveLane: return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index f0cab79197b4d..9a1e25ee2f28c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -14,11 +14,13 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanCFG.h" +#include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; +using namespace VPlanPatternMatch; namespace { class VPPredicator { @@ -42,11 +44,6 @@ class VPPredicator { /// possibly inserting new recipes at \p Dst (using Builder's insertion point) VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst); - /// Returns the *entry* mask for \p VPBB. - VPValue *getBlockInMask(VPBasicBlock *VPBB) const { - return BlockMaskCache.lookup(VPBB); - } - /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not /// already have a mask. void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) { @@ -66,6 +63,11 @@ class VPPredicator { } public: + /// Returns the *entry* mask for \p VPBB. + VPValue *getBlockInMask(VPBasicBlock *VPBB) const { + return BlockMaskCache.lookup(VPBB); + } + /// Returns the precomputed predicate of the edge from \p Src to \p Dst. VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const { return EdgeMaskCache.lookup({Src, Dst}); @@ -300,5 +302,45 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) { PrevVPBB = VPBB; } + + // If we folded the tail and introduced a header mask, any extract of the last element must be updated to only extract the last-active-lane of the header mask. + if (FoldTail) { + assert(Plan.getExitBlocks().size() == 1 && + "only a single-exit block is supported currently"); + VPBasicBlock *EB = Plan.getExitBlocks().front(); + assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() && + "the exit block must have middle block as single predecessor"); + + VPValue *LastActiveLane = nullptr; + VPBuilder B(Plan.getMiddleBlock()->getTerminator()); + for (auto &P : EB->phis()) { + auto *ExitIRI = cast<VPIRPhi>(&P); + VPValue *Inc = ExitIRI->getIncomingValue(0); + VPValue *Op; + if (!match(Inc, m_VPInstruction<VPInstruction::ExtractLastElement>( + m_VPValue(Op)))) + continue; + + if (!LastActiveLane) { + // Compute the index of the last active lane, by getting the + // first-active-lane of the negated header mask (which is the first lane + // the original header mask was false) and subtract 1. 
+ VPValue *HeaderMask = Predicator.getBlockInMask( + Plan.getVectorLoopRegion()->getEntryBasicBlock()); + LastActiveLane = B.createNaryOp( + Instruction::Sub, + {B.createNaryOp(VPInstruction::FirstActiveLane, + {B.createNot(HeaderMask)}), + Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get( + Plan.getScalarHeader()->getIRBasicBlock()->getContext(), + 64), + 1))}); + } + auto *Ext = + B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op}); + Inc->replaceAllUsesWith(Ext); + } + } return Predicator.getBlockMaskCache(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1664bcc3881aa..cd95f648ffc11 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -862,6 +862,31 @@ Value *VPInstruction::generate(VPTransformState &State) { Res = Builder.CreateOr(Res, State.get(Op)); return Builder.CreateOrReduce(Res); } + case VPInstruction::ExtractLane: { + Value *LaneToExtract = State.get(getOperand(0), true); + Type *IdxTy = State.TypeAnalysis.inferScalarType(getOperand(0)); + Value *Res = nullptr; + Value *RuntimeVF = getRuntimeVF(State.Builder, IdxTy, State.VF); + + for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) { + Value *VectorStart = + Builder.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Idx - 1)); + Value *VectorIdx = Idx == 1 + ? LaneToExtract + : Builder.CreateSub(LaneToExtract, VectorStart); + Value *Ext = State.VF.isScalar() + ? State.get(getOperand(Idx)) + : Builder.CreateExtractElement( + State.get(getOperand(Idx)), VectorIdx); + if (Res) { + Value *Cmp = Builder.CreateICmpUGE(LaneToExtract, VectorStart); + Res = Builder.CreateSelect(Cmp, Ext, Res); + } else { + Res = Ext; + } + } + return Res; + } case VPInstruction::FirstActiveLane: { if (getNumOperands() == 1) { Value *Mask = State.get(getOperand(0)); @@ -876,8 +901,17 @@ Value *VPInstruction::generate(VPTransformState &State) { unsigned LastOpIdx = getNumOperands() - 1; Value *Res = nullptr; for (int Idx = LastOpIdx; Idx >= 0; --Idx) { - Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name); + Value *TrailingZeros = + State.VF.isScalar() + ? Builder.CreateZExt( + Builder.CreateICmpEQ(State.get(getOperand(Idx)), + Builder.getInt1(0)), + Builder.getInt64Ty()) + : Builder.CreateCountTrailingZeroElems( + // Value *TrailingZeros = + // Builder.CreateCountTrailingZeroElems( + Builder.getInt64Ty(), State.get(getOperand(Idx)), true, + Name); Value *Current = Builder.CreateAdd( Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); if (Res) { @@ -920,7 +954,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } switch (getOpcode()) { - case Instruction::ExtractElement: { + case Instruction::ExtractElement: + case VPInstruction::ExtractLane: { // Add on the cost of extracting the element. 
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, @@ -982,6 +1017,7 @@ bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractLastElement || getOpcode() == VPInstruction::ExtractPenultimateElement || getOpcode() == Instruction::ExtractElement || + getOpcode() == VPInstruction::ExtractLane || getOpcode() == VPInstruction::FirstActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || @@ -1040,6 +1076,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::ExtractLane: case VPInstruction::ExtractLastElement: case VPInstruction::ExtractPenultimateElement: case VPInstruction::FirstActiveLane: @@ -1088,6 +1125,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::ComputeAnyOfResult: case VPInstruction::ComputeFindIVResult: return Op == getOperand(1); + case VPInstruction::ExtractLane: + return Op == getOperand(0); }; llvm_unreachable("switch should return"); } @@ -1166,6 +1205,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BuildVector: O << "buildvector"; break; + case VPInstruction::ExtractLane: + O << "extract-lane"; + break; case VPInstruction::ExtractLastElement: O << "extract-last-element"; break; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 6a3b3e6e41955..338001820d593 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -774,10 +774,10 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan, using namespace VPlanPatternMatch; VPValue *Incoming, *Mask; - if (!match(Op, m_VPInstruction<Instruction::ExtractElement>( - m_VPValue(Incoming), + if (!match(Op, m_VPInstruction<VPInstruction::ExtractLane>( m_VPInstruction<VPInstruction::FirstActiveLane>( - m_VPValue(Mask))))) + m_VPValue(Mask)), + m_VPValue(Incoming)))) return nullptr; auto *WideIV = getOptimizableIVOf(Incoming); @@ -2831,7 +2831,7 @@ void VPlanTransforms::handleUncountableEarlyExit( VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr, "first.active.lane"); IncomingFromEarlyExit = EarlyExitB.createNaryOp( - Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane}, + VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit}, nullptr, "early.exit.value"); ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index b89cd21595efd..871e37ef3966a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -363,6 +363,13 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { continue; } VPValue *Op0; + if (match(&R, m_VPInstruction<VPInstruction::ExtractLane>( + m_VPValue(Op0), m_VPValue(Op1)))) { + addUniformForAllParts(cast<VPInstruction>(&R)); + for (unsigned Part = 1; Part != UF; ++Part) + R.addOperand(getValueForPart(Op1, Part)); + continue; + } if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>( m_VPValue(Op0))) || match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>( diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll index 61ef3cef603fa..c7be4593c6a9c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll @@ -14,15 +14,16 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 510, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 64 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 510, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 510, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 64 ; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 3, [[N_VEC]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: @@ -30,13 +31,43 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP36:%.*]] = mul nuw i64 [[TMP29]], 32 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP36]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP15]], 48 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP38]] ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP37]], align 1 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP54]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementp... [truncated] 
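As a side note on the new ExtractLane opcode documented in the VPlan.h hunk above: its lane operand indexes into the notional concatenation of all vector operands that follow it, which is what lets the unroller simply append one vector operand per part. A tiny, self-contained simulation of that semantics (plain C++, not the actual codegen in VPlanRecipes.cpp, which selects between parts with icmp/select) could look like this:

#include <cassert>
#include <cstddef>
#include <vector>

// Simulate ExtractLane(Lane, Part0, Part1, ...): Lane indexes the
// concatenation Part0 ++ Part1 ++ ..., where every part holds VF elements.
static int extractLaneSim(std::size_t Lane,
                          const std::vector<std::vector<int>> &Parts) {
  std::size_t VF = Parts.front().size();
  assert(Lane < VF * Parts.size() && "lane out of range");
  return Parts[Lane / VF][Lane % VF];
}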

github-actions bot commented Jul 16, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Contributor

Does the PR handle the IV liveout user as well?

Contributor Author

This patch still leaves inductions untouched for now

@fhahn fhahn force-pushed the lv-tf-external-users branch from 4f1914b to 51369a9 Compare August 28, 2025 15:20
fhahn added a commit that referenced this pull request Oct 2, 2025
Add additional test coverage for tail-folding loops with first-order recurrences and users outside the loop. Test a combination of vectorization factors and interleave counts. Also update check lines in reduction-order.ll and adjust naming for clarity. This adds extra test coverage for #149042.
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Oct 2, 2025
Add additional test coverage for tail-folding loops with first-order recurrences and users outside the loop. Test a combination of vectorization factors and interleave counts. Also update check lines in reduction-order.ll and adjust naming for clarity. This adds extra test coverage for llvm/llvm-project#149042.
@fhahn fhahn force-pushed the lv-tf-external-users branch 3 times, most recently from c270ba0 to 21fa274 Compare October 2, 2025 16:52
Contributor Author

@fhahn fhahn left a comment

ping :)

this should be ready now, with first-order recurrences also handled

@fhahn fhahn changed the title [LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. (WIP) [LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding. Oct 2, 2025
@fhahn fhahn force-pushed the lv-tf-external-users branch from 21fa274 to 230ea48 Compare October 2, 2025 21:02
@fhahn fhahn requested a review from david-arm October 2, 2025 21:06
mahesh-attarde pushed a commit to mahesh-attarde/llvm-project that referenced this pull request Oct 3, 2025
Add additional test coverage for tail-folding loops with first-order recurrences and users outside the loop. Test a combination of vectorization factors and interleave counts. Also update check lines in reduction-order.ll and adjust naming for clarity. This adds extra test coverage for llvm#149042.
@fhahn fhahn force-pushed the lv-tf-external-users branch from 230ea48 to b7b23a9 Compare October 8, 2025 08:04
Contributor Author

@fhahn fhahn left a comment

ping :)

@david-arm
Contributor

The difference between the title of the PR and the commit message is a bit confusing, i.e. sometimes refers to LastActiveLane and sometimes to FirstActiveLane. Perhaps worth making consistent?

  case Instruction::Select: {
-   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+   bool OnlyFirstLaneUsed =
+       vputils::onlyFirstLaneUsed(this) ||
Contributor

Is this change worth submitting separately? Not sure if it has any effect outside this PR.

Contributor Author

I couldn't find any cases independent of the PR, as only now do we have selects with vectorToScalar ops as operands.

-   bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+   bool OnlyFirstLaneUsed =
+       vputils::onlyFirstLaneUsed(this) ||
+       (isa<VPInstruction>(getOperand(1)) &&
Contributor

Why is this restricted to just VPInstruction types? Can isVectorToScalar only be called for VPInstructions?

Contributor Author

Yep, isVectorToScalar is a property of VPInstructions only at the moment
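For context, a sketch of the condition being discussed is shown below. The quoted diff is cut off after the isa<VPInstruction> check, so the trailing isVectorToScalar() call is an assumption inferred from the question and answer above, not a verbatim copy of the patch.

// Sketch (assumes the VPlan headers, e.g. VPlan.h and VPlanUtils.h): a select
// only needs the first lane of its operands when the generic analysis says
// so, or when its operand 1 is a VPInstruction whose result is
// vector-to-scalar (such as the new ExtractLane). The isVectorToScalar()
// call here is inferred from the exchange above, not quoted from the patch.
static bool selectOnlyNeedsFirstLane(const VPInstruction *Sel) {
  VPValue *Op1 = Sel->getOperand(1);
  return vputils::onlyFirstLaneUsed(Sel) ||
         (isa<VPInstruction>(Op1) &&
          cast<VPInstruction>(Op1)->isVectorToScalar());
}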

@lukel97
Contributor

lukel97 commented Nov 14, 2025

@mikaelholmen Thanks for reducing that, I think #167897 should fix it

@lukel97
Contributor

lukel97 commented Nov 14, 2025

@asb it would be great if you could help narrow this down. I cannot reproduce the failures with tail-folding + AVX512, so it might be EVL related.

I reverted the patch for now.

> @lukel97 did you by any chance get performance numbers since the change landed?

Not yet, by sheer bad luck there was a power outage yesterday on cc-perf.igalia.com which killed the boards halfway through running the set of changes that included this PR.

I'll try and measure this PR individually and will hopefully have something to report back on Monday.

lukel97 pushed a commit to lukel97/llvm-project that referenced this pull request Nov 14, 2025
…lvm#149042) Building on top of llvm#148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also llvm#148603 PR: llvm#149042
@asb
Contributor

asb commented Nov 16, 2025

I'm not sure it's overly useful, but here is a reduction of SingleSource/Benchmarks/Stanford/Oscar.c that produces an expected result without this patch, and a different one with it:

void printf(char *, ...); struct complex { float rp, ip } z[256], w[256]; float e_1_1, e_1_0, z_0_1, zr; int Printcomplex_finish, Fft_i, Fft_j, Fft_k, Fft_l, Fft_m, Fft_index, Oscar_i, Oscar_s = 5767; void Uniform11(int *iy, float *yfl) { *iy = 4855 * *iy + 1731 & 8191; *yfl = *iy / 8192.0f; } void Fft(int n, struct complex w[], float sqrinv) { Fft_m = n / 2; Fft_l = 1; do { Fft_k = 0; Fft_j = Fft_l; Fft_i = 1; do { do { w[Fft_i + Fft_k].rp = z[Fft_i].rp + z[Fft_m + Fft_i].rp; w[Fft_i + Fft_j].rp = z[Fft_i].rp - z[Fft_i + Fft_m].rp - e_1_1; w[Fft_j].ip = e_1_0 * z_0_1 * z[Fft_i].rp - z[Fft_m].rp; Fft_i = Fft_i + 1; } while (Fft_i <= Fft_j); Fft_k = Fft_j; Fft_j = Fft_j + Fft_l; } while (Fft_j <= Fft_m); Fft_index = 1; do { z[Fft_index] = w[Fft_index]; Fft_index = Fft_index + 1; } while (Fft_index <= n); Fft_l = Fft_l + Fft_l; } while (Fft_l <= Fft_m); Fft_i = 1; for (; Fft_i <= n; Fft_i++) z[Fft_i].rp = sqrinv * z[Fft_i].rp; } void main() { Oscar_i = 1; Uniform11(&Oscar_s, &zr); z[Oscar_i].rp = 20.0f * zr - 10.0f; for (; Oscar_i <= 20; Oscar_i++) Fft(256, w, 0.0625f); printf("\n" " %15.3f%15.3f", z[1].rp); while (Printcomplex_finish) ; }

The output:

./tc.good/bin/clang --target=riscv64-linux-gnu --sysroot=$HOME/rvsysroot -march=rva23u64 -O3 -o Oscar.good Oscar.i
./tc.bad/bin/clang --target=riscv64-linux-gnu --sysroot=$HOME/rvsysroot -march=rva23u64 -O3 -o Oscar.bad Oscar.i
./Oscar.good
# output follows
-9.365 0.000
./Oscar.bad
# output follows
-1130966548480.000 0.000

Reducing the .ll such that you get the same answer for the 'good' toolchain and an incorrect answer for the 'bad' one:

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux-gnu" %struct.complex = type { float, float } @Oscar_s = global i32 5767 @Fft_m = dso_local global i32 0 @Fft_l = global i32 0 @Fft_k = global i32 0 @Fft_j = global i32 0 @Fft_i = global i32 0 @z = global [256 x %struct.complex] zeroinitializer @e_1_1 = dso_local global float 0.000000e+00 @e_1_0 = global float 0.000000e+00 @Fft_index = global i32 0 @Oscar_i = global i32 0 @w = global [256 x %struct.complex] zeroinitializer @.str = constant [16 x i8] c"\0A %15.3f%15.3f\00" define void @Uniform11(ptr %iy, ptr %yfl) { entry: %0 = load i32, ptr %iy, align 4 %mul = mul i32 %0, 4855 %add = add i32 %mul, 1731 %and = and i32 %add, 8191 %conv = uitofp i32 %and to float %div = fmul float %conv, 0x3F20000000000000 store float %div, ptr %yfl, align 4 ret void } define void @Fft(i32 %n, ptr %w, float %sqrinv) { entry: %div = sdiv i32 %n, 2 store i32 %div, ptr @Fft_m, align 4 store i32 1, ptr @Fft_l, align 4 br label %do.body do.body: ; preds = %do.end45, %entry store i32 0, ptr @Fft_k, align 4 store i32 0, ptr @Fft_j, align 4 store i32 0, ptr @Fft_i, align 4 br label %do.body1 do.body1: ; preds = %do.end, %do.body br label %do.body2 do.body2: ; preds = %do.body2, %do.body1 %0 = load i32, ptr @Fft_i, align 4 %idxprom = sext i32 %0 to i64 %arrayidx = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom %1 = load float, ptr %arrayidx, align 4 %2 = load i32, ptr @Fft_m, align 4, !tbaa !0 %3 = load i32, ptr @Fft_i, align 4 %add = add i32 %2, %3 %idxprom3 = sext i32 %add to i64 %arrayidx4 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom3 %4 = load float, ptr %arrayidx4, align 4 %add6 = fadd float %1, %4 %5 = load i32, ptr @Fft_i, align 4 %6 = load i32, ptr @Fft_k, align 4, !tbaa !0 %add7 = add i32 %5, %6 %idxprom8 = sext i32 %add7 to i64 %arrayidx9 = getelementptr %struct.complex, ptr %w, i64 %idxprom8 store float %add6, ptr %arrayidx9, align 4, !tbaa !4 %7 = load i32, ptr @Fft_i, align 4, !tbaa !0 %idxprom11 = sext i32 %7 to i64 %arrayidx12 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom11 %8 = load float, ptr %arrayidx12, align 4 %9 = load i32, ptr @Fft_i, align 4 %10 = load i32, ptr @Fft_m, align 4, !tbaa !0 %add14 = add i32 %9, %10 %idxprom15 = sext i32 %add14 to i64 %arrayidx16 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom15 %11 = load float, ptr %arrayidx16, align 4 %sub = fsub float %8, %11 %12 = load float, ptr @e_1_1, align 4 %sub18 = fsub float %sub, %12 %13 = load i32, ptr @Fft_i, align 4 %14 = load i32, ptr @Fft_j, align 4, !tbaa !0 %add19 = add i32 %13, %14 %idxprom20 = sext i32 %add19 to i64 %arrayidx21 = getelementptr %struct.complex, ptr %w, i64 %idxprom20 store float %sub18, ptr %arrayidx21, align 4, !tbaa !4 %15 = load i32, ptr @Fft_i, align 4, !tbaa !0 %add32 = add i32 %15, 1 store i32 %add32, ptr @Fft_i, align 4 %16 = load i32, ptr @Fft_i, align 4 %17 = load i32, ptr @Fft_j, align 4, !tbaa !0 %cmp = icmp sle i32 %16, %17 br i1 %cmp, label %do.body2, label %do.end do.end: ; preds = %do.body2 %18 = load i32, ptr @Fft_j, align 4 store i32 %18, ptr @Fft_k, align 4 %19 = load i32, ptr @Fft_j, align 4 %20 = load i32, ptr @Fft_l, align 4 %add33 = add i32 %19, %20 store i32 %add33, ptr @Fft_j, align 4 %21 = load i32, ptr @Fft_j, align 4 %22 = load i32, ptr @Fft_m, align 4, !tbaa !0 %cmp35 = icmp sle i32 %21, %22 br i1 %cmp35, label %do.body1, label %do.end36 do.end36: ; preds = %do.end store i32 0, ptr @Fft_index, align 
4 br label %do.body37 do.body37: ; preds = %do.body37, %do.end36 %23 = load i32, ptr @Fft_index, align 4 %idxprom38 = sext i32 %23 to i64 %arrayidx39 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom38 %24 = load i32, ptr @Fft_index, align 4 %idxprom40 = sext i32 %24 to i64 %arrayidx41 = getelementptr %struct.complex, ptr %w, i64 %idxprom40 call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx39, ptr %arrayidx41, i64 8, i1 false) %25 = load i32, ptr @Fft_index, align 4 %add42 = add i32 %25, 1 store i32 %add42, ptr @Fft_index, align 4 %26 = load i32, ptr @Fft_index, align 4 %cmp44 = icmp sle i32 %26, %n br i1 %cmp44, label %do.body37, label %do.end45 do.end45: ; preds = %do.body37 %27 = load i32, ptr @Fft_l, align 4 %28 = load i32, ptr @Fft_l, align 4 %add46 = add i32 %27, %28 store i32 %add46, ptr @Fft_l, align 4 %29 = load i32, ptr @Fft_l, align 4 %cmp48 = icmp sle i32 %29, %n br i1 %cmp48, label %do.body, label %for.cond for.cond: ; preds = %for.body, %do.end45 %30 = load i32, ptr @Fft_i, align 4 %cmp50 = icmp sle i32 %30, %n br i1 %cmp50, label %for.body, label %for.end for.body: ; preds = %for.cond %31 = load i32, ptr @Fft_i, align 4 %idxprom51 = sext i32 %31 to i64 %arrayidx52 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom51 %32 = load float, ptr %arrayidx52, align 4 %mul54 = fmul float %sqrinv, %32 %33 = load i32, ptr @Fft_i, align 4 %idxprom55 = sext i32 %33 to i64 %arrayidx56 = getelementptr [256 x %struct.complex], ptr @z, i64 0, i64 %idxprom55 store float %mul54, ptr %arrayidx56, align 4 %34 = load i32, ptr @Fft_i, align 4 %inc = add i32 %34, 1 store i32 %inc, ptr @Fft_i, align 4 br label %for.cond for.end: ; preds = %for.cond ret void } ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.fmuladd.f32(float, float, float) #0 ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly captures(none), ptr noalias readonly captures(none), i64, i1 immarg) #1 define i32 @main(i64 %idxprom, ptr %zr) { entry: store i32 1, ptr @Oscar_i, align 4 call void @Uniform11(ptr @Oscar_s, ptr %zr) %0 = load float, ptr %zr, align 4 %1 = call float @llvm.fmuladd.f32(float %0, float 2.000000e+01, float -1.000000e+01) %arrayidx = getelementptr %struct.complex, ptr @z, i64 %idxprom store float %1, ptr %arrayidx, align 4 br label %for.cond for.cond: ; preds = %for.body, %entry %2 = phi i32 [ %inc, %for.body ], [ 0, %entry ] %cmp = icmp slt i32 %2, 21 br i1 %cmp, label %for.body, label %for.end for.body: ; preds = %for.cond call void @Fft(i32 256, ptr @w, float 6.250000e-02) %3 = load i32, ptr @Oscar_i, align 4 %inc = add i32 %3, 1 store i32 %inc, ptr @Oscar_i, align 4 br label %for.cond for.end: ; preds = %for.cond %4 = load float, ptr getelementptr inbounds nuw (i8, ptr @z, i64 8), align 4 %conv = fpext float %4 to double call void (ptr, ...) @printf(ptr @.str, double %conv) ret i32 0 } declare void @printf(ptr, ...) attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } !0 = !{!1, !1, i64 0} !1 = !{!"int", !2, i64 0} !2 = !{!"omnipotent char", !3, i64 0} !3 = !{!"Simple C/C++ TBAA"} !4 = !{!5, !6, i64 0} !5 = !{!"complex", !6, i64 0, !6, i64 4} !6 = !{!"float", !2, i64 0} 

The above .ll can be compiled with:

./tc.good/bin/clang \
  --target=riscv64-linux-gnu \
  --sysroot=$HOME/rvsysroot \
  -march=rva23u64 \
  -O3 \
  reduced.ll \
  -o Oscar.good
./tc.bad/bin/clang \
  --target=riscv64-linux-gnu \
  --sysroot=$HOME/rvsysroot \
  -march=rva23u64 \
  -O3 \
  reduced.ll \
  -o Oscar.bad
./Oscar.good
# output follows
-9.365 0.000
./Oscar.bad
# output follows
-70923976704.000 0.000
Comment on lines +249 to +254
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i64> [[BROADCAST_SPLAT]], i64 [[TMP8]]
Contributor

@lukel97 lukel97 Nov 17, 2025

I think I've narrowed down the cause of the RISC-V buildbot failures: LastActiveLane will return the wrong lane whenever the header mask is full, i.e. when there is no tail. E.g. for VF=8:

firstActiveLane(not(header-mask)) - 1
  = cttz.elts(not(header-mask), isZeroPoison=true) - 1
  = cttz.elts(00000000, isZeroPoison=true) - 1
  = poison - 1

Setting isZeroPoison=false should fix this, but I also tried just lowering it as an add reduction and confirmed it fixes it:

+  case VPInstruction::LastActiveLane: {
+    Value *Mask = State.get(getOperand(0));
+    Value *ZExt =
+        Builder.CreateZExt(Mask, toVectorTy(Builder.getInt64Ty(), State.VF));
+    return Builder.CreateSub(Builder.CreateAddReduce(ZExt),
+                             ConstantInt::get(Builder.getInt64Ty(), 1));
+  }
Contributor Author

Ah right, I guess the reason this worked fine with AVX512 is related to the lowering.

lukel97 added a commit that referenced this pull request Nov 18, 2025
#168392) Whenever #149042 is relanded we will soon start EVL tail folding vectorized loops that have live-outs, e.g.: ```c int f(int *x, int n) { for (int i = 0; i < n; i++) { int y = x[i] + 1; x[y] = y; } return y; } ``` These are vectorized by extracting the last "active lane" in the loop's exit: ```llvm loop: %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) ... exit: %lastidx = sub i64 %vl, 1 %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx ``` Which in RISC-V translates to a vslidedown.vx with a VL of 1: ```llvm bb.loop: %vl:gprnox0 = PseudoVSETVLI ... %y:vr = PseudoVADD_VI_M1 $noreg, %x, 1, AVL=-1 ... bb.exit: %lastidx:gprnox0 = ADDI %vl, -1 %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %y, %lastidx, AVL=1 ``` However today we will fail to reduce the VL of %y in the loop and will end up with two extra VL toggles. The reason being that today RISCVVLOptimizer is conservative with vslidedown.vx as it can read the lanes of %y past its own VL. So in `getMinimumVLForUser` we say that vslidedown.vx demands the entirety of %y. One observation with the sequence above is that it only actually needs to read the first %vl lanes of %y, because the last lane of vs2 used is offset + 1. In this case, that's `%lastidx + 1 = %vl - 1 + 1 = %vl`. This PR teaches RISCVVLOptimizer about this case in `getMinimumVLForVSLIDEDOWN_VX`, and in doing so removes the VL toggles. The one case that I had to think about for a bit was whenever `ADDI %vl, -1` wraps, i.e. when %vl=0 and the resulting offset is all ones. This should always be larger than the largest VLMAX, so vs2 will be completely slid down and absent from the output. So we don't need to read anything from vs2. This patch on its own has no observable effect on llvm-test-suite or SPEC CPU 2017 w/ rva23u64 today.
lukel97 added a commit to lukel97/llvm-project that referenced this pull request Nov 19, 2025
lukel97 pushed a commit to lukel97/llvm-project that referenced this pull request Nov 19, 2025
fhahn added a commit to fhahn/llvm-project that referenced this pull request Nov 20, 2025
fhahn added a commit to fhahn/llvm-project that referenced this pull request Nov 20, 2025
If the expected trip count is less than the VF, the vector loop will only execute a single iteration. When that's the case, the cost of the middle block has the same impact as the cost of the vector loop. Include it in isOutsideLoopWorkProfitable to avoid vectorizing when the extra work in the middle block makes it unprofitable. Note that isOutsideLoopWorkProfitable already scales the cost of blocks outside the vector region, but the patch restricts accounting for the middle block to cases where VF <= ExpectedTC, to initially catch some worst cases and avoid regressions. This initial version should specifically avoid unprofitable tail-folding for loops with low trip counts after re-applying llvm#149042.
fhahn added a commit that referenced this pull request Nov 24, 2025
If the expected trip count is less than the VF, the vector loop will only execute a single iteration. When that's the case, the cost of the middle block has the same impact as the cost of the vector loop. Include it in isOutsideLoopWorkProfitable to avoid vectorizing when the extra work in the middle block makes it unprofitable. Note that isOutsideLoopWorkProfitable already scales the cost of blocks outside the vector region, but the patch restricts accounting for the middle block to cases where VF <= ExpectedTC, to initially catch some worst cases and avoid regressions. This initial version should specifically avoid unprofitable tail-folding for loops with low trip counts after re-applying #149042. PR: #168949
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Nov 24, 2025
If the expected trip count is less than the VF, the vector loop will only execute a single iteration. When that's the case, the cost of the middle block has the same impact as the cost of the vector loop. Include it in isOutsideLoopWorkProfitable to avoid vectorizing when the extra work in the middle block makes it unprofitable. Note that isOutsideLoopWorkProfitable already scales the cost of blocks outside the vector region, but the patch restricts accounting for the middle block to cases where VF <= ExpectedTC, to initially catch some worst cases and avoid regressions. This initial version should specifically avoid unprofitable tail-folding for loops with low trip counts after re-applying llvm/llvm-project#149042. PR: llvm/llvm-project#168949
aadeshps-mcw pushed a commit to aadeshps-mcw/llvm-project that referenced this pull request Nov 26, 2025
llvm#168392) Whenever llvm#149042 is relanded we will soon start EVL tail folding vectorized loops that have live-outs, e.g.: ```c int f(int *x, int n) { for (int i = 0; i < n; i++) { int y = x[i] + 1; x[y] = y; } return y; } ``` These are vectorized by extracting the last "active lane" in the loop's exit: ```llvm loop: %vl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) ... exit: %lastidx = sub i64 %vl, 1 %lastelt = extractelement <vscale x 4 x i32> %y, i64 %lastidx ``` Which in RISC-V translates to a vslidedown.vx with a VL of 1: ```llvm bb.loop: %vl:gprnox0 = PseudoVSETVLI ... %y:vr = PseudoVADD_VI_M1 $noreg, %x, 1, AVL=-1 ... bb.exit: %lastidx:gprnox0 = ADDI %vl, -1 %w:vr = PseudoVSLIDEDOWN_VX_M1 $noreg, %y, %lastidx, AVL=1 ``` However today we will fail to reduce the VL of %y in the loop and will end up with two extra VL toggles. The reason being that today RISCVVLOptimizer is conservative with vslidedown.vx as it can read the lanes of %y past its own VL. So in `getMinimumVLForUser` we say that vslidedown.vx demands the entirety of %y. One observation with the sequence above is that it only actually needs to read the first %vl lanes of %y, because the last lane of vs2 used is offset + 1. In this case, that's `%lastidx + 1 = %vl - 1 + 1 = %vl`. This PR teaches RISCVVLOptimizer about this case in `getMinimumVLForVSLIDEDOWN_VX`, and in doing so removes the VL toggles. The one case that I had to think about for a bit was whenever `ADDI %vl, -1` wraps, i.e. when %vl=0 and the resulting offset is all ones. This should always be larger than the largest VLMAX, so vs2 will be completely slid down and absent from the output. So we don't need to read anything from vs2. This patch on its own has no observable effect on llvm-test-suite or SPEC CPU 2017 w/ rva23u64 today.
aadeshps-mcw pushed a commit to aadeshps-mcw/llvm-project that referenced this pull request Nov 26, 2025
If the expected trip count is less than the VF, the vector loop will only execute a single iteration. When that's the case, the cost of the middle block has the same impact as the cost of the vector loop. Include it in isOutsideLoopWorkProfitable to avoid vectorizing when the extra work in the middle block makes it unprofitable. Note that isOutsideLoopWorkProfitable already scales the cost of blocks outside the vector region, but the patch restricts accounting for the middle block to cases where VF <= ExpectedTC, to initially catch some worst cases and avoid regressions. This initial version should specifically avoid unprofitable tail-folding for loops with low trip counts after re-applying llvm#149042. PR: llvm#168949
fhahn added a commit to fhahn/llvm-project that referenced this pull request Nov 26, 2025
fhahn added a commit that referenced this pull request Nov 26, 2025
…folding. (#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * #169298 * #167897 * #168949 Original message: Building on top of #148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also #148603 PR: #149042
fhahn added a commit that referenced this pull request Nov 26, 2025
…en tail-folding. (#149042)"" This reverts commit 72e51d3. Missed some test updates.
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Nov 26, 2025
… when tail-folding. (#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * llvm/llvm-project#169298 * llvm/llvm-project#167897 * llvm/llvm-project#168949 Original message: Building on top of llvm/llvm-project#148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also llvm/llvm-project#148603 PR: llvm/llvm-project#149042
fhahn added a commit that referenced this pull request Nov 26, 2025
…folding. (#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * #169298 * #167897 * #168949 Original message: Building on top of #148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also #148603 PR: #149042
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Nov 26, 2025
… when tail-folding. (#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * llvm/llvm-project#169298 * llvm/llvm-project#167897 * llvm/llvm-project#168949 Original message: Building on top of llvm/llvm-project#148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also llvm/llvm-project#148603 PR: llvm/llvm-project#149042
tanji-dg pushed a commit to tanji-dg/llvm-project that referenced this pull request Nov 27, 2025
…folding. (llvm#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * llvm#169298 * llvm#167897 * llvm#168949 Original message: Building on top of llvm#148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also llvm#148603 PR: llvm#149042
tanji-dg pushed a commit to tanji-dg/llvm-project that referenced this pull request Nov 27, 2025
…en tail-folding. (llvm#149042)"" This reverts commit 72e51d3. Missed some test updates.
tanji-dg pushed a commit to tanji-dg/llvm-project that referenced this pull request Nov 27, 2025
…folding. (llvm#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * llvm#169298 * llvm#167897 * llvm#168949 Original message: Building on top of llvm#148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also llvm#148603 PR: llvm#149042
GeneraluseAI pushed a commit to GeneraluseAI/llvm-project that referenced this pull request Nov 27, 2025
…folding. (llvm#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * llvm#169298 * llvm#167897 * llvm#168949 Original message: Building on top of llvm#148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also llvm#148603 PR: llvm#149042
GeneraluseAI pushed a commit to GeneraluseAI/llvm-project that referenced this pull request Nov 27, 2025
…en tail-folding. (llvm#149042)"" This reverts commit 72e51d3. Missed some test updates.
GeneraluseAI pushed a commit to GeneraluseAI/llvm-project that referenced this pull request Nov 27, 2025
…folding. (llvm#149042)" This reverts commit a6edeed. The following fixes have landed, addressing issues causing the original revert: * llvm#169298 * llvm#167897 * llvm#168949 Original message: Building on top of llvm#148817, introduce a new abstract LastActiveLane opcode that gets lowered to Not(Mask) → FirstActiveLane(NotMask) → Sub(result, 1). When folding the tail, update all extracts for uses outside the loop the extract the value of the last actice lane. See also llvm#148603 PR: llvm#149042