[VPlan] Explicitly unroll replicate-regions without live-outs by VF. #170212
Extend replicateByVF to also handle VPScalarIVStepsRecipe. To do so, the patch adds a new lane operand to VPScalarIVStepsRecipe, which is only added when replicating. This enables removing a number of lane-0 computations. The lane operand will also be used to explicitly unroll replicate regions in a follow-up.
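For illustration, a minimal sketch (not part of the patch; the helper name is hypothetical) of how the optional lane operand could be decoded, using only accessors that appear in the diff below:

// Operand layout of VPScalarIVStepsRecipe after replicating by VF:
//   0: BaseIV, 1: Step, 2: VF, 3: Lane (optional), 4: UnrollPart (optional).
// Hypothetical helper mirroring the decode in VPScalarIVStepsRecipe::execute.
static unsigned getReplicatedLane(const VPScalarIVStepsRecipe *Steps) {
  assert(Steps->getNumOperands() == 5 && "lane operand only added when replicating");
  VPValue *LaneOp = Steps->getOperand(3);
  assert(LaneOp->isLiveIn() && "lane operand must be a live-in constant");
  return cast<ConstantInt>(LaneOp->getLiveInIRValue())->getZExtValue();
}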
This patch adds a new unrollReplicateRegions transform to unroll replicate regions by VF, dissolving them. The transform creates VF copies of each replicate region's contents, connects them, and converts recipes to single-scalar variants for the corresponding lanes. The initial version skips regions with live-outs (VPPredInstPHIRecipe); support for those will be added in follow-up patches. Depends on llvm#170053
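As intuition for the resulting control flow, a hypothetical sketch (VF=2, all names invented; compare the pred.store.if/pred.store.continue blocks in the test diffs below) of what the unrolled region lowers to for a predicated store: one copy of the region per lane, each guarded by that lane's mask bit and chained onto the previous lane's continue block:

  %m0 = extractelement <2 x i1> %mask, i32 0
  br i1 %m0, label %pred.store.if, label %pred.store.continue
pred.store.if:                                      ; lane 0
  store i8 %v0, ptr %p0, align 1
  br label %pred.store.continue
pred.store.continue:
  %m1 = extractelement <2 x i1> %mask, i32 1
  br i1 %m1, label %pred.store.if1, label %pred.store.continue2
pred.store.if1:                                     ; lane 1
  store i8 %v1, ptr %p1, align 1
  br label %pred.store.continue2
pred.store.continue2:
  ; rest of the vector body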
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v

Author: Florian Hahn (fhahn)

Changes: as described above.

Patch is 852.65 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/170212.diff

117 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4a89f7dd8672e..0a8209ec3d9bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7306,6 +7306,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
                                  BestVPlan);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
   VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
+  VPlanTransforms::runPass(VPlanTransforms::unrollReplicateRegions, BestVPlan,
+                           BestVF);
+  VPlanTransforms::runPass(VPlanTransforms::mergeBlocksIntoPredecessors,
+                           BestVPlan);
   bool HasBranchWeights =
       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
   if (HasBranchWeights) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6ca750fc53279..11f46d11087f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3788,7 +3788,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
 /// A recipe for handling phi nodes of integer and floating-point inductions,
 /// producing their scalar values.
 class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
-                                                public VPUnrollPartAccessor<3> {
+                                                public VPUnrollPartAccessor<4> {
   Instruction::BinaryOps InductionOpcode;

 public:
@@ -3812,10 +3812,14 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
   ~VPScalarIVStepsRecipe() override = default;

   VPScalarIVStepsRecipe *clone() override {
-    return new VPScalarIVStepsRecipe(
+    auto *NewR = new VPScalarIVStepsRecipe(
         getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
         hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
         getDebugLoc());
+    // Add lane/unroll-part operands, if present.
+    for (VPValue *Op : drop_begin(operands(), 3))
+      NewR->addOperand(Op);
+    return NewR;
   }

   /// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6491a2ce6813b..68a8c0abf2682 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2368,7 +2368,16 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
   if (State.Lane) {
     StartLane = State.Lane->getKnownLane();
     EndLane = StartLane + 1;
+  } else if (getNumOperands() == 5) {
+    // Operand 3 is the Lane operand (when present after replicating by VF).
+    VPValue *Op3 = getOperand(3);
+    assert(Op3->isLiveIn() && "lane operand must be a live-in");
+    auto *C = cast<ConstantInt>(Op3->getLiveInIRValue());
+    unsigned Val = C->getZExtValue();
+    StartLane = Val;
+    EndLane = Val + 1;
   }
+
   Value *StartIdx0;
   if (getUnrollPart(*this) == 0)
     StartIdx0 = ConstantInt::get(IntStepTy, 0);
@@ -2395,7 +2404,10 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
            "scalable");
     auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
     auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
-    State.set(this, Add, VPLane(Lane));
+    if (State.Lane)
+      State.set(this, Add, VPLane(Lane));
+    else
+      State.set(this, Add, VPLane(0));
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 827dd4b6439ae..7e750a3c13afa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -481,7 +481,7 @@ static void addReplicateRegions(VPlan &Plan) {

 /// Remove redundant VPBasicBlocks by merging them into their predecessor if
 /// the predecessor has a single successor.
-static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
+bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
   SmallVector<VPBasicBlock *> WorkList;
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getEntry()))) {
@@ -1440,9 +1440,14 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
   }

   // VPScalarIVSteps for part 0 can be replaced by their start value, if only
-  // the first lane is demanded.
+  // the first lane is demanded and both Lane and UnrollPart operands are 0.
   if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
-    if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
+    bool LaneIsZero = Steps->getNumOperands() >= 4 &&
+                      match(Steps->getOperand(3), m_ZeroInt());
+    bool PartIsZero =
+        Steps->getNumOperands() < 5 || match(Steps->getOperand(4), m_ZeroInt());
+    if (Steps->isPart0() && LaneIsZero && PartIsZero &&
+        vputils::onlyFirstLaneUsed(Steps)) {
       Steps->replaceAllUsesWith(Steps->getOperand(0));
       return;
     }
@@ -4314,9 +4319,9 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
   for (VPBasicBlock *VPBB :
        concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (!isa<VPReplicateRecipe, VPInstruction>(&R))
+      if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(&R))
         continue;
-      auto *DefR = cast<VPRecipeWithIRFlags>(&R);
+      auto *DefR = cast<VPSingleDefRecipe>(&R);
       auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
         VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
         return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ae3797dee1f07..d9c4b1d96d6ea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -178,6 +178,10 @@ struct VPlanTransforms {
   /// replicate regions, thereby dissolving the latter.
   static void replicateByVF(VPlan &Plan, ElementCount VF);

+  /// Replace replicate regions by explicitly replicating the regions' contents
+  /// \p VF times, each copy processing a single lane.
+  static void unrollReplicateRegions(VPlan &Plan, ElementCount VF);
+
   /// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
   /// resulting plan to \p BestVF and \p BestUF.
   static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -189,6 +193,8 @@ struct VPlanTransforms {
   /// block merging.
   LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

+  static bool mergeBlocksIntoPredecessors(VPlan &Plan);
+
   /// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
   /// region block and remove the mask operand. Optimize the created regions by
   /// iteratively sinking scalar operands into the region, followed by merging
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index f215476b1e163..6673eaf9b67ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -24,6 +24,9 @@
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "vplan"

 using namespace llvm;
 using namespace llvm::VPlanPatternMatch;
@@ -121,6 +124,7 @@ class UnrollState {
       R->setOperand(OpIdx, getValueForPart(Op, Part));
   }
 };
+
 } // namespace

 void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
@@ -137,6 +141,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
     for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
       remapOperands(&PartIR, Part);
       if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
+        ScalarIVSteps->addOperand(getConstantInt(0));
         ScalarIVSteps->addOperand(getConstantInt(Part));
       }

@@ -526,9 +531,21 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
                                /*IsSingleScalar=*/true, /*Mask=*/nullptr,
                                *RepR, *RepR, RepR->getDebugLoc());
   } else {
-    assert(isa<VPInstruction>(DefR) &&
+    assert((isa<VPInstruction, VPScalarIVStepsRecipe>(DefR)) &&
            "DefR must be a VPReplicateRecipe or VPInstruction");
     New = DefR->clone();
+    if (isa<VPScalarIVStepsRecipe>(New)) {
+      // Add or update lane operand for VPScalarIVStepsRecipe.
+      if (NewOps.size() == 3) {
+        NewOps.push_back(Plan.getConstantInt(IdxTy, 0));
+        New->addOperand(NewOps.back());
+      }
+      NewOps.push_back(Plan.getConstantInt(IdxTy, Lane.getKnownLane()));
+      New->addOperand(NewOps.back());
+      if (NewOps.size() == 5)
+        std::swap(NewOps[3], NewOps[4]);
+    }
+
     for (const auto &[Idx, Op] : enumerate(NewOps)) {
       New->setOperand(Idx, Op);
     }
@@ -558,7 +575,7 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
   SmallVector<VPRecipeBase *> ToRemove;
   for (VPBasicBlock *VPBB : VPBBsToUnroll) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
+      if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(&R) ||
           (isa<VPReplicateRecipe>(&R) &&
            cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
           (isa<VPInstruction>(&R) &&
@@ -566,6 +583,19 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
            cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
         continue;

+      if (isa<VPScalarIVStepsRecipe>(&R) && Plan.hasScalarVFOnly()) {
+        // Add lane operand to VPScalarIVStepsRecipe only when the plan is
+        // scalar.
+        if (R.getNumOperands() == 4) {
+          R.addOperand(R.getOperand(3));
+          R.setOperand(3, Plan.getConstantInt(IdxTy, 0));
+        } else {
+          R.addOperand(Plan.getConstantInt(IdxTy, 0));
+          R.addOperand(Plan.getConstantInt(IdxTy, 0));
+        }
+        continue;
+      }
+
       auto *DefR = cast<VPSingleDefRecipe>(&R);
       VPBuilder Builder(DefR);
       if (DefR->getNumUsers() == 0) {
@@ -608,3 +638,177 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
   for (auto *R : reverse(ToRemove))
     R->eraseFromParent();
 }
+
+/// Process recipes in a single lane's blocks, updating them for lane-specific
+/// operations.
+static void processLane(VPlan &Plan, Type *IdxTy, unsigned Lane,
+                        ElementCount VF, ArrayRef<VPBlockBase *> RegionBlocks,
+                        DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewBlocks) {
+  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
+  for (VPBlockBase *OldVPB : RegionBlocks) {
+    auto *OldBB = cast<VPBasicBlock>(OldVPB);
+    auto *NewBB = cast<VPBasicBlock>(Old2NewBlocks.lookup(OldVPB));
+    for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
+      for (const auto &[OldV, NewV] :
+           zip(OldR.definedValues(), NewR.definedValues()))
+        Old2NewVPValues[OldV] = NewV;
+    }
+
+    // Update lane operands and remap operands to use copies for current lane.
+    for (VPRecipeBase &NewR : make_early_inc_range(*NewBB)) {
+      if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR))
+        Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane));
+      else if (match(&NewR, m_ExtractElement(m_VPValue(), m_ZeroInt())))
+        NewR.setOperand(1, Plan.getConstantInt(IdxTy, Lane));
+
+      // Remap operands to use lane-specific values.
+      for (const auto &[I, Op] : enumerate(NewR.operands())) {
+        // Use cloned value if operand was defined in the region.
+        if (auto *New = Old2NewVPValues.lookup(Op))
+          NewR.setOperand(I, New);
+      }
+    }
+  }
+}
+
+/// Process a single lane: clone blocks (or reuse original for lane 0), collect
+/// value mappings, and process recipes for lane-specific operations.
+static void processSingleLane(
+    VPlan &Plan, Type *IdxTy, unsigned Lane, ElementCount VF,
+    ArrayRef<VPBlockBase *> RegionBlocks, VPBlockBase *Entry,
+    VPBlockBase *Exiting,
+    SmallVectorImpl<std::pair<VPBlockBase *, VPBlockBase *>> &LaneClones) {
+  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewBlocks;
+  if (Lane == 0) {
+    // Lane 0 uses the original blocks, and the recipes are adjusted:
+    // VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
+    // converted into BranchOnCond and extracts are created as needed.
+    for (VPBlockBase *VPB : RegionBlocks) {
+      Old2NewBlocks[VPB] = VPB;
+
+      for (VPRecipeBase &NewR :
+           make_early_inc_range(*cast<VPBasicBlock>(VPB))) {
+        VPBuilder Builder(&NewR);
+        for (const auto &[I, Op] : enumerate(NewR.operands())) {
+          // Skip operands that don't need extraction: scalar VF (no vectors),
+          // values defined in the same block (already scalar), or values that
+          // are already single scalars.
+          if (VF.isScalar() ||
+              (Op->getDefiningRecipe() &&
+               Op->getDefiningRecipe()->getParent() == VPB) ||
+              vputils::isSingleScalar(Op))
+            continue;
+
+          // Extract the lane from values defined outside the region.
+          VPValue *Idx = Plan.getConstantInt(IdxTy, Lane);
+          VPValue *Extract = Builder.createNaryOp(
+              Instruction::ExtractElement, {Op, Idx}, NewR.getDebugLoc());
+          NewR.setOperand(I, Extract);
+        }
+
+        if (auto *RepR = dyn_cast<VPReplicateRecipe>(&NewR)) {
+          auto *New = new VPReplicateRecipe(
+              RepR->getUnderlyingInstr(), RepR->operands(),
+              /* IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR,
+              RepR->getDebugLoc());
+          New->insertBefore(RepR);
+          RepR->replaceAllUsesWith(New);
+          RepR->eraseFromParent();
+        } else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(&NewR)) {
+          Builder.createNaryOp(VPInstruction::BranchOnCond,
+                               {BranchOnMask->getOperand(0)},
+                               BranchOnMask->getDebugLoc());
+          BranchOnMask->eraseFromParent();
+        } else if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) {
+          // Add lane operand (4th operand) for VPScalarIVStepsRecipe if not
+          // already present.
+          unsigned NumOps = Steps->getNumOperands();
+          if (NumOps == 4) {
+            // Has UnrollPart at position 3, need to insert Lane before it.
+            VPValue *UnrollPart = Steps->getOperand(3);
+            Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane));
+            Steps->addOperand(UnrollPart);
+          } else if (NumOps == 3) {
+            // Just BaseIV, Step, VF - add Lane.
+            Steps->addOperand(Plan.getConstantInt(IdxTy, Lane));
+            Steps->addOperand(Plan.getConstantInt(IdxTy, 0));
+          }
+        }
+      }
+    }
+  } else {
+    // Clone blocks and connect them according to original structure.
+    for (VPBlockBase *OrigBlock : RegionBlocks) {
+      VPBlockBase *ClonedBlock = OrigBlock->clone();
+      Old2NewBlocks[OrigBlock] = ClonedBlock;
+      ClonedBlock->setParent(Entry->getParent());
+    }
+    for (VPBlockBase *OrigBlock : RegionBlocks) {
+      if (OrigBlock == Exiting)
+        continue;
+      for (VPBlockBase *OrigSucc : OrigBlock->successors())
+        VPBlockUtils::connectBlocks(Old2NewBlocks[OrigBlock],
+                                    Old2NewBlocks[OrigSucc]);
+    }
+  }
+
+  processLane(Plan, IdxTy, Lane, VF, RegionBlocks, Old2NewBlocks);
+  LaneClones.push_back({Old2NewBlocks[Entry], Old2NewBlocks[Exiting]});
+}
+
+void VPlanTransforms::unrollReplicateRegions(VPlan &Plan, ElementCount VF) {
+  // Collect all replicate regions in the plan before modifying the CFG.
+  SmallVector<VPRegionBlock *> ReplicateRegions;
+  for (VPBlockBase *Block :
+       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      if (Region->isReplicator())
+        ReplicateRegions.push_back(Region);
+    }
+  }
+
+  Type *IdxTy = IntegerType::get(Plan.getContext(), 32);
+
+  for (VPRegionBlock *Region : ReplicateRegions) {
+    assert(!VF.isScalable() && "cannot replicate across scalable VFs");
+
+    VPBlockBase *Entry = Region->getEntry();
+    VPBlockBase *Exiting = Region->getExiting();
+
+    // Skip regions with live-outs as packing scalar results back into vectors
+    // is not yet implemented.
+    if (any_of(*cast<VPBasicBlock>(Exiting), IsaPred<VPPredInstPHIRecipe>))
+      continue;
+
+    // Get region context before dissolving.
+    VPBlockBase *Pred = Region->getSinglePredecessor();
+    assert(Pred && "Replicate region must have a single predecessor");
+    SmallVector<VPBlockBase *> Successors(Region->successors());
+
+    // Disconnect and dissolve the region.
+    VPBlockUtils::disconnectBlocks(Pred, Region);
+    for (VPBlockBase *Succ : Successors)
+      VPBlockUtils::disconnectBlocks(Region, Succ);
+
+    SmallVector<VPBlockBase *> RegionBlocks(vp_depth_first_shallow(Entry));
+    VPRegionBlock *ParentRegion = Region->getParent();
+    for (VPBlockBase *Block : RegionBlocks)
+      Block->setParent(ParentRegion);
+    VPBlockUtils::connectBlocks(Pred, Entry);
+
+    // Process each lane: clone blocks, collect value mappings, and process
+    // recipes for lane-specific operations.
+    SmallVector<std::pair<VPBlockBase *, VPBlockBase *>> LaneClones;
+    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) {
+      processSingleLane(Plan, IdxTy, Lane, VF, RegionBlocks, Entry, Exiting,
+                        LaneClones);
+    }
+
+    // Connect lanes sequentially and connect last lane to successors.
+    for (unsigned Lane = 1; Lane < VF.getKnownMinValue(); ++Lane)
+      VPBlockUtils::connectBlocks(LaneClones[Lane - 1].second,
+                                  LaneClones[Lane].first);
+    for (VPBlockBase *Succ : Successors)
+      VPBlockUtils::connectBlocks(LaneClones.back().second, Succ);
+  }
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index 8ab5723a52a11..1c2686b67331a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -199,8 +199,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
-; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[IV]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP72]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 0
 ; CHECK-NEXT:    store i8 [[TMP10]], ptr [[TMP9]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index b549a06f08f8c..7feb83e7d9cba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -290,13 +290,12 @@ define void @latch_branch_cost(ptr %dst) {
 ; PRED:       [[VECTOR_PH]]:
 ; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; PRED:       [[VECTOR_BODY]]:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; PRED-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
 ; PRED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
 ; P... [truncated]
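A worked consequence visible in the test updates above: once the Lane and UnrollPart operands are both known to be 0, the part-0 simplification in simplifyRecipe folds the scalar IV step to its start value, so a lane-0 address computation like the following (names illustrative; the corresponding change is in blend-costs.ll above)

  %t = add i32 %iv, 0
  %p = getelementptr inbounds i8, ptr %dst, i32 %t

collapses to

  %p = getelementptr inbounds i8, ptr %dst, i32 %iv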