Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7306,6 +7306,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
VPlanTransforms::runPass(VPlanTransforms::unrollReplicateRegions, BestVPlan,
BestVF);
VPlanTransforms::runPass(VPlanTransforms::mergeBlocksIntoPredecessors,
BestVPlan);
bool HasBranchWeights =
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
if (HasBranchWeights) {
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -3788,7 +3788,7 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their scalar values.
class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<3> {
public VPUnrollPartAccessor<4> {
Instruction::BinaryOps InductionOpcode;

public:
Expand All @@ -3812,10 +3812,14 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
~VPScalarIVStepsRecipe() override = default;

VPScalarIVStepsRecipe *clone() override {
  // Rebuild the recipe from its three required operands (IV, Step, VF) and
  // the recorded flags/debug location.
  auto *Copy = new VPScalarIVStepsRecipe(
      getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
      hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
      getDebugLoc());
  // Carry over any trailing lane/unroll-part operands beyond the first three,
  // when present.
  for (unsigned Idx = 3, NumOps = getNumOperands(); Idx != NumOps; ++Idx)
    Copy->addOperand(getOperand(Idx));
  return Copy;
}

/// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that
Expand Down
14 changes: 13 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2368,7 +2368,16 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
if (State.Lane) {
StartLane = State.Lane->getKnownLane();
EndLane = StartLane + 1;
} else if (getNumOperands() == 5) {
// Operand 3 is the Lane operand (when present after replicating by VF).
VPValue *Op3 = getOperand(3);
assert(Op3->isLiveIn() && "lane operand must be a live-in");
auto *C = cast<ConstantInt>(Op3->getLiveInIRValue());
unsigned Val = C->getZExtValue();
StartLane = Val;
EndLane = Val + 1;
}

Value *StartIdx0;
if (getUnrollPart(*this) == 0)
StartIdx0 = ConstantInt::get(IntStepTy, 0);
Expand All @@ -2395,7 +2404,10 @@ void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
"scalable");
auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
State.set(this, Add, VPLane(Lane));
if (State.Lane)
State.set(this, Add, VPLane(Lane));
else
State.set(this, Add, VPLane(0));
}
}

Expand Down
15 changes: 10 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ static void addReplicateRegions(VPlan &Plan) {

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
SmallVector<VPBasicBlock *> WorkList;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
Expand Down Expand Up @@ -1440,9 +1440,14 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
}

// VPScalarIVSteps for part 0 can be replaced by their start value, if only
// the first lane is demanded.
// the first lane is demanded and both Lane and UnrollPart operands are 0.
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
bool LaneIsZero = Steps->getNumOperands() >= 4 &&
match(Steps->getOperand(3), m_ZeroInt());
bool PartIsZero =
Steps->getNumOperands() < 5 || match(Steps->getOperand(4), m_ZeroInt());
if (Steps->isPart0() && LaneIsZero && PartIsZero &&
vputils::onlyFirstLaneUsed(Steps)) {
Steps->replaceAllUsesWith(Steps->getOperand(0));
return;
}
Expand Down Expand Up @@ -4314,9 +4319,9 @@ void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
for (VPBasicBlock *VPBB :
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (!isa<VPReplicateRecipe, VPInstruction>(&R))
if (!isa<VPScalarIVStepsRecipe, VPReplicateRecipe, VPInstruction>(&R))
continue;
auto *DefR = cast<VPRecipeWithIRFlags>(&R);
auto *DefR = cast<VPSingleDefRecipe>(&R);
auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@ struct VPlanTransforms {
/// replicate regions, thereby dissolving the latter.
static void replicateByVF(VPlan &Plan, ElementCount VF);

/// Replace replicate regions by explicitly replicating the regions' contents
/// \p VF times, each copy processing a single lane.
static void unrollReplicateRegions(VPlan &Plan, ElementCount VF);

/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
Expand All @@ -189,6 +193,8 @@ struct VPlanTransforms {
/// block merging.
LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);

static bool mergeBlocksIntoPredecessors(VPlan &Plan);

/// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
/// region block and remove the mask operand. Optimize the created regions by
/// iteratively sinking scalar operands into the region, followed by merging
Expand Down
208 changes: 206 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "vplan"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;
Expand Down Expand Up @@ -121,6 +124,7 @@ class UnrollState {
R->setOperand(OpIdx, getValueForPart(Op, Part));
}
};

} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
Expand All @@ -137,6 +141,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
remapOperands(&PartIR, Part);
if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
ScalarIVSteps->addOperand(getConstantInt(0));
ScalarIVSteps->addOperand(getConstantInt(Part));
}

Expand Down Expand Up @@ -526,9 +531,21 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
/*IsSingleScalar=*/true, /*Mask=*/nullptr,
*RepR, *RepR, RepR->getDebugLoc());
} else {
assert(isa<VPInstruction>(DefR) &&
assert((isa<VPInstruction, VPScalarIVStepsRecipe>(DefR)) &&
"DefR must be a VPReplicateRecipe or VPInstruction");
New = DefR->clone();
if (isa<VPScalarIVStepsRecipe>(New)) {
// Add or update lane operand for VPScalarIVStepsRecipe.
if (NewOps.size() == 3) {
NewOps.push_back(Plan.getConstantInt(IdxTy, 0));
New->addOperand(NewOps.back());
}
NewOps.push_back(Plan.getConstantInt(IdxTy, Lane.getKnownLane()));
New->addOperand(NewOps.back());
if (NewOps.size() == 5)
std::swap(NewOps[3], NewOps[4]);
}

for (const auto &[Idx, Op] : enumerate(NewOps)) {
New->setOperand(Idx, Op);
}
Expand Down Expand Up @@ -558,14 +575,27 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(&R) ||
(isa<VPReplicateRecipe>(&R) &&
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
(isa<VPInstruction>(&R) &&
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
continue;

if (isa<VPScalarIVStepsRecipe>(&R) && Plan.hasScalarVFOnly()) {
// Add lane operand to VPScalarIVStepsRecipe only when the plan is
// scalar.
if (R.getNumOperands() == 4) {
R.addOperand(R.getOperand(3));
R.setOperand(3, Plan.getConstantInt(IdxTy, 0));
} else {
R.addOperand(Plan.getConstantInt(IdxTy, 0));
R.addOperand(Plan.getConstantInt(IdxTy, 0));
}
continue;
}

auto *DefR = cast<VPSingleDefRecipe>(&R);
VPBuilder Builder(DefR);
if (DefR->getNumUsers() == 0) {
Expand Down Expand Up @@ -608,3 +638,177 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
for (auto *R : reverse(ToRemove))
R->eraseFromParent();
}

/// Process recipes in a single lane's blocks, updating them for lane-specific
/// operations.
///
/// For every block of the (dissolved) replicate region, first records a
/// mapping from each value defined by the original block's recipes to the
/// value defined by the corresponding recipe in this lane's copy, then
/// rewrites the copy's recipes: the lane operand of VPScalarIVStepsRecipe and
/// the index operand of lane-extracts are set to \p Lane, and any operand
/// defined inside the region is remapped to this lane's clone of it.
///
/// \param Plan          Enclosing VPlan, used to materialize constant lane
///                      indices of type \p IdxTy.
/// \param IdxTy         Integer type for lane-index constants.
/// \param Lane          Lane number to materialize into the copied recipes.
/// \param VF            Unused in this helper; kept to mirror the caller's
///                      signature. TODO confirm whether it can be dropped.
/// \param RegionBlocks  Blocks of the original region, in the order mappings
///                      are collected.
/// \param Old2NewBlocks Maps each original block to this lane's copy. For
///                      lane 0 the caller maps each block to itself, so the
///                      rewrites below happen in place.
static void processLane(VPlan &Plan, Type *IdxTy, unsigned Lane,
                        ElementCount VF, ArrayRef<VPBlockBase *> RegionBlocks,
                        DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewBlocks) {
  DenseMap<VPValue *, VPValue *> Old2NewVPValues;
  for (VPBlockBase *OldVPB : RegionBlocks) {
    auto *OldBB = cast<VPBasicBlock>(OldVPB);
    auto *NewBB = cast<VPBasicBlock>(Old2NewBlocks.lookup(OldVPB));
    // Record old->new value mappings for this block's recipes. NOTE(review):
    // mappings are only collected block-by-block, so an operand referencing a
    // value defined in a block visited later would not be remapped — this
    // relies on RegionBlocks visiting defs before cross-block uses; confirm
    // this holds for all replicate-region shapes.
    for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
      for (const auto &[OldV, NewV] :
           zip(OldR.definedValues(), NewR.definedValues()))
        Old2NewVPValues[OldV] = NewV;
    }

    // Update lane operands and remap operands to use copies for current lane.
    for (VPRecipeBase &NewR : make_early_inc_range(*NewBB)) {
      // Operand 3 of VPScalarIVStepsRecipe is the lane index (the caller has
      // ensured it is present before this runs).
      if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR))
        Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane));
      // Retarget extracts of lane 0 (created when processing lane 0) to this
      // lane. NOTE(review): this also matches any pre-existing
      // extractelement-of-0 in the region — confirm none can occur here.
      else if (match(&NewR, m_ExtractElement(m_VPValue(), m_ZeroInt())))
        NewR.setOperand(1, Plan.getConstantInt(IdxTy, Lane));

      // Remap operands to use lane-specific values.
      for (const auto &[I, Op] : enumerate(NewR.operands())) {
        // Use cloned value if operand was defined in the region.
        if (auto *New = Old2NewVPValues.lookup(Op))
          NewR.setOperand(I, New);
      }
    }
  }
}

/// Process a single lane: clone blocks (or reuse original for lane 0), collect
/// value mappings, and process recipes for lane-specific operations.
///
/// Lane 0 mutates the original region blocks in place; every other lane gets a
/// fresh clone of all blocks, internally connected to mirror the original
/// control flow. In both cases processLane() then rewrites lane operands and
/// remaps intra-region operands, and the lane's (entry, exiting) block pair is
/// appended to \p LaneClones for the caller to chain the lanes sequentially.
///
/// \param Plan         Enclosing VPlan, used to materialize lane constants.
/// \param IdxTy        Integer type for lane-index constants.
/// \param Lane         Lane being materialized; 0 reuses the original blocks.
/// \param VF           Vectorization factor; for scalar VF no extracts are
///                     created since there are no vector values to index.
/// \param RegionBlocks Blocks of the original (already dissolved) region.
/// \param Entry        Entry block of the original region.
/// \param Exiting      Exiting block of the original region.
/// \param LaneClones   Out parameter collecting one (entry, exiting) pair per
///                     lane, in lane order.
static void processSingleLane(
    VPlan &Plan, Type *IdxTy, unsigned Lane, ElementCount VF,
    ArrayRef<VPBlockBase *> RegionBlocks, VPBlockBase *Entry,
    VPBlockBase *Exiting,
    SmallVectorImpl<std::pair<VPBlockBase *, VPBlockBase *>> &LaneClones) {
  DenseMap<VPBlockBase *, VPBlockBase *> Old2NewBlocks;
  if (Lane == 0) {
    // Lane 0 uses the original blocks, and the recipes are adjusted:
    // VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
    // converted into BranchOnCond and extracts are created as needed.
    for (VPBlockBase *VPB : RegionBlocks) {
      // Map each block to itself so processLane() rewrites in place.
      Old2NewBlocks[VPB] = VPB;

      for (VPRecipeBase &NewR :
           make_early_inc_range(*cast<VPBasicBlock>(VPB))) {
        VPBuilder Builder(&NewR);
        for (const auto &[I, Op] : enumerate(NewR.operands())) {
          // Skip operands that don't need extraction: scalar VF (no vectors),
          // values defined in the same block (already scalar), or values that
          // are already single scalars.
          if (VF.isScalar() ||
              (Op->getDefiningRecipe() &&
               Op->getDefiningRecipe()->getParent() == VPB) ||
              vputils::isSingleScalar(Op))
            continue;

          // Extract the lane from values defined outside the region. For lane
          // 0 the index is 0; processLane() retargets it for later lanes.
          VPValue *Idx = Plan.getConstantInt(IdxTy, Lane);
          VPValue *Extract = Builder.createNaryOp(
              Instruction::ExtractElement, {Op, Idx}, NewR.getDebugLoc());
          NewR.setOperand(I, Extract);
        }

        if (auto *RepR = dyn_cast<VPReplicateRecipe>(&NewR)) {
          // Replace the per-lane replicate with an unpredicated single-scalar
          // one; each lane copy now executes exactly one instance.
          auto *New = new VPReplicateRecipe(
              RepR->getUnderlyingInstr(), RepR->operands(),
              /* IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR,
              RepR->getDebugLoc());
          New->insertBefore(RepR);
          RepR->replaceAllUsesWith(New);
          RepR->eraseFromParent();
        } else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(&NewR)) {
          // The mask has already been reduced to a scalar condition by the
          // extract logic above, so a plain conditional branch suffices.
          Builder.createNaryOp(VPInstruction::BranchOnCond,
                               {BranchOnMask->getOperand(0)},
                               BranchOnMask->getDebugLoc());
          BranchOnMask->eraseFromParent();
        } else if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) {
          // Add lane operand (4th operand) for VPScalarIVStepsRecipe if not
          // already present. Operand layout afterwards:
          // {BaseIV, Step, VF, Lane[, UnrollPart]}.
          unsigned NumOps = Steps->getNumOperands();
          if (NumOps == 4) {
            // Has UnrollPart at position 3, need to insert Lane before it.
            VPValue *UnrollPart = Steps->getOperand(3);
            Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane));
            Steps->addOperand(UnrollPart);
          } else if (NumOps == 3) {
            // Just BaseIV, Step, VF - add Lane.
            Steps->addOperand(Plan.getConstantInt(IdxTy, Lane));
            Steps->addOperand(Plan.getConstantInt(IdxTy, 0));
          }
        }
      }
    }
  } else {
    // Clone blocks and connect them according to original structure.
    for (VPBlockBase *OrigBlock : RegionBlocks) {
      VPBlockBase *ClonedBlock = OrigBlock->clone();
      Old2NewBlocks[OrigBlock] = ClonedBlock;
      ClonedBlock->setParent(Entry->getParent());
    }
    // Mirror the original intra-region edges; the exiting block's outgoing
    // edges are left unconnected so the caller can chain lanes sequentially.
    for (VPBlockBase *OrigBlock : RegionBlocks) {
      if (OrigBlock == Exiting)
        continue;
      for (VPBlockBase *OrigSucc : OrigBlock->successors())
        VPBlockUtils::connectBlocks(Old2NewBlocks[OrigBlock],
                                    Old2NewBlocks[OrigSucc]);
    }
  }

  processLane(Plan, IdxTy, Lane, VF, RegionBlocks, Old2NewBlocks);
  LaneClones.push_back({Old2NewBlocks[Entry], Old2NewBlocks[Exiting]});
}

void VPlanTransforms::unrollReplicateRegions(VPlan &Plan, ElementCount VF) {
  // Collect all replicate regions in the plan before modifying the CFG, since
  // the traversal below would be invalidated by the edits.
  SmallVector<VPRegionBlock *> ReplicateRegions;
  for (VPBlockBase *Block :
       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) {
    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
      if (Region->isReplicator())
        ReplicateRegions.push_back(Region);
    }
  }

  // Lane-index constants are materialized as i32.
  Type *IdxTy = IntegerType::get(Plan.getContext(), 32);

  for (VPRegionBlock *Region : ReplicateRegions) {
    // Unrolling creates one copy per lane, which requires a fixed lane count.
    assert(!VF.isScalable() && "cannot replicate across scalable VFs");

    VPBlockBase *Entry = Region->getEntry();
    VPBlockBase *Exiting = Region->getExiting();

    // Skip regions with live-outs as packing scalar results back into vectors
    // is not yet implemented.
    if (any_of(*cast<VPBasicBlock>(Exiting), IsaPred<VPPredInstPHIRecipe>))
      continue;

    // Get region context before dissolving.
    VPBlockBase *Pred = Region->getSinglePredecessor();
    assert(Pred && "Replicate region must have a single predecessor");
    SmallVector<VPBlockBase *> Successors(Region->successors());

    // Disconnect and dissolve the region.
    VPBlockUtils::disconnectBlocks(Pred, Region);
    for (VPBlockBase *Succ : Successors)
      VPBlockUtils::disconnectBlocks(Region, Succ);

    // Hoist the region's blocks into the parent region and splice the entry
    // where the region used to hang off Pred.
    SmallVector<VPBlockBase *> RegionBlocks(vp_depth_first_shallow(Entry));
    VPRegionBlock *ParentRegion = Region->getParent();
    for (VPBlockBase *Block : RegionBlocks)
      Block->setParent(ParentRegion);
    VPBlockUtils::connectBlocks(Pred, Entry);

    // Process each lane: clone blocks, collect value mappings, and process
    // recipes for lane-specific operations. Lane 0 reuses the original
    // blocks; lanes 1..VF-1 get clones.
    SmallVector<std::pair<VPBlockBase *, VPBlockBase *>> LaneClones;
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) {
      processSingleLane(Plan, IdxTy, Lane, VF, RegionBlocks, Entry, Exiting,
                        LaneClones);
    }

    // Connect lanes sequentially and connect last lane to successors.
    for (unsigned Lane = 1; Lane < VF.getKnownMinValue(); ++Lane)
      VPBlockUtils::connectBlocks(LaneClones[Lane - 1].second,
                                  LaneClones[Lane].first);
    for (VPBlockBase *Succ : Successors)
      VPBlockUtils::connectBlocks(LaneClones.back().second, Succ);
    // NOTE(review): the dissolved Region object itself is not erased here —
    // confirm ownership/cleanup is handled by the VPlan (or intentionally
    // deferred) so it does not leak.
  }
}
3 changes: 1 addition & 2 deletions llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 0
; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[IV]], 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP72]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 0
; CHECK-NEXT: store i8 [[TMP10]], ptr [[TMP9]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
Expand Down
Loading