@@ -428,6 +428,11 @@ class InnerLoopVectorizer {
   /// new unrolled loop, where UF is the unroll factor.
   using VectorParts = SmallVector<Value *, 2>;
 
+  /// Vectorize a single GetElementPtrInst based on information gathered and
+  /// decisions taken during planning.
+  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
+                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
+
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
@@ -3961,6 +3966,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
   }
 }
 
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
+                                   unsigned VF, bool IsPtrLoopInvariant,
+                                   SmallBitVector &IsIndexLoopInvariant) {
+  // Construct a vector GEP by widening the operands of the scalar GEP as
+  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+  // results in a vector of pointers when at least one operand of the GEP
+  // is vector-typed. Thus, to keep the representation compact, we only use
+  // vector-typed operands for loop-varying values.
+
+  if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+    // If we are vectorizing, but the GEP has only loop-invariant operands,
+    // the GEP we build (by only using vector-typed operands for
+    // loop-varying values) would be a scalar pointer. Thus, to ensure we
+    // produce a vector of pointers, we need to either arbitrarily pick an
+    // operand to broadcast, or broadcast a clone of the original GEP.
+    // Here, we broadcast a clone of the original.
+    //
+    // TODO: If at some point we decide to scalarize instructions having
+    // loop-invariant operands, this special case will no longer be
+    // required. We would add the scalarization decision to
+    // collectLoopScalars() and teach getVectorValue() to broadcast
+    // the lane-zero scalar value.
+    auto *Clone = Builder.Insert(GEP->clone());
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+      VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
+      addMetadata(EntryPart, GEP);
+    }
+  } else {
+    // If the GEP has at least one loop-varying operand, we are sure to
+    // produce a vector of pointers. But if we are only unrolling, we want
+    // to produce a scalar GEP for each unroll part. Thus, the GEP we
+    // produce with the code below will be scalar (if VF == 1) or vector
+    // (otherwise). Note that for the unroll-only case, we still maintain
+    // values in the vector mapping with initVector, as we do for other
+    // instructions.
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      // The pointer operand of the new GEP. If it's loop-invariant, we
+      // won't broadcast it.
+      auto *Ptr = IsPtrLoopInvariant
+                      ? GEP->getPointerOperand()
+                      : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+      // Collect all the indices for the new GEP. If any index is
+      // loop-invariant, we won't broadcast it.
+      SmallVector<Value *, 4> Indices;
+      for (auto Index : enumerate(GEP->indices())) {
+        Value *User = Index.value().get();
+        if (IsIndexLoopInvariant[Index.index()])
+          Indices.push_back(User);
+        else
+          Indices.push_back(getOrCreateVectorValue(User, Part));
+      }
+
+      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+      // but it should be a vector, otherwise.
+      auto *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+                                          Indices)
+              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+             "NewGEP is not a pointer vector");
+      VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
+      addMetadata(NewGEP, GEP);
+    }
+  }
+}
+
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                               unsigned VF) {
   PHINode *P = cast<PHINode>(PN);
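The IsPtrLoopInvariant and IsIndexLoopInvariant arguments consumed above are supplied by the VPWidenGEPRecipe introduced further down in this patch. The recipe's declaration lives in VPlan.h and is not part of this diff; the following is only a minimal sketch of its plausible shape (member names are taken from the execute() hunk below, printing and classof support are omitted, and the exact constructor wording is assumed). The point it illustrates is that loop-invariance is computed once against OrigLoop at planning time, so widenGEP() no longer needs to query the Loop analysis during code generation.

```cpp
// Sketch only -- assumes the VPRecipeBase / VPTransformState definitions and
// a VPWidenGEPSC recipe-kind enumerator already present in VPlan.h, as the
// rest of this patch implies.
class VPWidenGEPRecipe : public VPRecipeBase {
  GetElementPtrInst *GEP;
  bool IsPtrLoopInvariant;
  SmallBitVector IsIndexLoopInvariant;

public:
  VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop)
      : VPRecipeBase(VPWidenGEPSC), GEP(GEP),
        IsIndexLoopInvariant(GEP->getNumIndices(), false) {
    // Decide once, at planning time, which operands are loop-invariant.
    IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
    for (auto Index : enumerate(GEP->indices()))
      IsIndexLoopInvariant[Index.index()] =
          OrigLoop->isLoopInvariant(Index.value().get());
  }

  /// Generate the gep nodes.
  void execute(VPTransformState &State) override;
};
```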
@@ -4063,76 +4137,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   switch (I.getOpcode()) {
   case Instruction::Br:
   case Instruction::PHI:
+  case Instruction::GetElementPtr:
     llvm_unreachable("This instruction is handled by a different recipe.");
-  case Instruction::GetElementPtr: {
-    // Construct a vector GEP by widening the operands of the scalar GEP as
-    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
-    // results in a vector of pointers when at least one operand of the GEP
-    // is vector-typed. Thus, to keep the representation compact, we only use
-    // vector-typed operands for loop-varying values.
-    auto *GEP = cast<GetElementPtrInst>(&I);
-
-    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
-      // If we are vectorizing, but the GEP has only loop-invariant operands,
-      // the GEP we build (by only using vector-typed operands for
-      // loop-varying values) would be a scalar pointer. Thus, to ensure we
-      // produce a vector of pointers, we need to either arbitrarily pick an
-      // operand to broadcast, or broadcast a clone of the original GEP.
-      // Here, we broadcast a clone of the original.
-      //
-      // TODO: If at some point we decide to scalarize instructions having
-      // loop-invariant operands, this special case will no longer be
-      // required. We would add the scalarization decision to
-      // collectLoopScalars() and teach getVectorValue() to broadcast
-      // the lane-zero scalar value.
-      auto *Clone = Builder.Insert(GEP->clone());
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
-        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
-        addMetadata(EntryPart, GEP);
-      }
-    } else {
-      // If the GEP has at least one loop-varying operand, we are sure to
-      // produce a vector of pointers. But if we are only unrolling, we want
-      // to produce a scalar GEP for each unroll part. Thus, the GEP we
-      // produce with the code below will be scalar (if VF == 1) or vector
-      // (otherwise). Note that for the unroll-only case, we still maintain
-      // values in the vector mapping with initVector, as we do for other
-      // instructions.
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        // The pointer operand of the new GEP. If it's loop-invariant, we
-        // won't broadcast it.
-        auto *Ptr =
-            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
-                ? GEP->getPointerOperand()
-                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
-
-        // Collect all the indices for the new GEP. If any index is
-        // loop-invariant, we won't broadcast it.
-        SmallVector<Value *, 4> Indices;
-        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
-          if (OrigLoop->isLoopInvariant(U.get()))
-            Indices.push_back(U.get());
-          else
-            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
-        }
-
-        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
-        // but it should be a vector, otherwise.
-        auto *NewGEP =
-            GEP->isInBounds()
-                ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
-                                            Indices)
-                : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
-        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
-               "NewGEP is not a pointer vector");
-        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
-        addMetadata(NewGEP, GEP);
-      }
-    }
-
-    break;
-  }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
@@ -6831,7 +6837,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
   case Instruction::FPTrunc:
   case Instruction::FRem:
   case Instruction::FSub:
-  case Instruction::GetElementPtr:
   case Instruction::ICmp:
   case Instruction::IntToPtr:
   case Instruction::Load:
@@ -6896,12 +6901,13 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
 
   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
     return false;
-
   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
   // to avoid having to split recipes later.
   bool IsSingleton = Ingredient2Recipe.count(I);
 
-  // Success: widen this instruction. We optimize the common case where
+  // Success: widen this instruction.
+
+  // Use the default widening recipe. We optimize the common case where
   // consecutive instructions can be represented by a single recipe.
   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
       LastExtensibleRecipe->appendInstruction(I))
@@ -6999,7 +7005,23 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
     return true;
   }
 
-  // Check if Instr is to be widened by a general VPWidenRecipe.
+  // Handle GEP widening.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+    auto Scalarize = [&](unsigned VF) {
+      return CM.isScalarWithPredication(Instr, VF) ||
+             CM.isScalarAfterVectorization(Instr, VF) ||
+             CM.isProfitableToScalarize(Instr, VF);
+    };
+    if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
+      return false;
+    VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
+    setRecipe(Instr, Recipe);
+    VPBB->appendRecipe(Recipe);
+    return true;
+  }
+
+  // Check if Instr is to be widened by a general VPWidenRecipe, after
+  // having first checked for specific widening recipes.
   if (tryToWiden(Instr, VPBB, Range))
     return true;
 
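The Scalarize lambda above feeds LoopVectorizationPlanner::getDecisionAndClampRange, which decides for a whole range of candidate VFs at once. As a self-contained illustration of that contract (a simplified re-statement under assumed semantics, not code from this patch): the helper returns the decision taken at the low end of the VF range and clamps the range so that every remaining VF yields the same decision, which is what lets tryToCreateRecipe commit to a single recipe, or to scalarization, for the entire clamped range.

```cpp
// Standalone sketch of the assumed range-clamping behaviour; VFRange is a
// stand-in for the planner's half-open range of power-of-two VF candidates.
#include <cassert>
#include <functional>

struct VFRange {
  unsigned Start; // inclusive
  unsigned End;   // exclusive
};

static bool getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  assert(Range.End > Range.Start && "trying to test an empty VF range");
  // The decision reported for the whole range is the decision at its start.
  bool DecisionAtStart = Predicate(Range.Start);
  // Clamp the range at the first VF whose decision differs.
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != DecisionAtStart) {
      Range.End = VF;
      break;
    }
  return DecisionAtStart;
}

int main() {
  // Hypothetical cost model: scalarizing the GEP only pays off for VF >= 8.
  VFRange Range = {2, 16};
  bool Scalarize =
      getDecisionAndClampRange([](unsigned VF) { return VF >= 8; }, Range);
  // The "widen" decision holds for VF in {2, 4}; the range is clamped there.
  assert(!Scalarize && Range.End == 8);
  return 0;
}
```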
@@ -7241,7 +7263,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 
   SmallPtrSet<Instruction *, 1> DeadInstructions;
   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
-      Plan, Legal->getInductionVars(), DeadInstructions);
+      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
 
   return Plan;
 }
@@ -7271,6 +7293,11 @@ void VPWidenRecipe::execute(VPTransformState &State) {
   State.ILV->widenInstruction(Instr);
 }
 
+void VPWidenGEPRecipe::execute(VPTransformState &State) {
+  State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
+                      IsIndexLoopInvariant);
+}
+
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Int or FP induction being replicated.");
   State.ILV->widenIntOrFpInduction(IV, Trunc);