[InterleavedAccess] Construct interleaved access store with shuffles #167737
base: main
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64

Author: Ramkrishnan (ram-NK)

Changes: Re-land of reverted PR #164000. Added the reported failure cases and fixed the issues.

Patch is 41.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167737.diff

8 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index cec7d09f494d6..87809a27fe913 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3232,6 +3232,9 @@ class LLVM_ABI TargetLoweringBase { /// Default to be the minimum interleave factor: 2. virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; } + /// Return true if the target has interleave with shuffles. + virtual bool hasInterleaveWithGatherScatter() const { return false; } + /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 5c27a20869f81..bfb875a01f29f 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, - unsigned MaxFactor) { + unsigned MaxFactor, + bool InterleaveWithShuffles) { unsigned NumElts = SVI->getShuffleMask().size(); if (NumElts < 4) return false; @@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return true; } + if (InterleaveWithShuffles) { + for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) { + Factor = i * MaxFactor; + if (SVI->isInterleave(Factor)) + return true; + } + } return false; } @@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore( cast<FixedVectorType>(SVI->getType())->getNumElements(); // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; - if (!isReInterleaveMask(SVI, Factor, MaxFactor)) + if (!isReInterleaveMask(SVI, Factor, MaxFactor, + TLI->hasInterleaveWithGatherScatter())) return false; assert(NumStoredElements % Factor == 0 && "number of stored element should be a multiple of Factor"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index eaa10ef031989..996d3a22fd5f1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -97,6 +97,7 @@ #include <cctype> #include <cstdint> #include <cstdlib> +#include <deque> #include <iterator> #include <limits> #include <optional> @@ -18010,12 +18011,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const { - - assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && - "Invalid interleave factor"); + assert(Factor >= 2 && "Invalid interleave factor"); auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; + + if (Factor > getMaxSupportedInterleaveFactor()) + return lowerInterleavedStoreWithShuffle(SI, SVI, Factor); + assert(!LaneMask && GapMask.popcount() == Factor && "Unexpected mask on store"); @@ -18161,6 +18164,146 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, return true; } +/// If the interleaved vector elements are greater than supported MaxFactor, +/// interleaving the data with additional shuffles can be used to +/// achieve the same. +/// +/// Consider the following data with 8 interleaves which are shuffled to store +/// stN instructions. 
Data needs to be stored in this order: +/// [v0, v1, v2, v3, v4, v5, v6, v7] +/// +/// v0 v4 v2 v6 v1 v5 v3 v7 +/// | | | | | | | | +/// \ / \ / \ / \ / +/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4 +/// | | | | +/// \ / \ / +/// \ / \ / +/// \ / \ / +/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2 +/// +/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored +/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with +/// another st4. +/// +/// For stN = 2, upper half of interleaved data V0, V1 is stored +/// with one st2 instruction. Second set V2, V3 is stored with another st2. +/// Total of 4 st2's are required here. +bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle( + StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { + unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor(); + + auto *VecTy = cast<FixedVectorType>(SVI->getType()); + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); + + unsigned LaneLen = VecTy->getNumElements() / Factor; + Type *EltTy = VecTy->getElementType(); + auto *SubVecTy = FixedVectorType::get(EltTy, Factor); + + const DataLayout &DL = SI->getModule()->getDataLayout(); + bool UseScalable; + + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || + !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) + return false; + + if (UseScalable) + return false; + + std::deque<Value *> Shuffles; + Shuffles.push_back(SVI); + unsigned ConcatLevel = Factor; + // Getting all the interleaved operands. + while (ConcatLevel > 1) { + unsigned InterleavedOperands = Shuffles.size(); + for (unsigned Ops = 0; Ops < InterleavedOperands; Ops++) { + auto *V = Shuffles.front(); + Shuffles.pop_front(); + if (isa<ConstantAggregateZero, UndefValue>(V)) { + VectorType *Ty = cast<VectorType>(V->getType()); + auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); + Value *SplitValue = nullptr; + if (isa<ConstantAggregateZero>(V)) + SplitValue = ConstantAggregateZero::get(HalfTy); + else if (isa<PoisonValue>(V)) + SplitValue = PoisonValue::get(HalfTy); + else if (isa<UndefValue>(V)) + SplitValue = UndefValue::get(HalfTy); + Shuffles.push_back(SplitValue); + Shuffles.push_back(SplitValue); + continue; + } + + ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(V); + if (!SFL) + return false; + if (SVI != SFL && !SFL->isConcat()) + return false; + + Value *Op0 = SFL->getOperand(0); + Value *Op1 = SFL->getOperand(1); + + Shuffles.push_back(dyn_cast<Value>(Op0)); + Shuffles.push_back(dyn_cast<Value>(Op1)); + } + ConcatLevel >>= 1; + } + + IRBuilder<> Builder(SI); + auto Mask = createInterleaveMask(LaneLen, 2); + SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen); + for (unsigned Idx = 0; Idx < LaneLen; Idx++) { + LowerHalfMask[Idx] = Mask[Idx]; + UpperHalfMask[Idx] = Mask[Idx + LaneLen]; + } + + unsigned InterleaveFactor = Factor >> 1; + while (InterleaveFactor >= MaxSupportedFactor) { + std::deque<Value *> ShufflesIntermediate; + ShufflesIntermediate.resize(Factor); + for (unsigned Idx = 0; Idx < Factor; Idx += (InterleaveFactor * 2)) { + for (unsigned GroupIdx = 0; GroupIdx < InterleaveFactor; GroupIdx++) { + auto *Shuffle = Builder.CreateShuffleVector( + Shuffles[Idx + GroupIdx], + Shuffles[Idx + GroupIdx + InterleaveFactor], LowerHalfMask); + ShufflesIntermediate[Idx + GroupIdx] = 
Shuffle; + Shuffle = Builder.CreateShuffleVector( + Shuffles[Idx + GroupIdx], + Shuffles[Idx + GroupIdx + InterleaveFactor], UpperHalfMask); + ShufflesIntermediate[Idx + GroupIdx + InterleaveFactor] = Shuffle; + } + } + Shuffles = ShufflesIntermediate; + InterleaveFactor >>= 1; + } + + Type *PtrTy = SI->getPointerOperandType(); + auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); + + Value *BaseAddr = SI->getPointerOperand(); + Function *StNFunc = getStructuredStoreFunction( + SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy); + for (unsigned N = 0; N < (Factor / MaxSupportedFactor); N++) { + SmallVector<Value *, 5> Ops; + for (unsigned OpIdx = 0; OpIdx < MaxSupportedFactor; OpIdx++) + Ops.push_back(Shuffles[N * MaxSupportedFactor + OpIdx]); + + if (N > 0) { + // We will compute the pointer operand of each store from the original + // base address using GEPs. Cast the base address to a pointer to the + // scalar element type. + BaseAddr = Builder.CreateConstGEP1_32( + SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor); + } + Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); + Builder.CreateCall(StNFunc, Ops); + } + return true; +} + bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Instruction *Load, Value *Mask, IntrinsicInst *DI) const { const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 70bfae717fb76..2793022d9e240 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -229,6 +229,8 @@ class AArch64TargetLowering : public TargetLowering { bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override; + bool hasInterleaveWithGatherScatter() const override { return true; } + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool lowerInterleavedLoad(Instruction *Load, Value *Mask, @@ -239,6 +241,9 @@ class AArch64TargetLowering : public TargetLowering { ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override; + bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI, + unsigned Factor) const; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 197aae6e03cb1..5d2f33ab8d62e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4922,11 +4922,35 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) return InstructionCost::getInvalid(); - if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { + unsigned NumLoadStores = 1; + InstructionCost ShuffleCost = 0; + bool isInterleaveWithShuffle = false; + unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor(); + + auto *SubVecTy = + VectorType::get(VecVTy->getElementType(), + VecVTy->getElementCount().divideCoefficientBy(Factor)); + + if (TLI->hasInterleaveWithGatherScatter() && Opcode == Instruction::Store && + (0 == Factor % MaxSupportedFactor) && Factor > MaxSupportedFactor) { + isInterleaveWithShuffle = true; + SmallVector<int, 16> Mask; + // preparing interleave Mask. 
+ for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2; + i++) { + for (unsigned j = 0; j < 2; j++) + Mask.push_back(j * Factor + i); + } + + NumLoadStores = Factor / MaxSupportedFactor; + ShuffleCost = + (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy, + Mask, CostKind, 0, SubVecTy)); + } + + if (!UseMaskForGaps && + (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) { unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); - auto *SubVecTy = - VectorType::get(VecVTy->getElementType(), - VecVTy->getElementCount().divideCoefficientBy(Factor)); // ldN/stN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be @@ -4934,7 +4958,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( bool UseScalable; if (MinElts % Factor == 0 && TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); + return (Factor * + TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) * + NumLoadStores) + + ShuffleCost; } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll index 3685e9cf85bd6..7af1c8e9365ed 100644 --- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll +++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll @@ -730,6 +730,244 @@ entry: ret void } +define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) { +; CHECK-LABEL: store_factor8: +; CHECK: .Lfunc_begin17: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]] +; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]] +; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]] +; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]] +; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]] +; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]] +; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]] +; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]] +; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64 +; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0] +; CHECK-NEXT: ret + + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> + store <32 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + 
+define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7, + <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11, + <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) { +; CHECK-LABEL: store_factor16: +; CHECK: .Lfunc_begin18: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]] +; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]] +; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]] +; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]] +; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]] +; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]] +; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]] +; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]] +; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]] +; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]] +; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]] +; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]] +; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]] +; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]] +; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]] +; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]] +; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]] +; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]] +; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]] +; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]] +; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]] +; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]] +; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]] +; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]] +; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]] +; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]] +; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]] +; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]] +; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]] +; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]] +; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]] +; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]] +; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8] +; CHECK-NEXT: add x8, x0, #128 +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8] +; CHECK-NEXT: add x8, x0, #192 +; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8] +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v7 = shufflevector <4 x i32> %a14, <4 x i32> 
%a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + + %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3,... [truncated] |
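To make the lowering strategy in lowerInterleavedStoreWithShuffle above easier to follow, here is a small self-contained C++ model (not LLVM code; the 4-lane vectors and element values are illustrative assumptions) of ZIP1/ZIP2 and st4. It checks that one level of zips feeding two st4-style stores reproduces a factor-8 interleaved store, which is exactly the shape the store_factor8 test expects:

#include <array>
#include <cassert>
#include <vector>

using Vec4 = std::array<int, 4>;

// zip1/zip2 model the AArch64 ZIP1/ZIP2 instructions on 4-lane vectors.
static Vec4 zip1(const Vec4 &A, const Vec4 &B) { return {A[0], B[0], A[1], B[1]}; }
static Vec4 zip2(const Vec4 &A, const Vec4 &B) { return {A[2], B[2], A[3], B[3]}; }

// st4 models a structured store of four registers: memory receives
// R0[0], R1[0], R2[0], R3[0], R0[1], R1[1], ...
static void st4(std::vector<int> &Mem, unsigned Off, const Vec4 R[4]) {
  for (unsigned Lane = 0; Lane < 4; ++Lane)
    for (unsigned Reg = 0; Reg < 4; ++Reg)
      Mem[Off + Lane * 4 + Reg] = R[Reg][Lane];
}

int main() {
  // Eight source vectors v0..v7; each element value encodes (vector, lane).
  Vec4 V[8];
  for (int I = 0; I < 8; ++I)
    for (int L = 0; L < 4; ++L)
      V[I][L] = I * 10 + L;

  // One level of zips pairs vI with vI+4, matching the diagram for stN = 4.
  Vec4 Lo[4], Hi[4];
  for (int I = 0; I < 4; ++I) {
    Lo[I] = zip1(V[I], V[I + 4]); // lanes 0..1 interleaved
    Hi[I] = zip2(V[I], V[I + 4]); // lanes 2..3 interleaved
  }

  // Two st4s; the second starts at LaneLen * MaxSupportedFactor = 4 * 4 = 16
  // elements, matching the GEP that advances the base pointer in the patch.
  std::vector<int> Mem(32);
  st4(Mem, 0, Lo);
  st4(Mem, 16, Hi);

  // The result must equal the factor-8 interleave v0[j], v1[j], ..., v7[j].
  for (int Lane = 0; Lane < 4; ++Lane)
    for (int I = 0; I < 8; ++I)
      assert(Mem[Lane * 8 + I] == V[I][Lane]);
  return 0;
}

For factor 16 the same idea is applied twice: one extra zip level first reduces the problem to two factor-8 groups, which is why store_factor16 above ends up with 32 zips and four st4s.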
✅ With the latest revision this PR passed the undef deprecator.
Force-pushed from 0efaebf to fdcb54e.
Rajveer100 left a comment
Can you add a few test cases that caused the assertion failure including the original reproducer?
- [AArch64]: Interleaved access stores can handle more elements than the target-supported maximum interleave factor by using shuffles.
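For reference, the factor detection this relies on (the new loop in isReInterleaveMask) accepts power-of-two multiples of the target's native maximum factor up to 16, i.e. 4, 8 and 16 on AArch64. Below is a condensed stand-alone sketch of that probing order (plain C++, not the actual LLVM helper; IsInterleave stands in for ShuffleVectorInst::isInterleave):

#include <functional>
#include <optional>

std::optional<unsigned>
findReInterleaveFactor(unsigned MaxFactor,
                       const std::function<bool(unsigned)> &IsInterleave) {
  // Existing behaviour: try the native factors 2..MaxFactor first.
  for (unsigned F = 2; F <= MaxFactor; ++F)
    if (IsInterleave(F))
      return F;
  // New behaviour when the target opts in: also probe MaxFactor * 2^k,
  // capped at 16 (for MaxFactor = 4 this means 4, 8, 16).
  for (unsigned I = 1; MaxFactor * I <= 16; I *= 2)
    if (IsInterleave(MaxFactor * I))
      return MaxFactor * I;
  return std::nullopt;
}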
Force-pushed from fdcb54e to 102b917.
Force-pushed from 4fa2bbd to 5122611.
Force-pushed from 5122611 to d4b120b.
@Rajveer100, more failure test cases have been added. @fhahn, one of the loops in @scalar_store_cost_after_discarding_interleave_group() in llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll is now vectorized. The corresponding IR has been added as @store_factor8_1() in llvm/test/CodeGen/AArch64/vldn_shuffle.ll for verification. Better assembly is generated with this patch.
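For readers following the cost-model side: the getInterleavedMemoryOpCost change above prices a too-wide interleaved store as the stN cost times the number of st4 groups, plus the extra zip shuffles. A rough, non-authoritative model with placeholder unit costs (the real values come from getNumInterleavedAccesses and getShuffleCost) for a <32 x i32> store with factor 8:

#include <cstdio>

int main() {
  unsigned Factor = 8;              // interleave factor of the group
  unsigned MaxSupportedFactor = 4;  // AArch64 st2/st3/st4 limit
  unsigned AccessesPerStN = 1;      // a <4 x i32> sub-vector is one 128-bit stN
  unsigned ShuffleUnitCost = 1;     // assumed cost of a single zip-style shuffle

  unsigned NumLoadStores = Factor / MaxSupportedFactor;      // 2 st4 groups
  unsigned ShuffleCost = Factor * ShuffleUnitCost;           // 8 zips
  unsigned Cost = Factor * AccessesPerStN * NumLoadStores + ShuffleCost;
  std::printf("modelled factor-8 store cost: %u\n", Cost);   // 24 with these numbers
  return 0;
}

Because the group now receives a finite interleaved-store cost instead of falling back to a scalarized estimate, the loop mentioned above can become profitable for the vectorizer.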
nasherm left a comment
This patch looks good to me. I'm hesitant to approve as it touches a pass I don't have much experience with, but I'd be happy to approve if no one else has any objections.
    return true;
  }

  if (InterleaveWithShuffles) {
Is it possible to avoid the hasInterleaveWithGatherScatter TTI hook? Can you just increase AArch64's getMaxSupportedInterleaveFactor and check if it's legal in AArch64TargetLowering::lowerInterleavedStore?
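One possible shape of that suggestion, sketched with hypothetical helpers (emitStructuredStore and emitZipTreeThenStructuredStores are illustrative stand-ins, not real AArch64 methods), would be to report a larger maximum factor and let the store-lowering path choose between a plain stN and the zip-based expansion:

#include <cstdio>

// Stand-ins for the two lowering paths in the patch.
static bool emitStructuredStore(unsigned Factor) {               // st2/st3/st4
  std::printf("st%u\n", Factor);
  return true;
}
static bool emitZipTreeThenStructuredStores(unsigned Factor) {   // zips + st4s
  std::printf("zip tree feeding %u x st4\n", Factor / 4);
  return true;
}

static bool lowerStoreForFactor(unsigned Factor, bool HasNEON) {
  constexpr unsigned NativeMaxFactor = 4;    // what st2/st3/st4 support
  constexpr unsigned ReportedMaxFactor = 16; // what the hook would now return
  if (Factor <= NativeMaxFactor)
    return emitStructuredStore(Factor);
  if (HasNEON && Factor <= ReportedMaxFactor && Factor % NativeMaxFactor == 0)
    return emitZipTreeThenStructuredStores(Factor);
  return false; // fall back to the generic path
}

int main() { return lowerStoreForFactor(8, /*HasNEON=*/true) ? 0 : 1; }

Whether this is preferable depends on how many other callers key off getMaxSupportedInterleaveFactor, which is presumably why the patch adds a separate opt-in hook instead.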
My takeaway from this PR is that it is vastly complicated for what it is trying to achieve. I'll caveat my comment by saying I don't much like this pass. From what I can see you are tackling two independent problems:

Perhaps the work can be split on those boundaries because, looking at the original test output, I suspect (1) could be a significant win without necessarily needing (2).

Just thinking on (2), I wonder if instead of the code rewriting the shuffle it could instead shuffle the original shuffled data (i.e. to account for the implicit shuffle performed by the structure store)? That seems like a simpler implementation, but I don't know if it will result in the same generated code. That said, if it does not, then to me that is a sign of us not doing a good enough job of optimising shuffle sequences, which kind of brings us back to (1).

As a final thought, we do have the (de)interleave intrinsics, which might be easier to work with? [1] I have raised #169700 so the bigger power-of-two variants work for NEON. I mention this because in some ways the hard work has been done by the InterleavePass in identifying the (de)interleaving factor, and so it might be easier to replace the shuffle with a call to the equivalent intrinsic. Then you can leave the code generator to emit optimised zip/uzp patterns.

[1] I don't know which factors you're most interested in, but previously it was suggested that we should introduce intrinsics where the factor is passed in (rather than having an intrinsic per factor). I think such an approach would make my suggestions above easier still.
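As a rough illustration of the intrinsic-based direction (a sketch assuming only llvm.vector.interleave2 is used; with wider-factor interleave intrinsics the nesting would disappear), a factor-4 re-interleave could be built with IRBuilder and left to the backend to emit zip/uzp or stN patterns:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Interleave four equally sized vectors A0..A3 into one wide vector using
// nested llvm.vector.interleave2 calls, mirroring the zip tree in the patch.
static Value *interleave4(IRBuilder<> &B, Value *A0, Value *A1, Value *A2,
                          Value *A3) {
  auto *VT = cast<VectorType>(A0->getType());
  auto *VT2 = VectorType::getDoubleElementsVectorType(VT);  // 2x lanes
  auto *VT4 = VectorType::getDoubleElementsVectorType(VT2); // 4x lanes
  // First level pairs (A0, A2) and (A1, A3): [a0,c0,a1,c1,...], [b0,d0,b1,d1,...].
  Value *Lo = B.CreateIntrinsic(Intrinsic::vector_interleave2, {VT2}, {A0, A2});
  Value *Hi = B.CreateIntrinsic(Intrinsic::vector_interleave2, {VT2}, {A1, A3});
  // Second level merges them into [a0,b0,c0,d0,a1,b1,c1,d1,...].
  return B.CreateIntrinsic(Intrinsic::vector_interleave2, {VT4}, {Lo, Hi});
}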
This sounds viable to me.
@paulwalker-arm Thanks for reviewing it. Let me try to rewrite it with your suggestions.