@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
// getMaximumVF: for Load/Store opcodes the result is 32 * 4 / ElemWidth; for
// any other opcode it is chosen from ElemWidth and subtarget feature checks.
// In this hunk the `-` lines are the old selection (16-bit and 32-bit cases
// only) and the `+` lines are its replacement, which adds an 8-bit case
// returning 4 when ST->has16BitInsts() holds.
344344unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
347+ // For a given width return the max number of elements that can be combined
348+ // into a wider bit value:
349+ return (ElemWidth == 8 && ST->has16BitInsts ()) ? 4
350+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
351+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
352+ : 1 ;
350353}
351354
352355unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1195,14 +1198,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11951198
11961199 Kind = improveShuffleKindFromMask (Kind, Mask, SrcTy, Index, SubTp);
11971200
1198- // Larger vector widths may require additional instructions, but are
1199- // typically cheaper than scalarized versions.
1200- unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements ();
1201+ unsigned ScalarSize = DL.getTypeSizeInBits (SrcTy->getElementType ());
12011202 if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1202- DL.getTypeSizeInBits (SrcTy->getElementType ()) == 16 ) {
1203- bool HasVOP3P = ST->hasVOP3PInsts ();
1203+ (ScalarSize == 16 || ScalarSize == 8 )) {
1204+ // Larger vector widths may require additional instructions, but are
1205+ // typically cheaper than scalarized versions.
1206+ unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements ();
12041207 unsigned RequestedElts =
12051208 count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1209+ unsigned EltsPerReg = 32 / ScalarSize;
12061210 if (RequestedElts == 0 )
12071211 return 0 ;
12081212 switch (Kind) {
@@ -1211,9 +1215,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12111215 case TTI::SK_PermuteSingleSrc: {
12121216 // With op_sel VOP3P instructions freely can access the low half or high
12131217 // half of a register, so any swizzle of two elements is free.
1214- if (HasVOP3P && NumVectorElts == 2 )
1218+ if (ST-> hasVOP3PInsts () && ScalarSize == 16 && NumVectorElts == 2 )
12151219 return 0 ;
1216- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1220+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
12171221 // SK_Broadcast just reuses the same mask
12181222 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
12191223 return NumPerms + NumPermMasks;
@@ -1225,12 +1229,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12251229 return 0 ;
12261230 // Insert/extract subvectors only require shifts / extract code to get the
12271231 // relevant bits
1228- return alignTo (RequestedElts, 2 ) / 2 ;
1232+ return alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
12291233 }
12301234 case TTI::SK_PermuteTwoSrc:
12311235 case TTI::SK_Splice:
12321236 case TTI::SK_Select: {
1233- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1237+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
12341238 // SK_Select just reuses the same mask
12351239 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
12361240 return NumPerms + NumPermMasks;
@@ -1505,3 +1509,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
15051509 return AMDGPU::isShader (F->getCallingConv ()) ? KnownIEEEMode::Off
15061510 : KnownIEEEMode::On;
15071511}
1512+
// getMemoryOpCost (function added in full by this patch).
// Special case: when Src is a VectorType whose element type is an 8-bit
// integer and Opcode is Load or Store, the cost is
//   divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
//              getLoadStoreVecRegBitWidth(AddressSpace)).
// Every other combination is forwarded unchanged to BaseT::getMemoryOpCost.
// NOTE(review): the numerator is the vector's bit size minus one. Presumably
// divideCeil is a round-up division, in which case subtracting one first
// lowers the result whenever the remainder would have been exactly 1 --
// confirm this is intended rather than an off-by-one.
1513+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1514+ Align Alignment,
1515+ unsigned AddressSpace,
1516+ TTI::TargetCostKind CostKind,
1517+ TTI::OperandValueInfo OpInfo,
1518+ const Instruction *I) const {
1519+ if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1520+ if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1521+ VecTy->getElementType ()->isIntegerTy (8 )) {
1522+ return divideCeil (DL.getTypeSizeInBits (VecTy) - 1 ,
1523+ getLoadStoreVecRegBitWidth (AddressSpace));
1524+ }
1525+ }
1526+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1527+ OpInfo, I);
1528+ }
1529+
// getNumberOfParts (function added in full by this patch).
// Special case: when Tp is a VectorType whose element type is an 8-bit
// integer, the result is divideCeil(ElementCount - 1, 4), where ElementCount
// is read with getFixedValue(). All other types are forwarded to
// BaseT::getNumberOfParts.
// NOTE(review): because one is subtracted before dividing by 4, an element
// count of 1 evaluates divideCeil(0, 4) and a count of 5 evaluates
// divideCeil(4, 4). Presumably divideCeil rounds up, so counts of the form
// 4k + 1 yield k parts rather than k + 1 -- confirm this is intended.
// NOTE(review): getFixedValue() presumably expects a non-scalable element
// count; verify that scalable vectors cannot reach this path.
1530+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) const {
1531+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1532+ if (VecTy->getElementType ()->isIntegerTy (8 )) {
1533+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1534+ return divideCeil (ElementCount - 1 , 4 );
1535+ }
1536+ }
1537+ return BaseT::getNumberOfParts (Tp);
1538+ }
0 commit comments