Skip to content

Commit 142efd6

Browse files
committed
[AMDGPU] Add ISD::FSHR Handling to AMDGPUISD::PERM matching
Pulled out of D159533, which encourages (zext (trunc x)) -> x folds, leading to more ISD::FSHR nodes, which was breaking some existing AMDGPUISD::PERM tests. Differential Revision: https://reviews.llvm.org/D159533
1 parent 4c241a9 commit 142efd6

File tree

2 files changed

+49
-11
lines changed

2 files changed

+49
-11
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
789789
ISD::AND,
790790
ISD::OR,
791791
ISD::XOR,
792+
ISD::FSHR,
792793
ISD::SINT_TO_FP,
793794
ISD::UINT_TO_FP,
794795
ISD::FCANONICALIZE,
@@ -10773,6 +10774,30 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
1077310774
return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
1077410775
}
1077510776

10777+
case ISD::FSHR: {
10778+
// fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
10779+
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
10780+
if (!ShiftOp || Op.getValueType().isVector())
10781+
return std::nullopt;
10782+
10783+
uint64_t BitsProvided = Op.getValueSizeInBits();
10784+
if (BitsProvided % 8 != 0)
10785+
return std::nullopt;
10786+
10787+
uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
10788+
if (BitShift % 8)
10789+
return std::nullopt;
10790+
10791+
uint64_t ConcatSizeInBytes = BitsProvided / 4;
10792+
uint64_t ByteShift = BitShift / 8;
10793+
10794+
uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
10795+
uint64_t BytesProvided = BitsProvided / 8;
10796+
SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
10797+
NewIndex %= BytesProvided;
10798+
return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
10799+
}
10800+
1077610801
case ISD::SRA:
1077710802
case ISD::SRL: {
1077810803
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -11053,6 +11078,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
1105311078
SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
1105411079
: *PermNodes[FirstSrc].Src;
1105511080

11081+
// Check that we haven't just recreated the same FSHR node.
11082+
if (N->getOpcode() == ISD::FSHR &&
11083+
(N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
11084+
(N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
11085+
return SDValue();
11086+
1105611087
// Check that we are not just extracting the bytes in order from an op
1105711088
if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
1105811089
int Low16 = PermMask & 0xffff;
@@ -13061,6 +13092,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
1306113092
return performAndCombine(N, DCI);
1306213093
case ISD::OR:
1306313094
return performOrCombine(N, DCI);
13095+
case ISD::FSHR: {
13096+
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13097+
if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
13098+
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13099+
return matchPERM(N, DCI);
13100+
}
13101+
break;
13102+
}
1306413103
case ISD::XOR:
1306513104
return performXorCombine(N, DCI);
1306613105
case ISD::ZERO_EXTEND:

llvm/test/CodeGen/AMDGPU/permute_i8.ll

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,13 +1234,12 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
12341234
; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12351235
; GFX10-NEXT: s_waitcnt vmcnt(0)
12361236
; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1237-
; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1237+
; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
12381238
; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
1239-
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
1240-
; GFX10-NEXT: v_alignbit_b32 v0, v0, v10, 16
1241-
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1242-
; GFX10-NEXT: global_store_dword v[5:6], v1, off
1243-
; GFX10-NEXT: global_store_dword v[7:8], v0, off
1239+
; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1240+
; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706
1241+
; GFX10-NEXT: global_store_dword v[5:6], v0, off
1242+
; GFX10-NEXT: global_store_dword v[7:8], v1, off
12441243
; GFX10-NEXT: s_setpc_b64 s[30:31]
12451244
;
12461245
; GFX9-LABEL: ive_store_div:
@@ -1256,18 +1255,18 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
12561255
; GFX9-NEXT: global_load_dword v10, v[2:3], off
12571256
; GFX9-NEXT: s_movk_i32 s4, 0xff
12581257
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
1258+
; GFX9-NEXT: s_mov_b32 s5, 0x2000706
12591259
; GFX9-NEXT: s_waitcnt vmcnt(1)
12601260
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9
1261-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
12621261
; GFX9-NEXT: s_waitcnt vmcnt(0)
12631262
; GFX9-NEXT: v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1264-
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1263+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
12651264
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
1266-
; GFX9-NEXT: v_alignbit_b32 v2, v1, v10, 16
1267-
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1265+
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
12681266
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1267+
; GFX9-NEXT: v_perm_b32 v3, v10, v9, s5
12691268
; GFX9-NEXT: global_store_dword v[5:6], v0, off
1270-
; GFX9-NEXT: global_store_dword v[7:8], v2, off
1269+
; GFX9-NEXT: global_store_dword v[7:8], v3, off
12711270
; GFX9-NEXT: s_waitcnt vmcnt(0)
12721271
; GFX9-NEXT: s_setpc_b64 s[30:31]
12731272
%tid = call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)