Skip to content
4 changes: 1 addition & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// The hardware supports 32-bit FSHR, but not FSHL.
setOperationAction(ISD::FSHR, MVT::i32, Legal);

// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
setOperationAction(ISD::ROTR, MVT::i64, Expand);
setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);

setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

Expand Down
6 changes: 0 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -806,12 +806,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
(vt rc:$addr)
>;

// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(rotr i32:$src0, i32:$src1),
(BIT_ALIGN $src0, $src0, $src1)
>;

// Special conversion patterns

def cvt_rpi_i32_f32 : PatFrag <
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/EvergreenInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,6 @@ def : AMDGPUPat <
(fshr i32:$src0, i32:$src1, i32:$src2),
(BIT_ALIGN_INT_eg $src0, $src1, $src2)
>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
def FMA_eg : FMA_Common<0x7>;
Expand Down
27 changes: 0 additions & 27 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -2663,8 +2663,6 @@ def : AMDGPUPat <

let True16Predicate = NotHasTrue16BitInsts in {
let SubtargetPredicate = isNotGFX9Plus in {
def : ROTRPattern <V_ALIGNBIT_B32_e64>;

def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
Expand All @@ -2675,14 +2673,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
} // isNotGFX9Plus

let SubtargetPredicate = isGFX9GFX10 in {
def : GCNPat <
(rotr i32:$src0, i32:$src1),
(V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
/* src1_modifiers */ 0, $src0,
/* src2_modifiers */ 0,
$src1, /* clamp */ 0, /* op_sel */ 0)
>;

foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
def : GCNPat<pat,
Expand All @@ -2704,15 +2694,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = NotHasTrue16BitInsts

let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
(rotr i32:$src0, i32:$src1),
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
/* src1_modifiers */ 0, $src0,
/* src2_modifiers */ 0,
(EXTRACT_SUBREG $src1, lo16),
/* clamp */ 0, /* op_sel */ 0)
>;

def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
Expand All @@ -2731,14 +2712,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
} // end True16Predicate = UseRealTrue16Insts

let True16Predicate = UseFakeTrue16Insts in {
def : GCNPat <
(rotr i32:$src0, i32:$src1),
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
/* src1_modifiers */ 0, $src0,
/* src2_modifiers */ 0,
$src1, /* clamp */ 0, /* op_sel */ 0)
>;

def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/packetizer.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s
; XFAIL: *

; CHECK: {{^}}test:
; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
Expand Down
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AMDGPU/permute_i8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -353,16 +353,17 @@ define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040706
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are just more instances of a problem introduced by #70240 and already noted here:

; FIXME: produce v_alignbit_b32 v2, v2, s0, 24 instead of v_perm

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have a plan for these? It feels like you're going to end up adding fshr peepholes to replace the rotr ones?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have a plan for these?

My plan is to hope @jrbyrnes picks it up :) At the moment I don't understand why his #70240 removed this check of yours:

// Check that we haven't just recreated the same FSHR node.

It feels like you're going to end up adding fshr peepholes to replace the rotr ones?

I don't think there are any peepholes. It is just that PerformDAGCombine's FSHR case calls matchPERM but the ROTR case does not.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reinstated the check for recreating the same FSHR node.

; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle5341ud2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, 0x5040706
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16
; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand Down
15 changes: 7 additions & 8 deletions llvm/test/CodeGen/AMDGPU/shl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1470,21 +1470,20 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
;
; EG-LABEL: s_shl_inline_imm_1_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 31(4.344025e-44), 26(3.643376e-44)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: LSHL * T0.W, 1, PV.W,
; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.Y, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, 0.0, PS,
; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T0.W, 1, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, 0.0,
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 1, %a
Expand Down