Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,40 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
return false;
}

static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
llvm::SelectionDAG *CurDAG,
const GCNSubtarget *Subtarget) {
if (!Subtarget->useRealTrue16Insts()) {
return Lo;
}

SDValue NewSrc;
SDLoc SL(Lo);

if (Lo->isDivergent()) {
SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
SL, Lo.getValueType()),
0);
const SDValue Ops[] = {
CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};

NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
Src.getValueType(), Ops),
0);
} else {
// the S_MOV is needed since the Lo could still be a VGPR16.
// With S_MOV, isel insert a "sgpr32 = copy vgpr16" and we reply on
// the fixvgpr2sgprcopy pass to legalize it
NewSrc = SDValue(
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
0);
}

return NewSrc;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
Expand Down Expand Up @@ -3412,8 +3446,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
// Really a scalar input. Just select from the low half of the register to
// avoid packing.

if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
if (VecSize == Lo.getValueSizeInBits()) {
Src = Lo;
} else if (VecSize == 32) {
Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
} else {
assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

Expand Down
114 changes: 114 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11

@const_half = internal constant half 1.0

define amdgpu_kernel void @fma_v2f16_divergent(
; GFX11-LABEL: name: fma_v2f16_divergent
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s32) from %ir.d.kernarg.offset, addrspace 4)
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.f.gep, addrspace 1)
; GFX11-NEXT: [[V_AND_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_AND_B16_t16_e64 0, 32767, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
; GFX11-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, -32768, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11-NEXT: S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def $scc
; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
; GFX11-NEXT: [[V_CNDMASK_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CNDMASK_B16_t16_e64 0, killed [[V_XOR_B16_t16_e64_]], 0, killed [[V_AND_B16_t16_e64_]], killed [[COPY10]], 0, implicit $exec
; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_CNDMASK_B16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16
; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
; GFX11-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[REG_SEQUENCE4]], 8, killed [[S_LOAD_DWORD_IMM1]], 8, killed [[S_LOAD_DWORD_IMM2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
ptr addrspace(1) %r,
ptr addrspace(1) %fptr,
ptr addrspace(1) %b,
ptr addrspace(1) %c,
i32 %d) {

%idx = call i32 @llvm.amdgcn.workitem.id.x() #1
%f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
%f = load half, ptr addrspace(1) %f.gep
%f.abs = call half @llvm.fabs.f16(half %f)
%f.neg = fneg half %f
%setcc = icmp ne i32 %d, 0
%select = select i1 %setcc, half %f.abs, half %f.neg
%vec = insertelement <2 x half> poison, half %select, i32 0
%a.val = insertelement <2 x half> %vec, half %select, i32 1
%b.v = load i32, ptr addrspace(1) %b
%b.val = bitcast i32 %b.v to <2 x half>
%c.v = load i32, ptr addrspace(1) %c
%c.val = bitcast i32 %c.v to <2 x half>
%r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}

define amdgpu_kernel void @fma_v2f16_uniform(
; GFX11-LABEL: name: fma_v2f16_uniform
; GFX11: bb.0 (%ir-block.0):
; GFX11-NEXT: liveins: $sgpr4_sgpr5
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1
; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16) from %ir.3, addrspace 1)
; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]]
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[COPY9]]
; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_LOAD_DWORD_IMM1]]
; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[S_MOV_B32_]], 8, killed [[S_LOAD_DWORD_IMM]], 8, [[COPY10]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
ptr addrspace(1) %c) {
%a.half = load half, ptr addrspace(1) %a
%vec = insertelement <2 x half> poison, half %a.half, i32 0
%a.val = insertelement <2 x half> %vec, half %a.half, i32 1
%b.v = load i32, ptr addrspace(1) %b
%b.val = bitcast i32 %b.v to <2 x half>
%c.v = load i32, ptr addrspace(1) %c
%c.val = bitcast i32 %c.v to <2 x half>
%r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}