Skip to content

Conversation

@petar-avramovic
Copy link
Collaborator

No description provided.

Copy link
Collaborator Author

petar-avramovic commented Nov 17, 2025

@llvmbot
Copy link
Member

llvmbot commented Nov 17, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes

Patch is 21.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168411.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+15-2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+19)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll (+233)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll (+216)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 1765d054a3c0d..d719f3d40295d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -629,10 +629,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); assert(MRI.getType(Dst) == V2S16); - auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); - auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); unsigned Opc = MI.getOpcode(); auto Flags = MI.getFlags(); + + if (MI.getNumOperands() == 2) { + auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); + auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); + auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return; + } + + assert(MI.getNumOperands() == 3); + auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index b81a08de383d9..4051dc8495f6f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -951,6 +951,25 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); + // FNEG and FABS are either folded as source modifiers or can be selected 
as + // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for + // targets without SALU float we still select them as VGPR since there would + // be no real sgpr use. + addRulesForGOpcs({G_FNEG, G_FABS}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}}); + addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll new file mode 100644 index 0000000000000..093cdf744e3b4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define amdgpu_ps void @v_fabs_f16(half %in, ptr 
addrspace(1) %out) { +; GCN-LABEL: v_fabs_f16: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GCN-NEXT: global_store_b16 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fabs = call half @llvm.fabs.f16(half %in) + store half %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f16(half inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e64 v2, |s0|, |s0| +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_add_f16 s0, s0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call half @llvm.fabs.f16(half %in) + %fadd = fadd half %fabs, %fabs + store half %fadd, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fabs_f32(float %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fabs = call float @llvm.fabs.f32(float %in) + store float %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f32(float inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v2, |s0|, |s0| +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bitset0_b32 s0, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_add_f32 s0, s0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call float @llvm.fabs.f32(float %in) + %fadd = fadd float %fabs, %fabs + store float %fadd, ptr addrspace(1) %out + ret 
void +} + +define amdgpu_ps void @v_fabs_f64(double %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_f64: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm + %fabs = call double @llvm.fabs.f64(double %in) + store double %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f64(double inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[2:3], |s[0:1]|, |s[0:1]| +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e64 v[2:3], |s[0:1]|, |s[0:1]| +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fabs = call double @llvm.fabs.f64(double %in) + %fadd = fadd double %fabs, %fabs + store double %fadd, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fabs_v2f16(<2 x half> %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + store <2 x half> %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_v2f16(<2 x half> inreg %in, ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: s_fabs_v2f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0x7fff7fff +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_pk_add_f16 v2, s0, s0 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: s_fabs_v2f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e64 v2, 0x7fff7fff, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_pk_add_f16 v2, v2, v2 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: s_fabs_v2f16: +; 
GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_and_b32 s0, s0, 0x7fff7fff +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_pk_add_f16 v2, s0, s0 +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: s_fabs_v2f16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_lshr_b32 s1, s0, 16 +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX12-GISEL-NEXT: s_and_b32 s1, s1, 0x7fff +; GFX12-GISEL-NEXT: s_add_f16 s0, s0, s0 +; GFX12-GISEL-NEXT: s_add_f16 s1, s1, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-GISEL-NEXT: s_endpgm + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %fadd = fadd <2 x half> %fabs, %fabs + store <2 x half> %fadd, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fabs_v2f32(<2 x float> %in, ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: v_fabs_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_fabs_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_fabs_v2f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_fabs_v2f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; 
GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + store <2 x float> %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: s_fabs_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_add_f32_e64 v3, |s1|, |s1| +; GFX11-SDAG-NEXT: v_add_f32_e64 v2, |s0|, |s0| +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: s_fabs_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_add_f32_e64 v2, |s0|, |s0| +; GFX11-GISEL-NEXT: v_add_f32_e64 v3, |s1|, |s1| +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: s_fabs_v2f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_bitset0_b32 s0, 31 +; GFX12-SDAG-NEXT: s_bitset0_b32 s1, 31 +; GFX12-SDAG-NEXT: s_add_f32 s0, s0, s0 +; GFX12-SDAG-NEXT: s_add_f32 s1, s1, s1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: s_fabs_v2f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_bitset0_b32 s0, 31 +; GFX12-GISEL-NEXT: s_bitset0_b32 s1, 31 +; GFX12-GISEL-NEXT: s_add_f32 s0, s0, s0 +; GFX12-GISEL-NEXT: s_add_f32 s1, s1, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-GISEL-NEXT: s_endpgm + %fabs = call <2 x 
float> @llvm.fabs.v2f32(<2 x float> %in) + %fadd = fadd <2 x float> %fabs, %fabs + store <2 x float> %fadd, ptr addrspace(1) %out + ret void +} + +declare half @llvm.fabs.f16(half) +declare float @llvm.fabs.f32(float) +declare double @llvm.fabs.f64(double) +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll new file mode 100644 index 0000000000000..f837c62821951 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s + +define amdgpu_ps void @v_fneg_f16(half %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_f16: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: global_store_b16 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fneg = fneg half %in + store half %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f16(half inreg %in, half inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f16_e64 v2, -s0, s1 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg half %in + %fmul = fmul half %fneg, %val + store half %fmul, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_f32(float %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fneg = fneg float %in + store float %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f32(float inreg %in, float inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v2, -s0, s1 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg float %in + %fmul = fmul float %fneg, %val + store float %fmul, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_f64(double %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_f64: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm + %fneg = fneg double %in + store double %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f64(double inreg %in, double inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f64 v[2:3], -s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mul_f64_e64 v[2:3], -s[0:1], s[2:3] +; GFX12-NEXT: 
global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fneg = fneg double %in + %fmul = fmul double %fneg, %val + store double %fmul, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_v2f16(<2 x half> %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fneg = fneg <2 x half> %in + store <2 x half> %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_v2f16(<2 x half> inreg %in, <2 x half> inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_mul_f16 v2, s0, s1 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: s_fneg_v2f16: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_pk_mul_f16 v2, s0, s1 neg_lo:[1,0] neg_hi:[1,0] +; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: s_fneg_v2f16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX12-GISEL-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX12-GISEL-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-GISEL-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-GISEL-NEXT: s_mul_f16 s1, s2, s3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-GISEL-NEXT: s_endpgm + %fneg = fneg <2 x half> %in + %fmul = fmul <2 x half> %fneg, %val + store <2 x half> %fmul, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_v2f32(<2 x float> %in, ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: v_fneg_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX11-SDAG-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; 
GFX11-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_fneg_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX11-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: v_fneg_v2f32: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX12-SDAG-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX12-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: v_fneg_v2f32: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX12-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX12-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX12-GISEL-NEXT: s_endpgm + %fneg = fneg <2 x float> %in + store <2 x float> %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, <2 x float> inreg %val, ptr addrspace(1) %out) { +; GFX11-SDAG-LABEL: s_fneg_v2f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: v_mul_f32_e64 v3, -s1, s3 +; GFX11-SDAG-NEXT: v_mul_f32_e64 v2, -s0, s2 +; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: s_fneg_v2f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: v_mul_f32_e64 v2, -s0, s2 +; GFX11-GISEL-NEXT: v_mul_f32_e64 v3, -s1, s3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(N... [truncated] 
@github-actions
Copy link

github-actions bot commented Nov 17, 2025

🐧 Linux x64 Test Results

  • 186295 tests passed
  • 4849 tests skipped
%fmul = fmul <2 x float> %fneg, %val
store <2 x float> %fmul, ptr addrspace(1) %out
ret void
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also test the fneg + fabs case?

store <2 x float> %fabs, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is somewhat misleading since it's not a standalone scalar fabs. It doesn't really count if it folds into a source modifier of a VALU instruction?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right — switched to "used by" G_SELECT, since that one is available on SALU for all the types. Added a version that folds readanylanes, and a salu_use version that requires readanylane, at least for now.

unsigned Opc = MI.getOpcode();
auto Flags = MI.getFlags();

if (MI.getNumOperands() == 2) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this here? These don't require splitting to handle?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is for unary opcodes; the old code below was for binary opcodes.

@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/readanylane-combine branch from 3590a6e to 9e70882 Compare November 18, 2025 13:59
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/fabs-fneg branch from 529b6f2 to 73f2bf8 Compare November 18, 2025 13:59
Comment on lines +305 to +334
define amdgpu_ps void @v_fabs_fneg_f32(float %in, ptr addrspace(1) %out) {
; GCN-LABEL: v_fabs_fneg_f32:
; GCN: ; %bb.0:
; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: global_store_b32 v[1:2], v0, off
; GCN-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
%fneg = fneg float %fabs
store float %fneg, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @s_fabs_fneg_f32(float inreg %in, ptr addrspace(1) %out) {
; GFX11-LABEL: s_fabs_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_or_b32_e64 v2, 0x80000000, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_fabs_fneg_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_bitset1_b32 s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
%fneg = fneg float %fabs
store float %fneg, ptr addrspace(1) %out
ret void
}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some fneg + fabs tests — what exactly are we trying to test here?

@chinmaydd
Copy link
Contributor

Support for G_STRICT_FADD/SUB/MUL is blocked by G_FABS and G_FNEG. I would like to see this merged if possible. Thanks!

Base automatically changed from users/petar-avramovic/readanylane-combine to main November 24, 2025 14:57
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/fabs-fneg branch from 73f2bf8 to 242f867 Compare November 24, 2025 15:01
Copy link
Collaborator Author

petar-avramovic commented Nov 24, 2025

Merge activity

  • Nov 24, 3:34 PM UTC: A user started a stack merge that includes this pull request via Graphite.
  • Nov 24, 3:35 PM UTC: @petar-avramovic merged this pull request with Graphite.
@petar-avramovic petar-avramovic merged commit f4ba8e3 into main Nov 24, 2025
9 of 10 checks passed
@petar-avramovic petar-avramovic deleted the users/petar-avramovic/fabs-fneg branch November 24, 2025 15:35
@llvm-ci
Copy link
Collaborator

llvm-ci commented Nov 24, 2025

LLVM Buildbot has detected a new failure on builder mlir-nvidia-gcc7 running on mlir-nvidia while building llvm at step 7 "test-build-check-mlir-build-only-check-mlir".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/21368

Here is the relevant piece of the build log for the reference
Step 7 (test-build-check-mlir-build-only-check-mlir) failure: test (failure) ******************** TEST 'MLIR :: Integration/GPU/CUDA/async.mlir' FAILED ******************** Exit Code: 1 Command Output (stdout): -- # RUN: at line 1 /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=fatbin" | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0 | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-kernel-outlining # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt 
'-pass-pipeline=builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary=format=fatbin # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/mlir-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0 # .---command stderr------------ # | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuEventSynchronize(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED' # `----------------------------- # executed command: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.obj/bin/FileCheck 
/vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir # .---command stderr------------ # | /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir:68:12: error: CHECK: expected string not found in input # | // CHECK: [84, 84] # | ^ # | <stdin>:1:1: note: scanning from here # | Unranked Memref base@ = 0x5b761c9ad300 rank = 1 offset = 0 sizes = [2] strides = [1] data = # | ^ # | <stdin>:2:1: note: possible intended match here # | [42, 42] # | ^ # | # | Input file: <stdin> # | Check file: /vol/worker/mlir-nvidia/mlir-nvidia-gcc7/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir # | # | -dump-input=help explains the following input dump. # | # | Input was: # | <<<<<< # | 1: Unranked Memref base@ = 0x5b761c9ad300 rank = 1 offset = 0 sizes = [2] strides = [1] data = # | check:68'0 X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found # | 2: [42, 42] # | check:68'0 ~~~~~~~~~ # | check:68'1 ? possible intended match ... 
aadeshps-mcw pushed a commit to aadeshps-mcw/llvm-project that referenced this pull request Nov 26, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment