[AMDGPU] Add scheduling DAG mutation for hazard latencies #170075
Conversation
Improve waitcnt merging in ML kernel loops by increasing latencies on VALU writes to SGPRs. Specifically, this helps the case of a V_CMP output feeding V_CNDMASK instructions.
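The targeted pattern can be sketched as follows. This is an illustrative wave64 sequence only; the registers and exact opcodes are hypothetical, chosen to show the shape of the dependency rather than taken from the patch:

```asm
; A VALU instruction (v_cmp) writes a 64-bit lane mask into an SGPR pair,
; and a later VALU instruction (v_cndmask) reads that mask as an operand.
v_cmp_lt_f32_e64 s[0:1], v0, v1       ; VALU write to SGPRs s[0:1]
v_cndmask_b32_e64 v2, v3, v4, s[0:1]  ; VALU read of the same SGPR mask
```

On subtargets with the VALU mask-write hazard, the read may require an extra wait (e.g. an s_waitcnt_depctr). Tripling the modeled latency of that edge (the `MaskLatencyBoost = 3` factor, gated on `hasVALUMaskWriteHazard()` and `isWave64()` in the patch below) encourages the scheduler to separate the two instructions, so required waits can be merged or hidden behind independent work.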
@llvm/pr-subscribers-backend-amdgpu

Author: Carl Ritson (perlfu)

Changes

Improve waitcnt merging in ML kernel loops by increasing latencies on VALU writes to SGPRs.

Patch is 57.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170075.diff

9 Files Affected:
```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
new file mode 100644
index 0000000000000..2257154d68543
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.cpp
@@ -0,0 +1,78 @@
+//===--- AMDGPUHazardLatency.cpp - AMDGPU Hazard Latency Adjustment -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to adjust the
+/// latency of data edges between instructions which use registers
+/// potentially subject to additional hazard waits not accounted
+/// for in the normal scheduling model.
+/// While the scheduling model is typically still accurate in these
+/// scenarios, adjusting latency of relevant edges can improve wait
+/// merging and reduce pipeline impact of any required waits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHazardLatency.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+using namespace llvm;
+
+namespace {
+
+class HazardLatency : public ScheduleDAGMutation {
+private:
+  const GCNSubtarget *ST;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+
+public:
+  HazardLatency(MachineFunction *MF) {
+    ST = &MF->getSubtarget<GCNSubtarget>();
+    TRI = ST->getRegisterInfo();
+    MRI = &MF->getRegInfo();
+  }
+  void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+void HazardLatency::apply(ScheduleDAGInstrs *DAG) {
+  constexpr unsigned MaskLatencyBoost = 3;
+
+  if (!ST->hasVALUMaskWriteHazard() || !ST->isWave64())
+    return;
+
+  for (SUnit &SU : DAG->SUnits) {
+    const MachineInstr *MI = SU.getInstr();
+    if (!SIInstrInfo::isVALU(*MI))
+      continue;
+    if (MI->getOpcode() == AMDGPU::V_READLANE_B32 ||
+        MI->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
+      continue;
+    for (SDep &SuccDep : SU.Succs) {
+      if (SuccDep.isCtrl())
+        continue;
+      // Boost latency on VALU writes to SGPRs used by VALUs.
+      // Reduce risk of premature VALU pipeline stall on associated reads.
+      MachineInstr *DestMI = SuccDep.getSUnit()->getInstr();
+      if (!SIInstrInfo::isVALU(*DestMI))
+        continue;
+      Register Reg = SuccDep.getReg();
+      if (!TRI->isSGPRReg(*MRI, Reg))
+        continue;
+      SuccDep.setLatency(SuccDep.getLatency() * MaskLatencyBoost);
+    }
+  }
+}
+
+} // end namespace
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF) {
+  return std::make_unique<HazardLatency>(MF);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h
new file mode 100644
index 0000000000000..134cc27743cd1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHazardLatency.h
@@ -0,0 +1,24 @@
+//===- AMDGPUHazardLatency.h - Hazard Latency Adjustment --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
+
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include <memory>
+
+namespace llvm {
+
+class MachineFunction;
+
+std::unique_ptr<ScheduleDAGMutation>
+createAMDGPUHazardLatencyDAGMutation(MachineFunction *MF);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHAZARDLATENCY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e5a35abe6da6b..5c3798c3f2309 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,6 +21,7 @@
 #include "AMDGPUCtorDtorLowering.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUExportKernelRuntimeHandles.h"
+#include "AMDGPUHazardLatency.h"
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPULowerVGPREncoding.h"
@@ -648,6 +649,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
   return DAG;
 }
 
@@ -669,6 +671,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
   return DAG;
 }
 
@@ -1210,6 +1213,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
   DAG->addMutation(createVOPDPairingMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation(C->MF));
+  DAG->addMutation(createAMDGPUHazardLatencyDAGMutation(C->MF));
   return DAG;
 }
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 4baae51e021c5..583ec8d7898e8 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHazardLatency.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 6846137272ec6..aa25294ba17b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -820,8 +820,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    s_and_b32 s8, 1, s8
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s4
-; GFX11_W64-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
+; GFX11_W64-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 12cb8d2f6fb51..66d934b0170f4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2596,11 +2596,11 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164_ITERATIVE-NEXT:    v_add_co_u32 v0, vcc, s2, v0
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, v1, vcc
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
@@ -3143,15 +3143,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP:       ; %bb.0: ; %entry
 ; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
@@ -5853,9 +5853,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    v_subrev_co_ci_u32_e64 v6, null, 0, v8, vcc
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v7
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX1164-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    buffer_gl1_inv
@@ -5876,11 +5876,11 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v4
 ; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v4
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    v_sub_co_ci_u32_e64 v1, null, s3, v1, vcc
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1164-NEXT:    s_endpgm
 ;
@@ -6381,9 +6381,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    v_subrev_co_ci_u32_e64 v6, null, s15, v8, vcc
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v7
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, v8
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX1164-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    buffer_gl1_inv
@@ -6981,9 +6981,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    v_subrev_co_ci_u32_e64 v7, null, s9, v9, vcc
 ; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v2, v8
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v3, v9
+; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX1164_ITERATIVE-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX1164_ITERATIVE-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], off, s[4:7], 0 glc
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164_ITERATIVE-NEXT:    buffer_gl1_inv
@@ -7003,10 +7003,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1164_ITERATIVE-NEXT:    v_sub_co_u32 v0, vcc, s2, v4
-; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    v_sub_co_ci_u32_e64 v1, null, s3, v5, vcc
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
 ; GFX1164_ITERATIVE-NEXT:    s_endpgm
 ;
@@ -7665,15 +7664,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164_DPP:       ; %bb.0: ; %entry
 ; GFX1164_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[0:1]
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164_DPP-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
 ; GFX1164_DPP-NEXT:    v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164_DPP-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
@@ -12767,10 +12766,10 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX1164-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
@@ -12825,10 +12824,10 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX1164-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
 ; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
 ; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13812,22 +13811,23 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v0, s11, v0
 ; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v2, s10, v2
+; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX1164-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1164-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
 ; GFX1164-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1164-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vcc(0)
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
-; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.h
 ; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
@@ -13872,22 +13872,23 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v0, s12, v0
 ; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v2, s13, v2
+; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX1164-FAKE16-NEXT:    s_waitcnt_depctr depctr_va_sdst(0)
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1164-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
 ; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
 ; GFX1164-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1164-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1164-FAKE16-NEXT:    s_waitcnt_depctr depctr_va_sdst(0)
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
 ; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
 ; GFX1164-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
 ; GFX1...
```
[truncated]
@llvm/pr-subscribers-llvm-globalisel

Author: Carl Ritson (perlfu)

(Same changes summary and truncated diff as the @llvm/pr-subscribers-backend-amdgpu comment above.)
This is intended to complement #169213, but is applicable on its own.