Skip to content

Commit 0eaf675

Browse files
committed
[AMDGPU][InsertWaits] No wait for WAW for global/scratch_load
global_load and scratch_load instructions return their results in the order they are issued, so there is no need to insert an s_waitcnt for a WAW hazard between them. Reviewed By: foad. Differential Revision: https://reviews.llvm.org/D138476
1 parent 42ad9bf commit 0eaf675

File tree

6 files changed

+62
-22
lines changed

6 files changed

+62
-22
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,13 @@ enum VmemType {
141141
VMEM_BVH
142142
};
143143

144+
// Returns true when Inst is classified by SIInstrInfo as a VMEM instruction,
// a FLAT global instruction, or a FLAT scratch instruction.
// NOTE(review): judging by the name, these are presumably the memory
// instructions that are tracked by vmcnt alone; plain FLAT instructions are
// not matched by this predicate -- confirm against the counter-update logic.
static bool updateVMCntOnly(const MachineInstr &Inst) {
145+
return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
146+
SIInstrInfo::isFLATScratch(Inst);
147+
}
148+
144149
VmemType getVmemType(const MachineInstr &Inst) {
145-
assert(SIInstrInfo::isVMEM(Inst));
150+
assert(updateVMCntOnly(Inst));
146151
if (!SIInstrInfo::isMIMG(Inst))
147152
return VMEM_NOSAMPLER;
148153
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
@@ -683,7 +688,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
683688
if (T == VM_CNT) {
684689
if (Interval.first >= NUM_ALL_VGPRS)
685690
continue;
686-
if (SIInstrInfo::isVMEM(Inst)) {
691+
if (updateVMCntOnly(Inst)) {
687692
VmemType V = getVmemType(Inst);
688693
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
689694
VgprVmemTypes[RegNo] |= 1 << V;
@@ -1182,7 +1187,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
11821187
// previous write and this write are the same type of VMEM
11831188
// instruction, in which case they're guaranteed to write their
11841189
// results in order anyway.
1185-
if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1190+
if (Op.isUse() || !updateVMCntOnly(MI) ||
11861191
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
11871192
getVmemType(MI))) {
11881193
ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);

llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -476,9 +476,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
476476
; GFX9V3: ; %bb.0:
477477
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
478478
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
479-
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
480479
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
481-
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
482480
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
483481
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
484482
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
@@ -495,9 +493,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
495493
; GFX9V4: ; %bb.0:
496494
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
497495
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
498-
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
499496
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
500-
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
501497
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
502498
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
503499
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
@@ -515,9 +511,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
515511
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
516512
; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc
517513
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
518-
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
519514
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
520-
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
521515
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
522516
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
523517
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
534534
; GFX9-NEXT: .LBB3_2:
535535
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
536536
; GFX9-NEXT: .LBB3_3: ; %T
537-
; GFX9-NEXT: s_waitcnt vmcnt(0)
538537
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
539538
; GFX9-NEXT: s_waitcnt vmcnt(0)
540539
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
@@ -706,7 +705,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
706705
; GFX9-NEXT: .LBB4_2:
707706
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
708707
; GFX9-NEXT: .LBB4_3: ; %T
709-
; GFX9-NEXT: s_waitcnt vmcnt(0)
710708
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
711709
; GFX9-NEXT: s_waitcnt vmcnt(0)
712710
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
@@ -878,7 +876,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
878876
; GFX9-NEXT: .LBB5_2:
879877
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
880878
; GFX9-NEXT: .LBB5_3: ; %T
881-
; GFX9-NEXT: s_waitcnt vmcnt(0)
882879
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
883880
; GFX9-NEXT: s_waitcnt vmcnt(0)
884881
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc

llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -462,9 +462,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
462462
; GFX9V3: ; %bb.0:
463463
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
464464
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
465-
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
466465
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
467-
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
468466
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
469467
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
470468
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
@@ -481,9 +479,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
481479
; GFX9V4: ; %bb.0:
482480
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
483481
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
484-
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
485482
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
486-
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
487483
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
488484
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
489485
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
@@ -500,9 +496,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
500496
; GFX9V5: ; %bb.0:
501497
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
502498
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
503-
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
504499
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
505-
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
506500
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
507501
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
508502
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1667,8 +1667,8 @@ define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x hal
16671667
; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32
16681668
; GFX11-NEXT: s_waitcnt vmcnt(1)
16691669
; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 offset:16
1670-
; GFX11-NEXT: s_waitcnt vmcnt(0)
16711670
; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16
1671+
; GFX11-NEXT: s_waitcnt vmcnt(1)
16721672
; GFX11-NEXT: v_mov_b32_e32 v0, v2
16731673
; GFX11-NEXT: s_waitcnt vmcnt(0)
16741674
; GFX11-NEXT: v_mov_b32_e32 v2, v3
@@ -1771,7 +1771,6 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
17711771
; GFX9: ; %bb.0:
17721772
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17731773
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
1774-
; GFX9-NEXT: s_waitcnt vmcnt(0)
17751774
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
17761775
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
17771776
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
@@ -1786,7 +1785,6 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
17861785
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17871786
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
17881787
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
1789-
; GFX10-NEXT: s_waitcnt vmcnt(0)
17901788
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
17911789
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
17921790
; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
@@ -1800,7 +1798,6 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
18001798
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18011799
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
18021800
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1803-
; GFX11-NEXT: s_waitcnt vmcnt(0)
18041801
; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
18051802
; GFX11-NEXT: s_waitcnt vmcnt(0)
18061803
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100

llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,56 @@ body: |
7272
$vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
7373
$vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s128))
7474
...
75+
# (global_load + scratch_load + buffer_load)
76+
---
77+
name: global_scratch_buffer
78+
tracksRegLiveness: true
79+
body: |
80+
bb.0:
81+
liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
82+
; GFX9-LABEL: name: global_scratch_buffer
83+
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
84+
; GFX9-NEXT: {{ $}}
85+
; GFX9-NEXT: S_WAITCNT 0
86+
; GFX9-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
87+
; GFX9-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
88+
; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
89+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
90+
$vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
91+
$vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
92+
...
93+
# A WAW between a flat load and a buffer load should have a wait inserted between them.
94+
# (flat + buffer)
95+
---
96+
name: flat_buffer
97+
tracksRegLiveness: true
98+
body: |
99+
bb.0:
100+
liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
101+
; GFX9-LABEL: name: flat_buffer
102+
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
103+
; GFX9-NEXT: {{ $}}
104+
; GFX9-NEXT: S_WAITCNT 0
105+
; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
106+
; GFX9-NEXT: S_WAITCNT 49279
107+
; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
108+
$vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
109+
$vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
110+
...
111+
# buffer + flat
112+
---
113+
name: buffer_flat
114+
tracksRegLiveness: true
115+
body: |
116+
bb.0:
117+
liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
118+
; GFX9-LABEL: name: buffer_flat
119+
; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
120+
; GFX9-NEXT: {{ $}}
121+
; GFX9-NEXT: S_WAITCNT 0
122+
; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
123+
; GFX9-NEXT: S_WAITCNT 3952
124+
; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
125+
$vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
126+
$vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
127+
...

0 commit comments

Comments
 (0)