@@ -237,10 +237,24 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
237237; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
238238; GFX940: ; %bb.0:
239239; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240+ ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
241+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
242+ ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
243+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
244+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
245+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
246+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
240247; GFX940-NEXT: buffer_wbl2 sc1
241- ; GFX940-NEXT: global_atomic_pk_add_f16 v0 , v[0:1], v2 , off offset:1024 sc0
248+ ; GFX940-NEXT: global_atomic_cmpswap v3 , v[0:1], v[4:5] , off offset:1024 sc0
242249; GFX940-NEXT: s_waitcnt vmcnt(0)
243250; GFX940-NEXT: buffer_inv sc1
251+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
252+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
253+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
254+ ; GFX940-NEXT: s_cbranch_execnz .LBB17_1
255+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
256+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
257+ ; GFX940-NEXT: v_mov_b32_e32 v0, v3
244258; GFX940-NEXT: s_setpc_b64 s[30:31]
245259 %gep = getelementptr <2 x half >, ptr addrspace (1 ) %ptr , i32 256
246260 %result = atomicrmw fadd ptr addrspace (1 ) %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -251,10 +265,23 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
251265; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
252266; GFX940: ; %bb.0:
253267; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268+ ; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
269+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
270+ ; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
271+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
272+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
273+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
254274; GFX940-NEXT: buffer_wbl2 sc1
255- ; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2 , off offset:1024
275+ ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5] , off offset:1024 sc0
256276; GFX940-NEXT: s_waitcnt vmcnt(0)
257277; GFX940-NEXT: buffer_inv sc1
278+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
279+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
280+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
281+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
282+ ; GFX940-NEXT: s_cbranch_execnz .LBB18_1
283+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
284+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
258285; GFX940-NEXT: s_setpc_b64 s[30:31]
259286 %gep = getelementptr <2 x half >, ptr addrspace (1 ) %ptr , i32 256
260287 %unused = atomicrmw fadd ptr addrspace (1 ) %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -265,10 +292,24 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
265292; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
266293; GFX940: ; %bb.0:
267294; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295+ ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
296+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
297+ ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
298+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
299+ ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
300+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
301+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
268302; GFX940-NEXT: buffer_wbl2 sc1
269- ; GFX940-NEXT: flat_atomic_pk_add_f16 v0 , v[0:1], v2 offset:1024 sc0
303+ ; GFX940-NEXT: flat_atomic_cmpswap v3 , v[0:1], v[4:5] offset:1024 sc0
270304; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
271305; GFX940-NEXT: buffer_inv sc1
306+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
307+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
308+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
309+ ; GFX940-NEXT: s_cbranch_execnz .LBB19_1
310+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
311+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
312+ ; GFX940-NEXT: v_mov_b32_e32 v0, v3
272313; GFX940-NEXT: s_setpc_b64 s[30:31]
273314 %gep = getelementptr <2 x half >, ptr %ptr , i32 256
274315 %result = atomicrmw fadd ptr %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -279,10 +320,23 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
279320; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
280321; GFX940: ; %bb.0:
281322; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323+ ; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
324+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
325+ ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
326+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
327+ ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
328+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
282329; GFX940-NEXT: buffer_wbl2 sc1
283- ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
330+ ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
284331; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
285332; GFX940-NEXT: buffer_inv sc1
333+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
334+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
335+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
336+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
337+ ; GFX940-NEXT: s_cbranch_execnz .LBB20_1
338+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
339+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
286340; GFX940-NEXT: s_setpc_b64 s[30:31]
287341 %gep = getelementptr <2 x half >, ptr %ptr , i32 256
288342 %unused = atomicrmw fadd ptr %gep , <2 x half > %val syncscope("agent" ) seq_cst
0 commit comments