@@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
237237; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
238238; GFX940: ; %bb.0:
239239; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240- ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
241- ; GFX940-NEXT: s_mov_b64 s[0:1], 0
242- ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
243- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
244- ; GFX940-NEXT: s_waitcnt vmcnt(0)
245- ; GFX940-NEXT: v_mov_b32_e32 v5, v3
246- ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
247240; GFX940-NEXT: buffer_wbl2 sc1
248- ; GFX940-NEXT: global_atomic_cmpswap v3 , v[0:1], v[4:5] , off offset:1024 sc0
241+ ; GFX940-NEXT: global_atomic_pk_add_f16 v0 , v[0:1], v2 , off offset:1024 sc0
249242; GFX940-NEXT: s_waitcnt vmcnt(0)
250243; GFX940-NEXT: buffer_inv sc1
251- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
252- ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
253- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
254- ; GFX940-NEXT: s_cbranch_execnz .LBB17_1
255- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
256- ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
257- ; GFX940-NEXT: v_mov_b32_e32 v0, v3
258244; GFX940-NEXT: s_setpc_b64 s[30:31]
259245 %gep = getelementptr <2 x half >, ptr addrspace (1 ) %ptr , i32 256
260246 %result = atomicrmw fadd ptr addrspace (1 ) %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
265251; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
266252; GFX940: ; %bb.0:
267253; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268- ; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
269- ; GFX940-NEXT: s_mov_b64 s[0:1], 0
270- ; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
271- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
272- ; GFX940-NEXT: s_waitcnt vmcnt(0)
273- ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
274254; GFX940-NEXT: buffer_wbl2 sc1
275- ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5] , off offset:1024 sc0
255+ ; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2 , off offset:1024
276256; GFX940-NEXT: s_waitcnt vmcnt(0)
277257; GFX940-NEXT: buffer_inv sc1
278- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
279- ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
280- ; GFX940-NEXT: v_mov_b32_e32 v5, v3
281- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
282- ; GFX940-NEXT: s_cbranch_execnz .LBB18_1
283- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
284- ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
285258; GFX940-NEXT: s_setpc_b64 s[30:31]
286259 %gep = getelementptr <2 x half >, ptr addrspace (1 ) %ptr , i32 256
287260 %unused = atomicrmw fadd ptr addrspace (1 ) %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
292265; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
293266; GFX940: ; %bb.0:
294267; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295- ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
296- ; GFX940-NEXT: s_mov_b64 s[0:1], 0
297- ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
298- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
299- ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
300- ; GFX940-NEXT: v_mov_b32_e32 v5, v3
301- ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
302268; GFX940-NEXT: buffer_wbl2 sc1
303- ; GFX940-NEXT: flat_atomic_cmpswap v3 , v[0:1], v[4:5] offset:1024 sc0
269+ ; GFX940-NEXT: flat_atomic_pk_add_f16 v0 , v[0:1], v2 offset:1024 sc0
304270; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
305271; GFX940-NEXT: buffer_inv sc1
306- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
307- ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
308- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
309- ; GFX940-NEXT: s_cbranch_execnz .LBB19_1
310- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
311- ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
312- ; GFX940-NEXT: v_mov_b32_e32 v0, v3
313272; GFX940-NEXT: s_setpc_b64 s[30:31]
314273 %gep = getelementptr <2 x half >, ptr %ptr , i32 256
315274 %result = atomicrmw fadd ptr %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
320279; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
321280; GFX940: ; %bb.0:
322281; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323- ; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
324- ; GFX940-NEXT: s_mov_b64 s[0:1], 0
325- ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
326- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
327- ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
328- ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
329282; GFX940-NEXT: buffer_wbl2 sc1
330- ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
283+ ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
331284; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
332285; GFX940-NEXT: buffer_inv sc1
333- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
334- ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
335- ; GFX940-NEXT: v_mov_b32_e32 v5, v3
336- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
337- ; GFX940-NEXT: s_cbranch_execnz .LBB20_1
338- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
339- ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
340286; GFX940-NEXT: s_setpc_b64 s[30:31]
341287 %gep = getelementptr <2 x half >, ptr %ptr , i32 256
342288 %unused = atomicrmw fadd ptr %gep , <2 x half > %val syncscope("agent" ) seq_cst
0 commit comments