@@ -684,5 +684,170 @@ entry:
684684 ret void
685685}
686686
; Test kernel: reads one i32 through %in using a load that is marked both
; volatile and !nontemporal, then writes the loaded value through %out with an
; ordinary (non-volatile, non-annotated) store.
;
; The assertion comments placed between the opening parenthesis and the
; parameter list give, for each check prefix, the exact instruction sequence
; the test expects. In every variant shown, the flat load is immediately
; followed by a wait on the load counter before the store address is set up,
; and the load carries extra cache-policy operands (glc, glc dlc, sc0 sc1, or
; th/scope, depending on the prefix).
; NOTE(review): these assertion lines look machine-generated; presumably they
; should be regenerated by the project's update script rather than edited by
; hand -- confirm before touching them.
687+ define amdgpu_kernel void @flat_nontemporal_volatile_load (
688+ ; GFX7-LABEL: flat_nontemporal_volatile_load:
689+ ; GFX7: ; %bb.0: ; %entry
690+ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
691+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
692+ ; GFX7-NEXT: v_mov_b32_e32 v0, s0
693+ ; GFX7-NEXT: v_mov_b32_e32 v1, s1
694+ ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
695+ ; GFX7-NEXT: s_waitcnt vmcnt(0)
696+ ; GFX7-NEXT: v_mov_b32_e32 v0, s2
697+ ; GFX7-NEXT: v_mov_b32_e32 v1, s3
698+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
699+ ; GFX7-NEXT: flat_store_dword v[0:1], v2
700+ ; GFX7-NEXT: s_endpgm
701+ ;
702+ ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
703+ ; GFX10-WGP: ; %bb.0: ; %entry
704+ ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
705+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
706+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
707+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
708+ ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
709+ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
710+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
711+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
712+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
713+ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
714+ ; GFX10-WGP-NEXT: s_endpgm
715+ ;
716+ ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
717+ ; GFX10-CU: ; %bb.0: ; %entry
718+ ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
719+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
720+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
721+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
722+ ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
723+ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
724+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
725+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
726+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
727+ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
728+ ; GFX10-CU-NEXT: s_endpgm
729+ ;
730+ ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
731+ ; SKIP-CACHE-INV: ; %bb.0: ; %entry
732+ ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
733+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
734+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
735+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
736+ ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
737+ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
738+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
739+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
740+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
741+ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
742+ ; SKIP-CACHE-INV-NEXT: s_endpgm
743+ ;
744+ ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
745+ ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
746+ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
747+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
748+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
749+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
750+ ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
751+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
752+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
753+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
754+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
755+ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
756+ ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
757+ ;
758+ ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
759+ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
760+ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
761+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
762+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
763+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
764+ ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
765+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
766+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
767+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
768+ ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
769+ ; GFX90A-TGSPLIT-NEXT: s_endpgm
770+ ;
771+ ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
772+ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
773+ ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
774+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
775+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
776+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
777+ ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
778+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
779+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
780+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
781+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
782+ ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
783+ ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
784+ ;
785+ ; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
786+ ; GFX940-TGSPLIT: ; %bb.0: ; %entry
787+ ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
788+ ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
789+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
790+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
791+ ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
792+ ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
793+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
794+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
795+ ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
796+ ; GFX940-TGSPLIT-NEXT: s_endpgm
797+ ;
798+ ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
799+ ; GFX11-WGP: ; %bb.0: ; %entry
800+ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
801+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
802+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
803+ ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
804+ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
805+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
806+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
807+ ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
808+ ; GFX11-WGP-NEXT: s_endpgm
809+ ;
810+ ; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
811+ ; GFX11-CU: ; %bb.0: ; %entry
812+ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
813+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
814+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
815+ ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
816+ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
817+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
818+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
819+ ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
820+ ; GFX11-CU-NEXT: s_endpgm
821+ ;
822+ ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
823+ ; GFX12-WGP: ; %bb.0: ; %entry
824+ ; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
825+ ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
826+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
827+ ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
828+ ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
829+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
830+ ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
831+ ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
832+ ; GFX12-WGP-NEXT: s_endpgm
833+ ;
834+ ; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
835+ ; GFX12-CU: ; %bb.0: ; %entry
836+ ; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
837+ ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
838+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
839+ ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
840+ ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
841+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
842+ ; GFX12-CU-NEXT: s_wait_dscnt 0x0
843+ ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
844+ ; GFX12-CU-NEXT: s_endpgm
; Parameters: %in is the pointer that is read, %out the pointer that is written.
845+ ptr %in , ptr %out ) {
846+ entry:
; The operation under test: a 4-byte-aligned i32 load that is volatile and also
; tagged with the !nontemporal metadata node !0.
847+ %val = load volatile i32 , ptr %in , align 4 , !nontemporal !0
; Plain store of the loaded value; it carries neither volatile nor !nontemporal.
848+ store i32 %val , ptr %out
849+ ret void
850+ }
851+
687852!0 = !{i32 1 }
688853declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments