@@ -69,14 +69,21 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
6969 return builtin::Memcpy<64 >::loop_and_tail (dst, src, count);
7070}
7171
72+ [[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch (Ptr __restrict dst,
73+ CPtr __restrict src,
74+ size_t distance) {
75+ prefetch_to_local_cache (src + distance);
76+ prefetch_for_write (dst + distance);
77+ }
78+
7279[[maybe_unused]] LIBC_INLINE void
7380inline_memcpy_x86_sse2_ge64_sw_prefetching (Ptr __restrict dst,
7481 CPtr __restrict src, size_t count) {
7582 using namespace LIBC_NAMESPACE ::x86;
76- prefetch_to_local_cache (src + K_ONE_CACHELINE);
83+ inline_memcpy_prefetch (dst, src, K_ONE_CACHELINE);
7784 if (count <= 128 )
7885 return builtin::Memcpy<64 >::head_tail (dst, src, count);
79- prefetch_to_local_cache (src + K_TWO_CACHELINES);
86+ inline_memcpy_prefetch (dst, src, K_TWO_CACHELINES);
8087 // Aligning 'dst' on a 32B boundary.
8188 builtin::Memcpy<32 >::block (dst, src);
8289 align_to_next_boundary<32 , Arg::Dst>(dst, src, count);
@@ -90,17 +97,17 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
9097 if (count < 352 ) {
9198 // Two cache lines at a time.
9299 while (offset + K_TWO_CACHELINES + 32 <= count) {
93- prefetch_to_local_cache (src + offset + K_ONE_CACHELINE);
94- prefetch_to_local_cache (src + offset + K_TWO_CACHELINES);
100+ inline_memcpy_prefetch (dst, src, offset + K_ONE_CACHELINE);
101+ inline_memcpy_prefetch (dst, src, offset + K_TWO_CACHELINES);
95102 builtin::Memcpy<K_TWO_CACHELINES>::block_offset (dst, src, offset);
96103 offset += K_TWO_CACHELINES;
97104 }
98105 } else {
99106 // Three cache lines at a time.
100107 while (offset + K_THREE_CACHELINES + 32 <= count) {
101- prefetch_to_local_cache (src + offset + K_ONE_CACHELINE);
102- prefetch_to_local_cache (src + offset + K_TWO_CACHELINES);
103- prefetch_to_local_cache (src + offset + K_THREE_CACHELINES);
108+ inline_memcpy_prefetch (dst, src, offset + K_ONE_CACHELINE);
109+ inline_memcpy_prefetch (dst, src, offset + K_TWO_CACHELINES);
110+ inline_memcpy_prefetch (dst, src, offset + K_THREE_CACHELINES);
104111 // It is likely that this copy will be turned into a 'rep;movsb' on
105112 // non-AVX machines.
106113 builtin::Memcpy<K_THREE_CACHELINES>::block_offset (dst, src, offset);
@@ -120,11 +127,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
120127inline_memcpy_x86_avx_ge64_sw_prefetching (Ptr __restrict dst,
121128 CPtr __restrict src, size_t count) {
122129 using namespace LIBC_NAMESPACE ::x86;
123- prefetch_to_local_cache (src + K_ONE_CACHELINE);
130+ inline_memcpy_prefetch (dst, src, K_ONE_CACHELINE);
124131 if (count <= 128 )
125132 return builtin::Memcpy<64 >::head_tail (dst, src, count);
126- prefetch_to_local_cache (src + K_TWO_CACHELINES);
127- prefetch_to_local_cache (src + K_THREE_CACHELINES);
133+ inline_memcpy_prefetch (dst, src, K_TWO_CACHELINES);
134+ inline_memcpy_prefetch (dst, src, K_THREE_CACHELINES);
128135 if (count < 256 )
129136 return builtin::Memcpy<128 >::head_tail (dst, src, count);
130137 // Aligning 'dst' on a 32B boundary.
@@ -139,9 +146,9 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
139146 // - count >= 128.
140147 while (offset + K_THREE_CACHELINES + 64 <= count) {
141148 // Three cache lines at a time.
142- prefetch_to_local_cache (src + offset + K_ONE_CACHELINE);
143- prefetch_to_local_cache (src + offset + K_TWO_CACHELINES);
144- prefetch_to_local_cache (src + offset + K_THREE_CACHELINES);
149+ inline_memcpy_prefetch (dst, src, offset + K_ONE_CACHELINE);
150+ inline_memcpy_prefetch (dst, src, offset + K_TWO_CACHELINES);
151+ inline_memcpy_prefetch (dst, src, offset + K_THREE_CACHELINES);
145152 builtin::Memcpy<K_THREE_CACHELINES>::block_offset (dst, src, offset);
146153 offset += K_THREE_CACHELINES;
147154 }
0 commit comments