 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// The functions defined in this file give approximate code size. These sizes
+// assume the following configuration options:
+// - LIBC_CONF_KEEP_FRAME_POINTER = false
+// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
+// - LIBC_ADD_NULL_CHECKS = false
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H

 #include "src/__support/macros/attributes.h"   // LIBC_INLINE
 #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
+#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
 #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align

 #include <stddef.h> // size_t

-// https://libc.llvm.org/compiler_support.html
-// Support for [[likely]] / [[unlikely]]
-//   [X] GCC 12.2
-//   [X] Clang 12
-//   [ ] Clang 11
-#define LIBC_ATTR_LIKELY [[likely]]
-#define LIBC_ATTR_UNLIKELY [[unlikely]]
-
-#if defined(LIBC_COMPILER_IS_CLANG)
-#if LIBC_COMPILER_CLANG_VER < 1200
-#undef LIBC_ATTR_LIKELY
-#undef LIBC_ATTR_UNLIKELY
-#define LIBC_ATTR_LIKELY
-#define LIBC_ATTR_UNLIKELY
-#endif
-#endif
-
 namespace LIBC_NAMESPACE_DECL {

 namespace {

-LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
-
-enum Strategy {
-  ForceWordLdStChain,
-  AssumeWordAligned,
-  AssumeUnaligned,
-};
+// Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
+// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is
+// free to use whatever instruction is best for the size and assumed access.
+template <size_t bytes, AssumeAccess access>
+LIBC_INLINE void copy(void *dst, const void *src) {
+  if constexpr (access == AssumeAccess::kAligned) {
+    constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
+    memcpy_inline<bytes>(assume_aligned<alignment>(dst),
+                         assume_aligned<alignment>(src));
+  } else if constexpr (access == AssumeAccess::kUnknown) {
+    memcpy_inline<bytes>(dst, src);
+  } else {
+    static_assert(false);
+  }
+}
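+// For instance, `copy<kWordSize, AssumeAccess::kAligned>(dst, src)` (used
+// further below) lets the compiler assume word alignment and can lower to a
+// single LDR / STR pair.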

-template <size_t bytes, Strategy strategy = AssumeUnaligned>
-LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
-  if constexpr (strategy == AssumeUnaligned) {
-    memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
-  } else if constexpr (strategy == AssumeWordAligned) {
-    static_assert(bytes >= kWordSize);
-    memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
-                         assume_aligned<kWordSize>(src));
-  } else if constexpr (strategy == ForceWordLdStChain) {
+template <size_t bytes, BlockOp block_op = BlockOp::kFull,
+          AssumeAccess access = AssumeAccess::kUnknown>
+LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
+  if constexpr (block_op == BlockOp::kFull) {
+    copy<bytes, access>(dst, src);
+  } else if constexpr (block_op == BlockOp::kByWord) {
     // We restrict loads/stores to 4 byte to prevent the use of load/store
-    // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
-    // fault (see notes below) and second, they use more registers which in turn
-    // adds push/pop instructions in the hot path.
+    // multiple (LDM, STM) and load/store double (LDRD, STRD).
     static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
     LIBC_LOOP_UNROLL
-    for (size_t i = 0; i < bytes / kWordSize; ++i) {
-      const size_t offset = i * kWordSize;
-      memcpy_inline<kWordSize>(dst + offset, src + offset);
+    for (size_t offset = 0; offset < bytes; offset += kWordSize) {
+      copy<kWordSize, access>(dst + offset, src + offset);
     }
+  } else {
+    static_assert(false, "Invalid BlockOp");
   }
   // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
   // into the load/store instructions.
@@ -72,39 +65,27 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
   src += bytes;
 }

-LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
-                                              const size_t size) {
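+// Copies as many `bytes`-sized blocks as `size` permits, bumping `dst` and
+// `src`, and leaves the remainder in `size`.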
+template <size_t bytes, BlockOp block_op, AssumeAccess access>
+LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
   LIBC_LOOP_NOUNROLL
-  for (size_t i = 0; i < size; ++i)
-    *dst++ = *src++;
+  for (size_t i = 0; i < size / bytes; ++i)
+    copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
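+  // Update `size` once at the end instead of once per iteration.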
+  size %= bytes;
 }

-template <size_t block_size, Strategy strategy>
-LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
-                                             size_t &size) {
+[[maybe_unused]] LIBC_INLINE void
+copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
   LIBC_LOOP_NOUNROLL
-  for (size_t i = 0; i < size / block_size; ++i)
-    copy_and_bump_pointers<block_size, strategy>(dst, src);
-  // Update `size` once at the end instead of once per iteration.
-  size %= block_size;
-}
-
-LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
-  return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
-                             cpp::bit_cast<uintptr_t>(b));
-}
-
-LIBC_INLINE auto misaligned(CPtr a) {
-  return distance_to_align_down<kWordSize>(a);
+  for (size_t i = 0; i < size; ++i)
+    *dst++ = *src++;
 }

 } // namespace

-// Implementation for Cortex-M0, M0+, M1.
-// Notes:
-//   - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
-//     that also needs to return the `dst` ptr.
-//   - These cores do not allow for unaligned loads/stores.
+// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
+// loads/stores. It compiles down to 208 bytes when used through `memcpy` that
+// also needs to return the `dst` ptr.
+// Note:
 //   - When `src` and `dst` are coaligned, we start by aligning them and perform
 //     bulk copies. We let the compiler know the pointers are aligned so it can
 //     use load/store multiple (LDM, STM). This significantly increase throughput
@@ -125,9 +106,18 @@ LIBC_INLINE auto misaligned(CPtr a) {
     if (src_alignment == 0)
       LIBC_ATTR_LIKELY {
         // Both `src` and `dst` are now word-aligned.
-        copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
-        copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
-        copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
+        // We first copy by blocks of 64 bytes, the compiler will use 4
+        // load/store multiple (LDM, STM), each of 4 words. This requires more
+        // registers so additional push/pop are needed but the speedup is worth
+        // it.
+        consume_by_block<64, BlockOp::kFull, AssumeAccess::kAligned>(dst, src,
+                                                                     size);
+        // Then we use blocks of 4 word load/store.
+        consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
+                                                                       size);
+        // Then we use word by word copy.
+        consume_by_block<4, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
+                                                                      size);
       }
     else {
       // `dst` is aligned but `src` is not.
@@ -138,7 +128,7 @@ LIBC_INLINE auto misaligned(CPtr a) {
             src_alignment == 2
                 ? load_aligned<uint32_t, uint16_t, uint16_t>(src)
                 : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
-        memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
+        copy<kWordSize, AssumeAccess::kAligned>(dst, &value);
         dst += kWordSize;
         src += kWordSize;
         size -= kWordSize;
@@ -151,56 +141,69 @@ LIBC_INLINE auto misaligned(CPtr a) {
 }

 // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
-// support for unaligned loads and stores.
-// Notes:
-//   - It compiles down to 266 bytes.
-//   - `dst` and `src` are not `__restrict` to prevent the compiler from
-//     reordering loads/stores.
-//   - We keep state variables to a strict minimum to keep everything in the free
-//     registers and prevent costly push / pop.
-//   - If unaligned single loads/stores to normal memory are supported, unaligned
-//     accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
-//     STRD) instructions are generally not supported and will still fault so we
-//     make sure to restrict unrolling to word loads/stores.
+// support for unaligned loads and stores. It compiles down to 272 bytes when
+// used through `memcpy` that also needs to return the `dst` ptr.
 [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
                                                             size_t size) {
   if (misaligned(bitwise_or(src, dst)))
     LIBC_ATTR_UNLIKELY {
       if (size < 8)
         LIBC_ATTR_UNLIKELY {
           if (size & 1)
-            copy_and_bump_pointers<1>(dst, src);
+            copy_block_and_bump_pointers<1>(dst, src);
           if (size & 2)
-            copy_and_bump_pointers<2>(dst, src);
+            copy_block_and_bump_pointers<2>(dst, src);
           if (size & 4)
-            copy_and_bump_pointers<4>(dst, src);
+            copy_block_and_bump_pointers<4>(dst, src);
           return;
         }
       if (misaligned(src))
         LIBC_ATTR_UNLIKELY {
           const size_t offset = distance_to_align_up<kWordSize>(dst);
           if (offset & 1)
-            copy_and_bump_pointers<1>(dst, src);
+            copy_block_and_bump_pointers<1>(dst, src);
           if (offset & 2)
-            copy_and_bump_pointers<2>(dst, src);
+            copy_block_and_bump_pointers<2>(dst, src);
           size -= offset;
         }
     }
-  copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
-  copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
-  copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
+  // `dst` and `src` are not necessarily both aligned at that point but this
+  // implementation assumes hardware support for unaligned loads and stores so
+  // it is still fast to perform unrolled word by word copy. Note that wider
+  // accesses through the use of load/store multiple (LDM, STM) and load/store
+  // double (LDRD, STRD) instructions are generally not supported and can fault.
+  // By forcing decomposition of 64 bytes copy into word by word copy, the
+  // compiler can use the first load to prefetch memory:
+  //   ldr r3, [r1, #64]!  <- prefetch next cache line
+  //   str r3, [r0]
+  //   ldr r3, [r1, #0x4]
+  //   str r3, [r0, #0x4]
+  //   ...
+  //   ldr r3, [r1, #0x3c]
+  //   str r3, [r0, #0x3c]
+  // This is a bit detrimental for sizes between 64 and 256 (less than 10%
+  // penalty) but the prefetch yields better throughput for larger copies.
+  consume_by_block<64, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
+                                                                 size);
+  consume_by_block<16, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
+                                                                 size);
+  consume_by_block<4, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, size);
   if (size & 1)
-    copy_and_bump_pointers<1>(dst, src);
+    copy_block_and_bump_pointers<1>(dst, src);
   if (size & 2)
-    LIBC_ATTR_UNLIKELY
-      copy_and_bump_pointers<2>(dst, src);
+    copy_block_and_bump_pointers<2>(dst, src);
 }

-[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
-                                                    const void *__restrict src_,
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
                                                     size_t size) {
-  Ptr dst = cpp::bit_cast<Ptr>(dst_);
-  CPtr src = cpp::bit_cast<CPtr>(src_);
+  // The compiler performs alias analysis and is able to prove that `dst` and
+  // `src` do not alias by propagating the `__restrict` keyword from the
+  // `memcpy` prototype. This allows the compiler to merge consecutive
+  // load/store (LDR, STR) instructions generated in
+  // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store
+  // double (LDRD, STRD) instructions, which is undesirable, so we prevent the
+  // compiler from inferring `__restrict` with the following line.
+  asm volatile("" : "+r"(dst), "+r"(src));
 #ifdef __ARM_FEATURE_UNALIGNED
   return inline_memcpy_arm_mid_end(dst, src, size);
 #else
@@ -210,8 +213,4 @@ LIBC_INLINE auto misaligned(CPtr a) {

 } // namespace LIBC_NAMESPACE_DECL

-// Cleanup local macros
-#undef LIBC_ATTR_LIKELY
-#undef LIBC_ATTR_UNLIKELY
-
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H