Merged

Changes shown from 24 of 26 commits.

Commits
ce56f84 pre-commit test (yafet-a, Aug 21, 2025)
1c27d89 [BOLT] documentation (yafet-a, Aug 21, 2025)
db353b7 [BOLT][AArch64] Implement safe size-aware memcpy inlining (yafet-a, Aug 21, 2025)
2e5b22b test target fix for CI cross-compilation issue (yafet-a, Aug 22, 2025)
385fa23 moved inline-memcpy to avoid CI cross-compilation PIE conflicts (yafet-a, Aug 22, 2025)
4f9ef67 removed old test (yafet-a, Aug 22, 2025)
e83126e response to review (yafet-a, Aug 22, 2025)
cf8279a Update conditional formatting and move check for size into binaryPasses (yafet-a, Aug 27, 2025)
c317eb0 Negative Tests (live-in, register move, non-mov instruction) (yafet-a, Aug 27, 2025)
df97d61 memcpy8 redundant handling removed (yafet-a, Aug 27, 2025)
25cfb58 nit: comment clean up (yafet-a, Aug 27, 2025)
e308855 minor refactor (yafet-a, Aug 28, 2025)
365a0bf NFC: Post-review refactor (yafet-a, Aug 28, 2025)
84c904a NFC: Test for corner case with size 0 (yafet-a, Aug 28, 2025)
0561bcc Use temp instead of argument registers (yafet-a, Aug 28, 2025)
cc49db7 Update early return (yafet-a, Aug 28, 2025)
115606b Update tests to be more specific about registers + negative test on e… (yafet-a, Aug 28, 2025)
1986bfa Complex test + register aliasing (yafet-a, Aug 29, 2025)
bd990ea NFC use if initializer (yafet-a, Sep 1, 2025)
ee5f859 [style] trailing whitespaces removed (yafet-a, Sep 4, 2025)
ad503a7 [test] CHECK-NEXT used (yafet-a, Sep 4, 2025)
267432a [test] updated negative test to check for negative size (yafet-a, Sep 4, 2025)
198744d [nfc] minor refactor (yafet-a, Sep 4, 2025)
62b871e [bug] memcpy call removed for sizes>64 (yafet-a, Sep 4, 2025)
dcab6ac [nfc][test] reordered test (yafet-a, Sep 5, 2025)
875156e [nfc] added assert for default case (future-proofing for changes to B… (yafet-a, Sep 5, 2025)
2 changes: 1 addition & 1 deletion bolt/docs/CommandLineArgumentReference.md
@@ -631,7 +631,7 @@

- `--inline-memcpy`

Inline memcpy using 'rep movsb' instruction (X86-only)
Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)

- `--inline-small-functions`

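As a hedged illustration of what the new option wording covers: a call site like the following (hypothetical example, not from this PR) has the shape the AArch64 path can now inline, because the constant length reaches the call as a move-immediate into the size register.

#include <cstring>

// Hypothetical call site: the constant length typically lowers to
// "MOV W2, #16" immediately before "BL memcpy", which is the pattern
// the size finder added later in this PR recognizes.
void copy16(char *Dst, const char *Src) {
  std::memcpy(Dst, Src, 16); // size 16: inlined as a single Q-register
                             // load/store pair instead of a call
}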
26 changes: 26 additions & 0 deletions bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
#ifndef BOLT_CORE_MCPLUSBUILDER_H
#define BOLT_CORE_MCPLUSBUILDER_H

#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Core/Relocation.h"
#include "llvm/ADT/ArrayRef.h"
@@ -1888,13 +1889,38 @@ class MCPlusBuilder {
return {};
}

/// Find memcpy size in bytes by using preceding instructions.
/// Returns std::nullopt if size cannot be determined (no-op for most
/// targets).
virtual std::optional<uint64_t>
findMemcpySizeInBytes(const BinaryBasicBlock &BB,
BinaryBasicBlock::iterator CallInst) const {
return std::nullopt;
}

/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
/// (dest + n) instead of dest.
virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
llvm_unreachable("not implemented");
return {};
}

/// Creates a size-aware inline memcpy instruction sequence. If \p KnownSize
/// is provided, generates code optimized for that specific size. Falls back
/// to the regular createInlineMemcpy if the size is unknown or not needed
/// (e.g., on X86).
virtual InstructionListType
createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
return createInlineMemcpy(ReturnEnd);
}

/// Extract immediate value from move instruction that sets the given
/// register. Returns the immediate value if the instruction is a
/// move-immediate to TargetReg.
virtual std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
return std::nullopt;
}

/// Create a target-specific relocation out of the \p Fixup.
/// Note that not every fixup could be converted into a relocation.
virtual std::optional<Relocation>
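A minimal sketch (toy classes, not BOLT's real hierarchy, and the bool ReturnEnd parameter is dropped for brevity) of the fallback pattern these declarations set up: the new two-argument overload defaults to the legacy overload, so targets that ignore the size hint, such as X86, need no changes.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <optional>

struct BuilderBase {
  virtual ~BuilderBase() = default;
  virtual void createInlineMemcpy() const { std::puts("generic expansion"); }
  virtual void createInlineMemcpy(std::optional<uint64_t> KnownSize) const {
    (void)KnownSize;      // X86-style targets ignore the size hint...
    createInlineMemcpy(); // ...and fall back to the legacy overload.
  }
};

struct AArch64Like : BuilderBase {
  using BuilderBase::createInlineMemcpy;
  void createInlineMemcpy(std::optional<uint64_t> KnownSize) const override {
    assert(KnownSize && "size-specific expansion requires a known size");
    std::printf("size-specific expansion for %llu bytes\n",
                static_cast<unsigned long long>(*KnownSize));
  }
};

int main() {
  BuilderBase X86;
  AArch64Like A64;
  X86.createInlineMemcpy(std::optional<uint64_t>(32)); // generic expansion
  A64.createInlineMemcpy(std::optional<uint64_t>(32)); // 32-byte expansion
}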
12 changes: 10 additions & 2 deletions bolt/lib/Passes/BinaryPasses.cpp
@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
}

Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
if (!BC.isX86() && !BC.isAArch64())
return Error::success();

uint64_t NumInlined = 0;
@@ -1866,8 +1866,16 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);

// Extract the copy size from preceding instructions (AArch64 only).
// Pattern: MOV X2, #size; BL memcpy (X0 = dest, X1 = src, X2 = size).
std::optional<uint64_t> KnownSize =
BC.MIB->findMemcpySizeInBytes(BB, II);

// Only inline on AArch64 when the size is known and at most 64 bytes;
// otherwise keep the call.
if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
continue;

const InstructionListType NewCode =
BC.MIB->createInlineMemcpy(IsMemcpy8);
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
II = BB.replaceInstruction(II, NewCode);
std::advance(II, NewCode.size() - 1);
if (IsTailCall) {
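A toy model (plain C++ with a made-up Inst struct, not BOLT's MCInst) of the size discovery this pass relies on: scan the block up to the call, remember the last write to the size register, and accept it only if that write is a move-immediate.

#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

struct Inst {
  bool WritesSizeReg;  // does this instruction define X2 (or an alias)?
  bool IsMovImmediate; // is it a MOV-immediate into the size register?
  uint64_t Imm;        // the immediate, when IsMovImmediate is true
};

std::optional<uint64_t> findMemcpySize(const std::vector<Inst> &Block,
                                       size_t CallIdx) {
  const Inst *LastWrite = nullptr;
  for (size_t I = 0; I < CallIdx; ++I)
    if (Block[I].WritesSizeReg)
      LastWrite = &Block[I]; // later writes shadow earlier ones
  if (LastWrite && LastWrite->IsMovImmediate)
    return LastWrite->Imm;
  return std::nullopt; // unknown size: leave the memcpy call alone
}

int main() {
  // MOV X2, #32; MOV X2, #8; BL memcpy -> the later write (8) must win.
  std::vector<Inst> Block = {{true, true, 32}, {true, true, 8}};
  if (std::optional<uint64_t> Size = findMemcpySize(Block, Block.size()))
    std::printf("known memcpy size: %llu\n",
                static_cast<unsigned long long>(*Size));
}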
4 changes: 3 additions & 1 deletion bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),

static cl::opt<bool> StringOps(
"inline-memcpy",
cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
cl::desc(
"inline memcpy using size-specific optimized instructions "
"(X86: 'rep movsb', AArch64: width-optimized register operations)"),
cl::cat(BoltOptCategory));

static cl::opt<bool> StripRepRet(
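For context, a self-contained sketch of the cl::opt registration pattern used here (LLVM's CommandLine library; the variable name and init value are illustrative, not BOLT's):

#include "llvm/Support/CommandLine.h"

// Illustrative registration of a boolean flag, mirroring the StringOps
// option above; parsing happens once at tool startup.
static llvm::cl::opt<bool> InlineMemcpyFlag(
    "inline-memcpy",
    llvm::cl::desc("inline memcpy using size-specific optimized instructions"),
    llvm::cl::init(false));

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  return InlineMemcpyFlag ? 0 : 1; // flag reads as a bool
}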
114 changes: 114 additions & 0 deletions bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2597,6 +2597,120 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
getInstructionSize(const MCInst &Inst) const override {
return 4;
}

std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
// Match MOVZ instructions (both X and W register variants) with no shift.
if ((Inst.getOpcode() == AArch64::MOVZXi ||
Inst.getOpcode() == AArch64::MOVZWi) &&
Inst.getOperand(2).getImm() == 0 &&
getAliases(TargetReg)[Inst.getOperand(0).getReg()])
return Inst.getOperand(1).getImm();
return std::nullopt;
}

std::optional<uint64_t>
findMemcpySizeInBytes(const BinaryBasicBlock &BB,
BinaryBasicBlock::iterator CallInst) const override {
MCPhysReg SizeReg = getIntArgRegister(2);
if (SizeReg == getNoRegister())
return std::nullopt;

BitVector WrittenRegs(RegInfo->getNumRegs());
const BitVector &SizeRegAliases = getAliases(SizeReg);

// Track the last write to the size register before the call; an earlier
// move-immediate may be shadowed by a later write to the same register.
const MCInst *LastSizeRegWrite = nullptr;
for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
const MCInst &Inst = *InstIt;
WrittenRegs.reset();
getWrittenRegs(Inst, WrittenRegs);

if (WrittenRegs.anyCommon(SizeRegAliases))
LastSizeRegWrite = &Inst;
}
if (LastSizeRegWrite)
return extractMoveImmediate(*LastSizeRegWrite, SizeReg);
return std::nullopt;
}

InstructionListType
createInlineMemcpy(bool ReturnEnd,
std::optional<uint64_t> KnownSize) const override {
assert(KnownSize.has_value() &&
"AArch64 memcpy inlining requires known size");
InstructionListType Code;
uint64_t Size = *KnownSize;

generateSizeSpecificMemcpy(Code, Size);

// If _memcpy8, adjust X0 to return dest+size instead of dest.
if (ReturnEnd)
Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X0)
.addReg(AArch64::X0)
.addImm(Size)
.addImm(0));
return Code;
}

InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
uint64_t Size) const {
auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
unsigned Reg, unsigned Offset = 0) {
Code.emplace_back(MCInstBuilder(LoadOpc)
.addReg(Reg)
.addReg(AArch64::X1)
.addImm(Offset));
Code.emplace_back(MCInstBuilder(StoreOpc)
.addReg(Reg)
.addReg(AArch64::X0)
.addImm(Offset));
};

// Generate optimal instruction sequences based on exact size.
switch (Size) {
case 1:
AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
break;
case 2:
AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
break;
case 4:
AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
break;
case 8:
AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
break;
case 16:
AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
break;
case 32:
AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
break;

default:
if (Size <= 64) {
// For sizes up to 64 bytes, greedily use the largest possible loads.
uint64_t Remaining = Size;
uint64_t Offset = 0;

const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
LoadStoreOps = {
{{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
{8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
{4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
{2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
{1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};

for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
while (Remaining >= OpSize) {
AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
Remaining -= OpSize;
Offset += OpSize;
}
}
break;
}
return Code;
}
};

} // end anonymous namespace
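To make the default-case policy concrete, a standalone sketch (plain C++, not BOLT code) of the greedy largest-width-first decomposition, including the scaled immediates the unsigned-offset LDR/STR "ui" encodings require:

#include <array>
#include <cstdint>
#include <cstdio>
#include <utility>

int main() {
  // (width in bytes, temp register the pass above would use)
  const std::array<std::pair<uint64_t, const char *>, 5> Widths = {
      {{16, "Q16"}, {8, "X9"}, {4, "W9"}, {2, "W9"}, {1, "W9"}}};

  uint64_t Size = 37; // try any size in (0, 64]
  uint64_t Remaining = Size, Offset = 0;
  for (const auto &[W, Reg] : Widths)
    while (Remaining >= W) {
      // Scaled immediate: the encoded offset is Offset / W, as in the pass.
      std::printf("copy %2llu bytes at offset %2llu via %-3s (imm=%llu)\n",
                  (unsigned long long)W, (unsigned long long)Offset, Reg,
                  (unsigned long long)(Offset / W));
      Remaining -= W;
      Offset += W;
    }
  // For Size == 37 this prints 16@0, 16@16, 4@32, 1@36.
  return 0;
}

Because each width divides the next larger one and copies are emitted largest-first, Offset stays a multiple of the current width, so the scaled immediate Offset / W is always exact.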