[AArch64][SME] Avoid clobbering X0 in the MachineSMEABIPass #170131
This tweaks `findStateChangeInsertionPoint` to also avoid clobbering X0, which should be possible in most cases (since X0's live ranges are likely to be very short before register allocation).

This improves codegen in a few cases, as not all redundant copies to/from X0 are eliminated.

Change-Id: I38d8f3b40f6b1c2143ec9efb95637d244ad264e3
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This tweaks `findStateChangeInsertionPoint` to also avoid clobbering X0, which should be possible in most cases (since X0's live ranges are likely to be very short before register allocation).

This improves codegen in a few cases, as not all redundant copies to/from X0 are eliminated.

Full diff: https://github.com/llvm/llvm-project/pull/170131.diff

6 Files Affected:
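For intuition, here is a minimal, self-contained sketch of the new best-candidate search, written against a toy instruction model rather than real MachineIR; the `Inst` struct, the mask values, and the `findInsertionPoint` helper are illustrative stand-ins, not the pass's actual types:

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Register mask mirroring the pass's LiveRegs idea: only NZCV and X0 are
// tracked. (Hypothetical simplified values, not LLVM's actual enum.)
enum LiveRegs : uint8_t { None = 0, NZCV = 1 << 0, X0 = 1 << 1 };

struct Inst {
  const char *Name;
  uint8_t LiveBefore; // Tracked registers live just before this instruction.
  bool IsCall;        // State changes must not be hoisted above calls.
};

// Walk backward from InsertPt towards the previous state change. Any point
// where NZCV is free is acceptable; keep scanning for one where X0 is free
// too, since inserting there avoids save/restore copies of X0.
static std::pair<size_t, uint8_t>
findInsertionPoint(const std::vector<Inst> &Block, size_t InsertPt,
                   size_t PrevStateChange) {
  auto Best = std::make_pair(InsertPt, Block[InsertPt].LiveBefore);
  for (size_t I = InsertPt; I > PrevStateChange; --I) {
    if (Block[I].IsCall)
      break; // Don't move before/into a call.
    uint8_t Live = Block[I].LiveBefore;
    if (!(Live & NZCV))
      Best = {I, Live}; // NZCV free: a valid (possibly improvable) candidate.
    if (Live == None)
      break; // NZCV and X0 both free: nothing better exists, stop early.
  }
  return Best;
}

int main() {
  // Toy block: slot 0 is the previous state change; slot 3 is the default
  // insertion point. NZCV is live at 3, X0 is live at 2, both free at 1.
  std::vector<Inst> Block = {{"prev-state-change", None, false},
                             {"add", None, false},
                             {"uses-x0", X0, false},
                             {"uses-x0-and-flags", X0 | NZCV, false}};
  auto [Pt, Live] = findInsertionPoint(Block, 3, 0);
  std::printf("insert before instruction #%zu (live mask %u)\n", Pt,
              static_cast<unsigned>(Live));
  return 0;
}
```

The early `break` once a point with both registers free is found mirrors the patch's `CurrentPhysLiveRegs == LiveRegs::None` check: no earlier point in the scan can be a better candidate, so the search can stop.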
```diff
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index b96f6f12a58d6..e1c84ab76e1ba 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -632,8 +632,8 @@ MachineSMEABI::findStateChangeInsertionPoint(
     PhysLiveRegs = Block.PhysLiveRegsAtExit;
   }
 
-  if (!(PhysLiveRegs & LiveRegs::NZCV))
-    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live flags).
+  if (PhysLiveRegs == LiveRegs::None)
+    return {InsertPt, PhysLiveRegs}; // Nothing to do (no live regs).
 
   // Find the previous state change. We can not move before this point.
   MachineBasicBlock::iterator PrevStateChangeI;
@@ -650,15 +650,21 @@ MachineSMEABI::findStateChangeInsertionPoint(
   // Note: LiveUnits will only accurately track X0 and NZCV.
   LiveRegUnits LiveUnits(*TRI);
   setPhysLiveRegs(LiveUnits, PhysLiveRegs);
+  auto BestCandidate = std::make_pair(InsertPt, PhysLiveRegs);
   for (MachineBasicBlock::iterator I = InsertPt; I != PrevStateChangeI; --I) {
     // Don't move before/into a call (which may have a state change before it).
     if (I->getOpcode() == TII->getCallFrameDestroyOpcode() || I->isCall())
       break;
     LiveUnits.stepBackward(*I);
-    if (LiveUnits.available(AArch64::NZCV))
-      return {I, getPhysLiveRegs(LiveUnits)};
+    LiveRegs CurrentPhysLiveRegs = getPhysLiveRegs(LiveUnits);
+    // Find places where NZCV is available, but keep looking for locations
+    // where both NZCV and X0 are available, which can avoid some copies.
+    if (!(CurrentPhysLiveRegs & LiveRegs::NZCV))
+      BestCandidate = {I, getPhysLiveRegs(LiveUnits)};
+    if (CurrentPhysLiveRegs == LiveRegs::None)
+      break;
   }
-  return {InsertPt, PhysLiveRegs};
+  return BestCandidate;
 }
 
 void MachineSMEABI::insertStateChanges(EmitContext &Context,
diff --git a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
index 3f174a62128a8..ed768dec77998 100644
--- a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
+++ b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir
@@ -79,14 +79,12 @@ body: |
     ; CHECK-NEXT: RequiresZASavePseudo
     ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
     ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-    ; CHECK-NEXT: $x0 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
    ; CHECK-NEXT: MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
    ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
    ; CHECK-NEXT: $x0 = ADDXri %stack.0, 0, 0
    ; CHECK-NEXT: RestoreZAPseudo [[MRS]], $x0, &__arm_tpidr2_restore, csr_aarch64_sme_abi_support_routines_preservemost_from_x0
    ; CHECK-NEXT: MSR 56965, $xzr
-    ; CHECK-NEXT: $x0 = COPY [[COPY2]]
+    ; CHECK-NEXT: $x0 = IMPLICIT_DEF
    ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
    ; CHECK-NEXT: FAKE_USE $x0
    ; CHECK-NEXT: $zab0 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 30dbd1cb34667..0906e10b551b7 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -67,10 +67,10 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -170,11 +170,11 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x20
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -267,14 +267,14 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
 ; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_4
 ; CHECK-NEWLOWERING-NEXT:  // %bb.3:
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
 ; CHECK-NEWLOWERING-NEXT:  .LBB5_4:
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -336,10 +336,10 @@ define i64 @test_many_callee_arguments(
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x8
 ; CHECK-NEWLOWERING-NEXT:    bl many_args_private_za_callee
 ; CHECK-NEWLOWERING-NEXT:    add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
index 0c886c643c5fb..87a63fed0546c 100644
--- a/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
+++ b/llvm/test/CodeGen/AArch64/sme-dynamic-tls.ll
@@ -87,8 +87,7 @@ define i32 @load_tls_shared_za() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    .tlsdesccall x
 ; CHECK-NEXT:    blr x1
 ; CHECK-NEXT:    mrs x8, TPIDR_EL0
-; CHECK-NEXT:    ldr w0, [x8, x0]
-; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    ldr w8, [x8, x0]
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x9, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
@@ -133,8 +132,7 @@ define i32 @load_tls_streaming_shared_za() nounwind "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    blr x1
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    mrs x8, TPIDR_EL0
-; CHECK-NEXT:    ldr w0, [x8, x0]
-; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    ldr w8, [x8, x0]
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x9, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #80
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 50dd0c699284c..e672f777703a6 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -621,15 +621,15 @@ define i64 @test_many_callee_arguments(
 ; CHECK-NEWLOWERING-NEXT:    stp x10, x11, [sp, #-16]!
 ; CHECK-NEWLOWERING-NEXT:    bl many_args_private_za_callee
 ; CHECK-NEWLOWERING-NEXT:    add sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x1, x0
 ; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x9, .LBB9_2
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB9_2
 ; CHECK-NEWLOWERING-NEXT:  // %bb.1:
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEWLOWERING-NEXT:  .LBB9_2:
-; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov x0, x1
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 3aaae5e73ff23..37adfb89e4762 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -33,7 +33,7 @@ define i32 @csr_d8_allocnxv4i32i32f64(double %d) "aarch64_pstate_sm_compatible"
 ; CHECK-COMMON-NEXT:    ldr x29, [sp, #8] // 8-byte Reload
 ; CHECK-COMMON-NEXT:    ldr d8, [sp], #16 // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
-; CHECK-COMMON-NE
+; CHECK-NE
 entry:
   %a = alloca <vscale x 4 x i32>
   %b = alloca i32
@@ -626,23 +626,21 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
 ; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x9
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #80
 ; CHECK-NEWLOWERING-NEXT:    mov w20, w0
+; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #80
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-80]
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEWLOWERING-NEXT:    smstop sm
 ; CHECK-NEWLOWERING-NEXT:    bl other
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
-; CHECK-NEWLOWERING-NEXT:    mov w0, w20
-; CHECK-NEWLOWERING-NEXT:    mov w8, w0
 ; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x9, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #80
-; CHECK-NEWLOWERING-NEXT:    cbnz x9, .LBB8_2
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB8_2
 ; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %entry
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEWLOWERING-NEXT:  .LBB8_2: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov w0, w8
+; CHECK-NEWLOWERING-NEXT:    mov w0, w20
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
 ; CHECK-NEWLOWERING-NEXT:    .cfi_def_cfa wsp, 112
@@ -671,4 +669,4 @@ entry:
   tail call void @other()
   ret i32 %x
 }
-declare void @other()
\ No newline at end of file
+declare void @other()
```
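The payoff is easiest to see in the `vastate` test from sve-stack-frame-layout.ll above, where the return value no longer has to bounce through a scratch register around the ZA restore. A simplified excerpt of the checked assembly (elided lines marked with `...`):

```asm
// Before: w20 staged through w8 across the restore sequence.
mov w0, w20
mov w8, w0
// ... smstart za / conditional __arm_tpidr2_restore ...
mov w0, w8

// After: the state change lands where w20 can be used directly.
// ... smstart za / conditional __arm_tpidr2_restore ...
mov w0, w20
```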