- Notifications
You must be signed in to change notification settings - Fork 15.3k
Closed
Labels
Description
I tried the following code, which takes more than 3s to compile using clang 17.0.6. (The code size is also a bit hard to accept.)
oom_manual2.c
int src(); int f1(unsigned int *b) { #define SWITCH(in1, out1) \ unsigned int out1;\ switch (in1) { \ case 0: out1 = b[0] >> 0; break; \ case 1: out1 = b[0] >> 1; break; \ case 2: out1 = b[0] >> 2; break; \ case 3: out1 = b[0] >> 3; break; \ case 4: out1 = b[0] >> 4; break; \ case 5: out1 = b[0] >> 5; break; \ case 6: out1 = b[0] >> 6; break; \ case 7: out1 = b[0] >> 7; break; \ case 8: out1 = b[0] >> 8; break; \ case 9: out1 = b[0] >> 9; break; \ case 10: out1 = b[0] >> 10; break; \ case 11: out1 = b[0] >> 11; break; \ case 12: out1 = b[0] >> 12; break; \ case 13: out1 = b[0] >> 13; break; \ case 14: out1 = b[0] >> 14; break; \ case 15: out1 = b[0] >> 15; break; \ case 16: out1 = b[0] >> 16; break; \ case 17: out1 = b[0] >> 17; break; \ case 18: out1 = b[0] >> 18; break; \ case 19: out1 = b[0] >> 19; break; \ case 20: out1 = b[0] >> 20; break; \ case 21: out1 = b[0] >> 21; break; \ case 22: out1 = b[0] >> 22; break; \ case 23: out1 = b[0] >> 23; break; \ case 24: out1 = b[0] >> 24; break; \ case 25: out1 = b[0] >> 25; break; \ case 26: out1 = b[0] >> 26; break; \ case 27: out1 = b[0] >> 27; break; \ case 28: out1 = b[0] >> 28; break; \ case 29: out1 = b[0] >> 29; break; \ case 30: out1 = b[0] >> 30; break; \ case 31: out1 = b[0] >> 31; break; \ case 32: out1 = b[1] >> 0; break; \ case 33: out1 = b[1] >> 1; break; \ case 34: out1 = b[1] >> 2; break; \ case 35: out1 = b[1] >> 3; break; \ case 36: out1 = b[1] >> 4; break; \ case 37: out1 = b[1] >> 5; break; \ case 38: out1 = b[1] >> 6; break; \ case 39: out1 = b[1] >> 7; break; \ case 40: out1 = b[1] >> 8; break; \ case 41: out1 = b[1] >> 9; break; \ case 42: out1 = b[1] >> 10; break; \ case 43: out1 = b[1] >> 11; break; \ case 44: out1 = b[1] >> 12; break; \ case 45: out1 = b[1] >> 13; break; \ case 46: out1 = b[1] >> 14; break; \ case 47: out1 = b[1] >> 15; break; \ case 48: out1 = b[1] >> 16; break; \ case 49: out1 = b[1] >> 17; break; \ case 50: out1 = b[1] >> 18; break; \ case 51: out1 = b[1] >> 19; break; \ case 52: out1 = b[1] >> 20; break; \ case 53: out1 = b[1] >> 21; break; \ case 54: out1 = b[1] >> 22; break; \ case 55: out1 = b[1] >> 23; break; \ case 56: out1 = b[1] >> 24; break; \ case 57: out1 = b[1] >> 25; break; \ case 58: out1 = b[1] >> 26; break; \ case 59: out1 = b[1] >> 27; break; \ case 60: out1 = b[1] >> 28; break; \ case 61: out1 = b[1] >> 29; break; \ case 62: out1 = b[1] >> 30; break; \ case 63: out1 = b[1] >> 31; break; \ case 64: out1 = b[2] >> 0; break; \ case 65: out1 = b[2] >> 1; break; \ case 66: out1 = b[2] >> 2; break; \ case 67: out1 = b[2] >> 3; break; \ case 68: out1 = b[2] >> 4; break; \ case 69: out1 = b[2] >> 5; break; \ case 70: out1 = b[2] >> 6; break; \ case 71: out1 = b[2] >> 7; break; \ case 72: out1 = b[2] >> 8; break; \ case 73: out1 = b[2] >> 9; break; \ case 74: out1 = b[2] >> 10; break; \ case 75: out1 = b[2] >> 11; break; \ case 76: out1 = b[2] >> 12; break; \ case 77: out1 = b[2] >> 13; break; \ case 78: out1 = b[2] >> 14; break; \ case 79: out1 = b[2] >> 15; break; \ case 80: out1 = b[2] >> 16; break; \ case 81: out1 = b[2] >> 17; break; \ case 82: out1 = b[2] >> 18; break; \ case 83: out1 = b[2] >> 19; break; \ case 84: out1 = b[2] >> 20; break; \ case 85: out1 = b[2] >> 21; break; \ case 86: out1 = b[2] >> 22; break; \ case 87: out1 = b[2] >> 23; break; \ case 88: out1 = b[2] >> 24; break; \ case 89: out1 = b[2] >> 25; break; \ case 90: out1 = b[2] >> 26; break; \ case 91: out1 = b[2] >> 27; break; \ case 92: out1 = b[2] >> 28; break; \ case 93: out1 = b[2] >> 29; break; \ case 94: out1 = b[2] >> 30; break; \ case 95: out1 = b[2] >> 31; break; \ case 96: out1 = b[2] >> 0; break; \ case 97: out1 = b[2] >> 1; break; \ case 98: out1 = b[2] >> 2; break; \ case 99: out1 = b[2] >> 3; break; \ case 100: out1 = b[2] >> 4; break; \ case 101: out1 = b[2] >> 5; break; \ case 102: out1 = b[2] >> 6; break; \ case 103: out1 = b[2] >> 7; break; \ case 104: out1 = b[2] >> 8; break; \ case 105: out1 = b[2] >> 9; break; \ case 106: out1 = b[2] >> 10; break; \ case 107: out1 = b[2] >> 11; break; \ case 108: out1 = b[2] >> 12; break; \ case 109: out1 = b[2] >> 13; break; \ case 110: out1 = b[2] >> 14; break; \ case 111: out1 = b[2] >> 15; break; \ case 112: out1 = b[2] >> 16; break; \ case 113: out1 = b[2] >> 17; break; \ case 114: out1 = b[2] >> 18; break; \ case 115: out1 = b[2] >> 19; break; \ case 116: out1 = b[2] >> 20; break; \ case 117: out1 = b[2] >> 21; break; \ case 118: out1 = b[2] >> 22; break; \ case 119: out1 = b[2] >> 23; break; \ case 120: out1 = b[2] >> 24; break; \ case 121: out1 = b[2] >> 25; break; \ case 122: out1 = b[2] >> 26; break; \ case 123: out1 = b[2] >> 27; break; \ case 124: out1 = b[2] >> 28; break; \ case 125: out1 = b[2] >> 29; break; \ case 126: out1 = b[2] >> 30; break; \ case 127: out1 = b[2] >> 31; break; \ default: __builtin_unreachable(); break; \ } unsigned int r = src(); SWITCH(r >> 1 & 0x7fU, v1); SWITCH(r >> 27 | (r << 5 & 0x60U), v2); SWITCH(r >> 2 & 0x7fu, v3); SWITCH(r >> 9 & 0x7fu, v4); SWITCH(r >> 16 & 0x7fu, v5); SWITCH(r >> 23 & 0x7fu, v6); SWITCH(r >> 30 | (r << 2 & 0x7fu), v7); SWITCH(r >> 7 & 0x7fu, v8); SWITCH(r >> 12 & 0x7fu, v9); SWITCH(r << 4 & 0x7fu, v10); SWITCH(r >> 10 & 0x7fu, v11); SWITCH(r >> 24 & 0x7fu, v12); SWITCH(r >> 31 | (r << 1 & 0x7eu), v13); SWITCH(r >> 6 & 0x7fu, v14); SWITCH(r >> 13 & 0x7fu, v15); SWITCH(r >> 20 & 0x7fu, v16); SWITCH(r >> 27 | (r << 5 & 0x60u), v17); SWITCH(r >> 3 & 0x7fu, v18); SWITCH(r >> 9 & 0x7fu, v19); SWITCH(r >> 16 & 0x7fu, v20); SWITCH(r >> 23 & 0x7fu, v21); SWITCH(r >> 30 | (r << 2 & 0x7cu), v22); SWITCH(r >> 7 & 0x7fu, v23); SWITCH(r >> 12 & 0x7fu, v24); SWITCH(r >> 14 & 0x7fu, v25); SWITCH(r >> 15 & 0x7fu, v26); return (v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | v11 | v12 | v13 | v14 | v15 | v16 | v17 | v18 | v19 | v20 | v21 | v22 | v23 | v24 | v25 | v26); }If I change the 127 branch to the default branch, the time returns to normal.
diff --git a/oom_manual2.c b/oom_manual2.c index aab82cf..7da3f79 100644 --- a/oom_manual2.c +++ b/oom_manual2.c @@ -130,8 +130,7 @@ int f1(unsigned int *b) { case 124: out1 = b[2] >> 28; break; \ case 125: out1 = b[2] >> 29; break; \ case 126: out1 = b[2] >> 30; break; \ - case 127: out1 = b[2] >> 31; break; \ - default: __builtin_unreachable(); break; \ + default: out1 = b[2] >> 31; break; \ } unsigned int r = src(); SWITCH(r >> 1 & 0x7fU, v1);The complete command I used is as follows:
$ time clang -O1 oom_manual.c main.c -o oom_manual clang -O1 oom_manual.c main.c -o oom_manual 0.31s user 0.04s system 100% cpu 0.347 total $ time clang -O1 oom_manual2.c main.c -o oom_manual2 clang -O1 oom_manual2.c main.c -o oom_manual2 3.72s user 0.24s system 100% cpu 3.962 total $ ls -l total 272 -rwxr-xr-x 1 dianqk users 56K Jan 21 16:22 oom_manual -rwxr-xr-x 1 dianqk users 192K Jan 21 16:22 oom_manual2main.c
int src(void) { return 0; } int main(int argc, char **argv) { extern int f1(unsigned int *b, unsigned int r); f1(0, 0); return 0; }I've found that the main time-consumption is Early Tail Duplication.
$ time clang -O1 -c oom_manual2.c -emit-llvm -o oom_manual2.bc clang -O1 -c oom_manual2.c -emit-llvm -o oom_manual2.bc 0.07s user 0.01s system 99% cpu 0.086 total $ time llc -O1 oom_manual2.bc -filetype=null llc -O1 oom_manual2.bc -filetype=null 3.65s user 0.22s system 99% cpu 3.868 total $ llc -O1 oom_manual2.bc -filetype=null -time-trace $ cat out.time-trace | jq -r '.traceEvents[] | select(.name == "RunPass" or .name == "Total RunPass") | "\(.dur) \(.name) \(.args.detail)"' | sed 's/"//g' | sort -nr | head -n5 3800384 Total RunPass null 1221923 RunPass Early Tail Duplication 1100348 RunPass Eliminate PHI nodes for register allocation 473867 RunPass Register Coalescer 120407 RunPass Live Variable AnalysisBy adding the new TimeTraceScope, I can see that TailDuplicator::tailDuplicateAndUpdate and TailDuplicator::shouldTailDuplicate take up most of the time.
TimeTraceScope
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 5ed67bd0a121..80401e75c85d 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -158,6 +158,7 @@ bool TailDuplicator::tailDuplicateAndUpdate( SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds, function_ref<void(MachineBasicBlock *)> *RemovalCallback, SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) { + llvm::TimeTraceScope TimeScope("TailDuplicator::tailDuplicateAndUpdate"); // Save the successors list. SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(), MBB->succ_end()); @@ -555,6 +556,7 @@ void TailDuplicator::updateSuccessorsPHIs( /// Determine if it is profitable to duplicate this block. bool TailDuplicator::shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB) { + llvm::TimeTraceScope TimeScope("TailDuplicator::shouldTailDuplicate"); // When doing tail-duplication during layout, the block ordering is in flux, // so canFallThrough returns a result based on incorrect information and // should just be ignored.The original question is as follows (I've abbreviated it a bit)
int src(); int f1(unsigned int *b) { #define SWITCH(in1, out1) \ unsigned int out1;\ switch (in1) { \ case 0: out1 = b[0] >> 0; break; \ ... case 126: out1 = b[2] >> 30; break; \ default: out1 = b[2] >> 31; break; \ } unsigned int r = src(); SWITCH(r >> 1 & 0x7fU, v1); ... SWITCH(r >> 15 & 0x7fu, v26); return (v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | v11 | v12 | v13 | v14 | v15 | v16 | v17 | v18 | v19 | v20 | v21 | v22 | v23 | v24 | v25 | v26); } OLD=`/usr/bin/time -f '%M' clang_before -cc1 -O1 -emit-obj oom_manual.cc -o /dev/null 2>&1` NEW=`/usr/bin/time -f '%M' clang_after -cc1 -O1 -emit-obj oom_manual.cc -o /dev/null 2>&1` echo $OLD echo $NEW expr $OLD "*" 10 "<" $NEW > /dev/null $ ./test_manual.sh 119816 1248148 $ echo $? 0 Originally posted by @dwblaikie in #76669 (comment)