gcc -O3 removing live code in int-to-string converter

Question

I was 64-bit-izing Terje Mathisen's itoa function, which takes a char* which must point to a buffer of at least 20 characters and a number, and I created this:

#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \ buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1 #define uint128_t __uint128_t #define uint64_t unsigned long void u2s(char* buf, unsigned long num) { // Split number into low/high pair. uint128_t split = num * 7922816251426433760; split += num >> 1; uint64_t high = split >> 96; uint64_t low = num - (high * 10000000000); // Transform numbers into usable decimal fractions. split = high * 18446744074; buf[0] = (split >> 64) + '0'; high = (uint64_t)split; split = low * 18446744074; buf[10] = (split >> 64) + '0'; low = (uint64_t)split; // Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later) high = (high + 7) >> 3; low = (low + 7) >> 3; // Store special and number uint64_t andop = 0x0fffffffffffffff; LOOP_WORK(1, 60); LOOP_WORK(2, 59); LOOP_WORK(3, 58); LOOP_WORK(4, 57); LOOP_WORK(5, 56); LOOP_WORK(6, 55); LOOP_WORK(7, 54); LOOP_WORK(8, 53); // Final loop, without extra stuffs high *= 5; low *= 5; buf[9] = (high >> 52) + '0'; buf[19] = (low >> 52) + '0'; } #undef LOOP_WORK

Here's an equivalent version in assembly (handwritten in AT&T):

u2s: // tmp128(rax:rdx) = num * 7922816251426433760 movq $7922816251426433760, %rax mulq %rsi // tmp128(rax:rdx) += num >> 1 movq %rsi, %rcx shrq $0x1, %rcx addq %rcx, %rax adcq $0x0, %rdx // high(rdx) = tmp128(rax:rdx) >> 96 shrq $32, %rdx // high(rcx); low(rsi) = num - (high * 10^10) movq %rdx, %rcx movq $10000000000, %rax mulq %rdx subq %rax, %rsi // high2(rax:rdx) = high * 18446744074 movq $18446744074, %rax mulq %rcx // buf[0] = (high2 >> 64) + '0' addb $'0', %dl movb %dl, (%rdi) // low2(rax:rdx) = low * 18446744074 movq %rax, %rcx movq $18446744074, %rax mulq %rsi // buf[10] = (low2 >> 64) + '0' addb $'0', %dl movb %dl, 10(%rdi) // high(rcx) = (u64)high2 // low(rax) = (u64)low2 // high = (high + 7) >> 3 addb $0x7, %cl shrq $0x3, %rcx // low = (low + 7) >> 3 addb $0x7, %al shrq $0x3, %rax // low (rax) *= 5 leaq (%rax,%rax,4), %rax // high(rcx) *= 5 leaq (%rcx,%rcx,4), %rcx // buf[1] = (high >> 60) + '0' movq %rcx, %rdx shrq $60, %rdx addb $'0', %dl movb %dl, 1(%rdi) // buf[11] = (low >> 60) + '0' movq %rax, %rdx shrq $60, %rdx addb $'0', %dl movb %dl, 11(%rdi) // Store number 0x0fffffffffffffff movq $0x0fffffffffffffff, %rsi // high &= 0x0fffffffffffffff andq %rsi, %rcx // low &= 0x0fffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[2] = (high >> 59) + '0' movq %rcx, %rdx shrq $59, %rdx addb $'0', %dl movb %dl, 2(%rdi) // buf[12] = (low >> 59) + '0' movq %rax, %rdx shrq $59, %rdx addb $'0', %dl movb %dl, 12(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x07ffffffffffffff andq %rsi, %rcx // low &= 0x07ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[3] = (high >> 58) + '0' movq %rcx, %rdx shrq $58, %rdx addb $'0', %dl movb %dl, 3(%rdi) // buf[13] = (low >> 58) + '0' movq %rax, %rdx shrq $58, %rdx addb $'0', %dl movb %dl, 13(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x03ffffffffffffff andq %rsi, %rcx // low &= 0x03ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[4] = (high >> 57) + '0' movq %rcx, %rdx shrq $57, %rdx addb $'0', %dl movb %dl, 4(%rdi) // buf[14] = (low >> 57) + '0' movq %rax, %rdx shrq $57, %rdx addb $'0', %dl movb %dl, 14(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x01ffffffffffffff andq %rsi, %rcx // low &= 0x01ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[5] = (high >> 56) + '0' movq %rcx, %rdx shrq $56, %rdx addb $'0', %dl movb %dl, 5(%rdi) // buf[15] = (low >> 56) + '0' movq %rax, %rdx shrq $56, %rdx addb $'0', %dl movb %dl, 15(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x00ffffffffffffff andq %rsi, %rcx // low &= 0x00ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[6] = (high >> 55) + '0' movq %rcx, %rdx shrq $55, %rdx addb $'0', %dl movb %dl, 6(%rdi) // buf[16] = (low >> 55) + '0' movq %rax, %rdx shrq $55, %rdx addb $'0', %dl movb %dl, 16(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x007fffffffffffff andq %rsi, %rcx // low &= 0x007fffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[7] = (high >> 54) + '0' movq %rcx, %rdx shrq $54, %rdx addb $'0', %dl movb %dl, 7(%rdi) // buf[17] = (low >> 54) + '0' movq %rax, %rdx shrq $54, %rdx addb $'0', %dl movb %dl, 17(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x003fffffffffffff andq %rsi, %rcx // low &= 0x003fffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[8] = (high >> 53) + '0' movq %rcx, %rdx shrq $53, %rdx addb $'0', %dl movb %dl, 8(%rdi) // buf[18] = (low >> 53) + '0e' movq %rax, %rdx shrq $53, %rdx addb $'0', %dl movb %dl, 18(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x001fffffffffffff andq %rsi, %rcx // low &= 0x001fffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[9] = (high >> 52) + '0' shrq $52, %rcx addb $'0', %cl movb %cl, 9(%rdi) // buf[19] = (high >> 52) + '0' shrq $52, %rax addb $'0', %al movb %al, 19(%rdi) retq

When I compile the C version in GCC with optimizations -O3, I find that code for one of the high/low variables is optimized out, it has hardcoded values for andop everywhere, and the code at the start to load in buf[0] and buf[10] inputs hard-coded values (48 and 3472328296227680304, respectively). When I run GCC with -fverbose-asm -S, I find that GCC optimized away high completely! I'm guessing that my C code's the problem (I'm not too great at C), but I don't know why. Terje Mathisen's post has it's own version in C, but it does not include the handwritten assembly optimizations also given there. Why is GCC messing me up so much?

B.T.W, here's the code from gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0 (USE flags: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv) with flags -O3 -c (an objdump with flags -d):

u2s.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <u2s>: 0: 48 b8 0a fa 82 4b 04 movabs $0x44b82fa0a,%rax 7: 00 00 00 a: 48 b9 30 30 30 30 30 movabs $0x3030303030303030,%rcx 11: 30 30 30 14: c6 47 0a 30 movb $0x30,0xa(%rdi) // What? 18: 48 0f af f0 imul %rax,%rsi 1c: 48 89 0f mov %rcx,(%rdi) // What? 1f: 48 83 c6 07 add $0x7,%rsi 23: 48 c1 ee 03 shr $0x3,%rsi 27: 48 8d 04 b6 lea (%rsi,%rsi,4),%rax 2b: 48 89 c2 mov %rax,%rdx 2e: 48 c1 ea 3c shr $0x3c,%rdx 32: 83 c2 30 add $0x30,%edx 35: 88 57 0b mov %dl,0xb(%rdi) 38: 48 ba ff ff ff ff ff movabs $0xfffffffffffffff,%rdx 3f: ff ff 0f 42: 48 21 d0 and %rdx,%rax // There should be another AND 45: 48 8d 04 80 lea (%rax,%rax,4),%rax 49: 48 89 c2 mov %rax,%rdx 4c: 48 c1 ea 3b shr $0x3b,%rdx 50: 83 c2 30 add $0x30,%edx 53: 88 57 0c mov %dl,0xc(%rdi) 56: 48 ba ff ff ff ff ff movabs $0x7ffffffffffffff,%rdx 5d: ff ff 07 60: 48 21 d0 and %rdx,%rax 63: 48 8d 04 80 lea (%rax,%rax,4),%rax 67: 48 89 c2 mov %rax,%rdx 6a: 48 c1 ea 3a shr $0x3a,%rdx 6e: 83 c2 30 add $0x30,%edx 71: 88 57 0d mov %dl,0xd(%rdi) 74: 48 ba ff ff ff ff ff movabs $0x3ffffffffffffff,%rdx 7b: ff ff 03 7e: 48 21 d0 and %rdx,%rax 81: 48 8d 04 80 lea (%rax,%rax,4),%rax 85: 48 89 c2 mov %rax,%rdx 88: 48 c1 ea 39 shr $0x39,%rdx 8c: 83 c2 30 add $0x30,%edx 8f: 88 57 0e mov %dl,0xe(%rdi) 92: 48 ba ff ff ff ff ff movabs $0x1ffffffffffffff,%rdx 99: ff ff 01 9c: 48 21 d0 and %rdx,%rax 9f: 48 8d 04 80 lea (%rax,%rax,4),%rax a3: 48 89 c2 mov %rax,%rdx a6: 48 c1 ea 38 shr $0x38,%rdx aa: 83 c2 30 add $0x30,%edx ad: 88 57 0f mov %dl,0xf(%rdi) b0: 48 ba ff ff ff ff ff movabs $0xffffffffffffff,%rdx b7: ff ff 00 ba: 48 21 d0 and %rdx,%rax bd: 48 8d 04 80 lea (%rax,%rax,4),%rax c1: 48 89 c2 mov %rax,%rdx c4: 48 c1 ea 37 shr $0x37,%rdx c8: 83 c2 30 add $0x30,%edx cb: 88 57 10 mov %dl,0x10(%rdi) ce: 48 ba ff ff ff ff ff movabs $0x7fffffffffffff,%rdx d5: ff 7f 00 d8: 48 21 d0 and %rdx,%rax db: 48 8d 04 80 lea (%rax,%rax,4),%rax df: 48 89 c2 mov %rax,%rdx e2: 48 c1 ea 36 shr $0x36,%rdx e6: 83 c2 30 add $0x30,%edx e9: 88 57 11 mov %dl,0x11(%rdi) ec: 48 ba ff ff ff ff ff movabs $0x3fffffffffffff,%rdx f3: ff 3f 00 f6: 48 21 d0 and %rdx,%rax f9: 48 8d 04 80 lea (%rax,%rax,4),%rax fd: 48 89 c2 mov %rax,%rdx 100: 48 c1 ea 35 shr $0x35,%rdx 104: 83 c2 30 add $0x30,%edx 107: 88 57 12 mov %dl,0x12(%rdi) 10a: 48 ba ff ff ff ff ff movabs $0x1fffffffffffff,%rdx 111: ff 1f 00 114: 48 21 d0 and %rdx,%rax 117: ba 30 30 00 00 mov $0x3030,%edx 11c: 48 8d 04 80 lea (%rax,%rax,4),%rax 120: 66 89 57 08 mov %dx,0x8(%rdi) 124: 48 c1 e8 34 shr $0x34,%rax 128: 83 c0 30 add $0x30,%eax 12b: 88 47 13 mov %al,0x13(%rdi) 12e: c3 retq

P.S: I've tested my handwritten assembly well, and so most differences between that and the objdump are mostly either code reordering or errors.

P.P.S: @Peter_Corde's answer has prevented the optimization-away of high, but the starting code is still messed up! Here's an excerpt:

 0: 48 b9 e0 ea f6 5e 67 movabs $0x6df37f675ef6eae0,%rcx 7: 7f f3 6d a: 49 b8 00 e4 0b 54 02 movabs $0x2540be400,%r8 11: 00 00 00 14: c6 07 30 movb $0x30,(%rdi) // NOT GOOD 17: 48 89 c8 mov %rcx,%rax 1a: 48 89 f1 mov %rsi,%rcx 1d: c6 47 0a 30 movb $0x30,0xa(%rdi) // NOT GOOD 21: 48 f7 e6 mul %rsi 24: 48 d1 e9 shr %rcx 27: 49 89 c1 mov %rax,%r9 2a: 48 89 c8 mov %rcx,%rax 2d: 49 89 d2 mov %rdx,%r10 30: 31 d2 xor %edx,%edx 32: 4c 01 c8 add %r9,%rax 35: 48 b9 0a fa 82 4b 04 movabs $0x44b82fa0a,%rcx 3c: 00 00 00 3f: 4c 11 d2 adc %r10,%rdx 42: 48 c1 ea 20 shr $0x20,%rdx 46: 48 89 d0 mov %rdx,%rax 49: 49 0f af d0 imul %r8,%rdx 4d: 48 0f af c1 imul %rcx,%rax 51: 48 29 d6 sub %rdx,%rsi 54: 48 0f af f1 imul %rcx,%rsi 58: 48 83 c0 07 add $0x7,%rax 5c: 48 c1 e8 03 shr $0x3,%rax

Is split = high * 18446744074; supposed to be split = high * (unsigned __int128)18446744074;? Looks like exactly the same bug. Check the rest of your C code for any more cases where you assign the result of a calculation to a wider variable. — Peter Cordes
– Peter Cordes, Commented May 12, 2018 at 21:54
You're right! It's fixed! Thanks. Can you make the comment an answer, or add to your previous answer? — ARaspiK
– ARaspiK, Commented May 13, 2018 at 7:15
BTW, about your 2nd comment, we later ignore the high bits of split. We only use the low bits. Test out the program! In my tests, I was only getting an output of 0 because strtoull was being passed argv[0] instead of argv[1] facepalm. Well, it's sorted out now! — ARaspiK
– ARaspiK, Commented May 13, 2018 at 7:16
done, although really my answer already pointed out the bug you're having. Finding every instance of that bug in the code you posted is a minor detail (for future readers with the same problem in different code.) — Peter Cordes
– Peter Cordes, Commented May 13, 2018 at 7:17

Peter Cordes · Accepted Answer · 2018-05-13 07:16:08Z

Yes, when the compiler shows you that some of your function outputs unexpectedly don't depend on the input, that's usually a sign that your C source doesn't mean what you thought it did.

In this case it looks like uint128_t split = num * 7922816251426433760; is the problem. num is an unsigned long (uint64_t in the x86-64 SysV ABI which you're compiling for). Thus, the * operator produces a 64-bit result which is zero-extended as an initializer for uint128_t split.

uint128_t split = (unsigned __int128) num * 7922816251426433760;

casts num to a 128-bit integer before the multiply, so you get a full 128-bit result with mulq. (gcc7.3 -O3 on Godbolt).

I didn't look into the full details of the rest of your function; there may be other problems, but that's the first one I saw.

re: update:

Is split = high * 18446744074; supposed to be split = high * (unsigned __int128)18446744074;? Looks like exactly the same bug. Check the rest of your C code for any more cases where you assign the result of a calculation to a wider variable.

Thanks! It solved the main issue of high being optimized away! But the starter code is still messed up. Thanks a lot, though!
BTW, shouldn't GCC complain about this? I tried compiling my code with -Wall -Wextra -Wpedantic but nothing showed up. How come?
@ARaspiK: You're allowed to have arithmetic overflow on unsigned types with no consequences, so the 64-bit unsigned multiplication with overflow was legitimate. The compiler assumed you meant what you said.
@ARaspiK: You're wishing that gcc would warn that uint64_t high = split >> 96; was an over-complicated way of writing high=0? In this case that's a bug, but in other cases, especially after inlining, the same logic could be a desirable optimization when two separate functions end up interacting this way. Or it could be from a macro, which the compiler itself can't see, so even when the code is in the same function it might be inappropriate to warn. The closest thing I see is -Wtype-limits which will warn about unsigned u=...; if (u < 0) or if(u>UINT_MAX).
A warning option like that might be useful for localized debugging in cases like this, but gcc doesn't have such an option (gcc.gnu.org/onlinedocs/gcc/Warning-Options.html). I didn't check clang. Normally you'd just single-step with a debugger and notice that high=0, and get back to the problem line right away. Looking at the asm output and working backwards to see that high=0 was a compile-time constant is another way to see the same thing, which is what I did.

Collectives™ on Stack Overflow

gcc -O3 removing live code in int-to-string converter

1 Answer 1

6 Comments

Linked

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

6 Comments

Linked

Related