I was 64-bit-izing Terje Mathisen's itoa function, which takes a char* which must point to a buffer of at least 20 characters and a number, and I created this:
#define LOOP_WORK(number, shift) high *= 5; low *= 5; buf[number] = (high >> shift) + '0'; \ buf[number+10] = (low >> shift) + '0'; high &= andop; low &= andop; andop >>= 1 #define uint128_t __uint128_t #define uint64_t unsigned long void u2s(char* buf, unsigned long num) { // Split number into low/high pair. uint128_t split = num * 7922816251426433760; split += num >> 1; uint64_t high = split >> 96; uint64_t low = num - (high * 10000000000); // Transform numbers into usable decimal fractions. split = high * 18446744074; buf[0] = (split >> 64) + '0'; high = (uint64_t)split; split = low * 18446744074; buf[10] = (split >> 64) + '0'; low = (uint64_t)split; // Adjust numbers and multiply by 2 (so we don't have to multiply by 10 later) high = (high + 7) >> 3; low = (low + 7) >> 3; // Store special and number uint64_t andop = 0x0fffffffffffffff; LOOP_WORK(1, 60); LOOP_WORK(2, 59); LOOP_WORK(3, 58); LOOP_WORK(4, 57); LOOP_WORK(5, 56); LOOP_WORK(6, 55); LOOP_WORK(7, 54); LOOP_WORK(8, 53); // Final loop, without extra stuffs high *= 5; low *= 5; buf[9] = (high >> 52) + '0'; buf[19] = (low >> 52) + '0'; } #undef LOOP_WORK Here's an equivalent version in assembly (handwritten in AT&T):
u2s: // tmp128(rax:rdx) = num * 7922816251426433760 movq $7922816251426433760, %rax mulq %rsi // tmp128(rax:rdx) += num >> 1 movq %rsi, %rcx shrq $0x1, %rcx addq %rcx, %rax adcq $0x0, %rdx // high(rdx) = tmp128(rax:rdx) >> 96 shrq $32, %rdx // high(rcx); low(rsi) = num - (high * 10^10) movq %rdx, %rcx movq $10000000000, %rax mulq %rdx subq %rax, %rsi // high2(rax:rdx) = high * 18446744074 movq $18446744074, %rax mulq %rcx // buf[0] = (high2 >> 64) + '0' addb $'0', %dl movb %dl, (%rdi) // low2(rax:rdx) = low * 18446744074 movq %rax, %rcx movq $18446744074, %rax mulq %rsi // buf[10] = (low2 >> 64) + '0' addb $'0', %dl movb %dl, 10(%rdi) // high(rcx) = (u64)high2 // low(rax) = (u64)low2 // high = (high + 7) >> 3 addb $0x7, %cl shrq $0x3, %rcx // low = (low + 7) >> 3 addb $0x7, %al shrq $0x3, %rax // low (rax) *= 5 leaq (%rax,%rax,4), %rax // high(rcx) *= 5 leaq (%rcx,%rcx,4), %rcx // buf[1] = (high >> 60) + '0' movq %rcx, %rdx shrq $60, %rdx addb $'0', %dl movb %dl, 1(%rdi) // buf[11] = (low >> 60) + '0' movq %rax, %rdx shrq $60, %rdx addb $'0', %dl movb %dl, 11(%rdi) // Store number 0x0fffffffffffffff movq $0x0fffffffffffffff, %rsi // high &= 0x0fffffffffffffff andq %rsi, %rcx // low &= 0x0fffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[2] = (high >> 59) + '0' movq %rcx, %rdx shrq $59, %rdx addb $'0', %dl movb %dl, 2(%rdi) // buf[12] = (low >> 59) + '0' movq %rax, %rdx shrq $59, %rdx addb $'0', %dl movb %dl, 12(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x07ffffffffffffff andq %rsi, %rcx // low &= 0x07ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[3] = (high >> 58) + '0' movq %rcx, %rdx shrq $58, %rdx addb $'0', %dl movb %dl, 3(%rdi) // buf[13] = (low >> 58) + '0' movq %rax, %rdx shrq $58, %rdx addb $'0', %dl movb %dl, 13(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x03ffffffffffffff andq %rsi, %rcx // low &= 0x03ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[4] = (high >> 57) + '0' movq %rcx, %rdx shrq $57, %rdx addb $'0', %dl movb %dl, 4(%rdi) // buf[14] = (low >> 57) + '0' movq %rax, %rdx shrq $57, %rdx addb $'0', %dl movb %dl, 14(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x01ffffffffffffff andq %rsi, %rcx // low &= 0x01ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[5] = (high >> 56) + '0' movq %rcx, %rdx shrq $56, %rdx addb $'0', %dl movb %dl, 5(%rdi) // buf[15] = (low >> 56) + '0' movq %rax, %rdx shrq $56, %rdx addb $'0', %dl movb %dl, 15(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x00ffffffffffffff andq %rsi, %rcx // low &= 0x00ffffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[6] = (high >> 55) + '0' movq %rcx, %rdx shrq $55, %rdx addb $'0', %dl movb %dl, 6(%rdi) // buf[16] = (low >> 55) + '0' movq %rax, %rdx shrq $55, %rdx addb $'0', %dl movb %dl, 16(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x007fffffffffffff andq %rsi, %rcx // low &= 0x007fffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[7] = (high >> 54) + '0' movq %rcx, %rdx shrq $54, %rdx addb $'0', %dl movb %dl, 7(%rdi) // buf[17] = (low >> 54) + '0' movq %rax, %rdx shrq $54, %rdx addb $'0', %dl movb %dl, 17(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x003fffffffffffff andq %rsi, %rcx // low &= 0x003fffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[8] = (high >> 53) + '0' movq %rcx, %rdx shrq $53, %rdx addb $'0', %dl movb %dl, 8(%rdi) // buf[18] = (low >> 53) + '0e' movq %rax, %rdx shrq $53, %rdx addb $'0', %dl movb %dl, 18(%rdi) // update the `and` number shrq $0x1, %rsi // high &= 0x001fffffffffffff andq %rsi, %rcx // low &= 0x001fffffffffffff andq %rsi, %rax // high *= 5 leaq (%rcx,%rcx,4), %rcx // low *= 5 leaq (%rax,%rax,4), %rax // buf[9] = (high >> 52) + '0' shrq $52, %rcx addb $'0', %cl movb %cl, 9(%rdi) // buf[19] = (high >> 52) + '0' shrq $52, %rax addb $'0', %al movb %al, 19(%rdi) retq When I compile the C version in GCC with optimizations -O3, I find that code for one of the high/low variables is optimized out, it has hardcoded values for andop everywhere, and the code at the start to load in buf[0] and buf[10] inputs hard-coded values (48 and 3472328296227680304, respectively). When I run GCC with -fverbose-asm -S, I find that GCC optimized away high completely! I'm guessing that my C code's the problem (I'm not too great at C), but I don't know why. Terje Mathisen's post has it's own version in C, but it does not include the handwritten assembly optimizations also given there. Why is GCC messing me up so much?
B.T.W, here's the code from gcc (Gentoo 7.3.0-r1 p1.1) 7.3.0 (USE flags: -cilk +cxx -debug -doc +fortran -go -graphite -mpx -nls +nptl -objc -objc++ -objc-gc +openmp +pch -pgo +pie -regression-test +sanitize +ssp -vanilla +vtv) with flags -O3 -c (an objdump with flags -d):
u2s.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <u2s>: 0: 48 b8 0a fa 82 4b 04 movabs $0x44b82fa0a,%rax 7: 00 00 00 a: 48 b9 30 30 30 30 30 movabs $0x3030303030303030,%rcx 11: 30 30 30 14: c6 47 0a 30 movb $0x30,0xa(%rdi) // What? 18: 48 0f af f0 imul %rax,%rsi 1c: 48 89 0f mov %rcx,(%rdi) // What? 1f: 48 83 c6 07 add $0x7,%rsi 23: 48 c1 ee 03 shr $0x3,%rsi 27: 48 8d 04 b6 lea (%rsi,%rsi,4),%rax 2b: 48 89 c2 mov %rax,%rdx 2e: 48 c1 ea 3c shr $0x3c,%rdx 32: 83 c2 30 add $0x30,%edx 35: 88 57 0b mov %dl,0xb(%rdi) 38: 48 ba ff ff ff ff ff movabs $0xfffffffffffffff,%rdx 3f: ff ff 0f 42: 48 21 d0 and %rdx,%rax // There should be another AND 45: 48 8d 04 80 lea (%rax,%rax,4),%rax 49: 48 89 c2 mov %rax,%rdx 4c: 48 c1 ea 3b shr $0x3b,%rdx 50: 83 c2 30 add $0x30,%edx 53: 88 57 0c mov %dl,0xc(%rdi) 56: 48 ba ff ff ff ff ff movabs $0x7ffffffffffffff,%rdx 5d: ff ff 07 60: 48 21 d0 and %rdx,%rax 63: 48 8d 04 80 lea (%rax,%rax,4),%rax 67: 48 89 c2 mov %rax,%rdx 6a: 48 c1 ea 3a shr $0x3a,%rdx 6e: 83 c2 30 add $0x30,%edx 71: 88 57 0d mov %dl,0xd(%rdi) 74: 48 ba ff ff ff ff ff movabs $0x3ffffffffffffff,%rdx 7b: ff ff 03 7e: 48 21 d0 and %rdx,%rax 81: 48 8d 04 80 lea (%rax,%rax,4),%rax 85: 48 89 c2 mov %rax,%rdx 88: 48 c1 ea 39 shr $0x39,%rdx 8c: 83 c2 30 add $0x30,%edx 8f: 88 57 0e mov %dl,0xe(%rdi) 92: 48 ba ff ff ff ff ff movabs $0x1ffffffffffffff,%rdx 99: ff ff 01 9c: 48 21 d0 and %rdx,%rax 9f: 48 8d 04 80 lea (%rax,%rax,4),%rax a3: 48 89 c2 mov %rax,%rdx a6: 48 c1 ea 38 shr $0x38,%rdx aa: 83 c2 30 add $0x30,%edx ad: 88 57 0f mov %dl,0xf(%rdi) b0: 48 ba ff ff ff ff ff movabs $0xffffffffffffff,%rdx b7: ff ff 00 ba: 48 21 d0 and %rdx,%rax bd: 48 8d 04 80 lea (%rax,%rax,4),%rax c1: 48 89 c2 mov %rax,%rdx c4: 48 c1 ea 37 shr $0x37,%rdx c8: 83 c2 30 add $0x30,%edx cb: 88 57 10 mov %dl,0x10(%rdi) ce: 48 ba ff ff ff ff ff movabs $0x7fffffffffffff,%rdx d5: ff 7f 00 d8: 48 21 d0 and %rdx,%rax db: 48 8d 04 80 lea (%rax,%rax,4),%rax df: 48 89 c2 mov %rax,%rdx e2: 48 c1 ea 36 shr $0x36,%rdx e6: 83 c2 30 add $0x30,%edx e9: 88 57 11 mov %dl,0x11(%rdi) ec: 48 ba ff ff ff ff ff movabs $0x3fffffffffffff,%rdx f3: ff 3f 00 f6: 48 21 d0 and %rdx,%rax f9: 48 8d 04 80 lea (%rax,%rax,4),%rax fd: 48 89 c2 mov %rax,%rdx 100: 48 c1 ea 35 shr $0x35,%rdx 104: 83 c2 30 add $0x30,%edx 107: 88 57 12 mov %dl,0x12(%rdi) 10a: 48 ba ff ff ff ff ff movabs $0x1fffffffffffff,%rdx 111: ff 1f 00 114: 48 21 d0 and %rdx,%rax 117: ba 30 30 00 00 mov $0x3030,%edx 11c: 48 8d 04 80 lea (%rax,%rax,4),%rax 120: 66 89 57 08 mov %dx,0x8(%rdi) 124: 48 c1 e8 34 shr $0x34,%rax 128: 83 c0 30 add $0x30,%eax 12b: 88 47 13 mov %al,0x13(%rdi) 12e: c3 retq P.S: I've tested my handwritten assembly well, and so most differences between that and the objdump are mostly either code reordering or errors.
P.P.S: @Peter_Corde's answer has prevented the optimization-away of high, but the starting code is still messed up! Here's an excerpt:
0: 48 b9 e0 ea f6 5e 67 movabs $0x6df37f675ef6eae0,%rcx 7: 7f f3 6d a: 49 b8 00 e4 0b 54 02 movabs $0x2540be400,%r8 11: 00 00 00 14: c6 07 30 movb $0x30,(%rdi) // NOT GOOD 17: 48 89 c8 mov %rcx,%rax 1a: 48 89 f1 mov %rsi,%rcx 1d: c6 47 0a 30 movb $0x30,0xa(%rdi) // NOT GOOD 21: 48 f7 e6 mul %rsi 24: 48 d1 e9 shr %rcx 27: 49 89 c1 mov %rax,%r9 2a: 48 89 c8 mov %rcx,%rax 2d: 49 89 d2 mov %rdx,%r10 30: 31 d2 xor %edx,%edx 32: 4c 01 c8 add %r9,%rax 35: 48 b9 0a fa 82 4b 04 movabs $0x44b82fa0a,%rcx 3c: 00 00 00 3f: 4c 11 d2 adc %r10,%rdx 42: 48 c1 ea 20 shr $0x20,%rdx 46: 48 89 d0 mov %rdx,%rax 49: 49 0f af d0 imul %r8,%rdx 4d: 48 0f af c1 imul %rcx,%rax 51: 48 29 d6 sub %rdx,%rsi 54: 48 0f af f1 imul %rcx,%rsi 58: 48 83 c0 07 add $0x7,%rax 5c: 48 c1 e8 03 shr $0x3,%rax
split = high * 18446744074;supposed to besplit = high * (unsigned __int128)18446744074;? Looks like exactly the same bug. Check the rest of your C code for any more cases where you assign the result of a calculation to a wider variable.split. We only use the low bits. Test out the program! In my tests, I was only getting an output of 0 becausestrtoullwas being passedargv[0]instead ofargv[1]facepalm. Well, it's sorted out now!