I've disassembled your TestSpeed Java method (by the way, you ought to call it testSpeed to keep to Java naming conventions). This is on Linux (x64), but the results should be similar on Windows. Evidently, the JVM's JIT compiler has several compilation tiers, each applying more optimization than the previous one. I'm not an assembly language expert, but I think this shows that the C1 compilation generated a simple loop, similar to the one generated by the C# runtime, whereas the C2-generated code has unrolled the loop, thus speeding it up.
So, it seems, the JVM compiler optimizes better than the .NET one in this particular case.
The TestSpeed method is shown below with line numbers for cross-reference with the assembly listing:
28 private static int TestSpeed(int[] byteArray) 29 { 30 int count=0; 31 for (int element : byteArray) 32 if (element>100) 33 count++; 34 return count; 35 }
And the assembly listing generated with
java -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly
with hsdis-amd64.so in the LD_LIBRARY_PATH:
============================= C1-compiled nmethod ============================== ----------------------------------- Assembly ----------------------------------- Compiled method (c1) 498 20 % 3 org.example.App::TestSpeed @ 10 (40 bytes) total in heap [0x00007d32b5403510,0x00007d32b5403ae0] = 1488 relocation [0x00007d32b5403670,0x00007d32b54036b0] = 64 main code [0x00007d32b54036c0,0x00007d32b54038e0] = 544 stub code [0x00007d32b54038e0,0x00007d32b5403910] = 48 oops [0x00007d32b5403910,0x00007d32b5403918] = 8 metadata [0x00007d32b5403918,0x00007d32b5403920] = 8 scopes data [0x00007d32b5403920,0x00007d32b5403998] = 120 scopes pcs [0x00007d32b5403998,0x00007d32b5403ab8] = 288 dependencies [0x00007d32b5403ab8,0x00007d32b5403ac0] = 8 nul chk table [0x00007d32b5403ac0,0x00007d32b5403ae0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App' 0x00007d32b54036c0: mov %eax,-0x14000(%rsp) 0x00007d32b54036c7: push %rbp 0x00007d32b54036c8: sub $0x40,%rsp 0x00007d32b54036cc: movabs $0x7d323f400ac8,%rdi ; {metadata(method data for {method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App')} 0x00007d32b54036d6: mov 0xf4(%rdi),%ebx 0x00007d32b54036dc: add $0x2,%ebx 0x00007d32b54036df: mov %ebx,0xf4(%rdi) 0x00007d32b54036e5: and $0x7fe,%ebx 0x00007d32b54036eb: cmp $0x0,%ebx 0x00007d32b54036ee: je 0x00007d32b5403827 ;*iconst_0 {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@0 (line 30) 0x00007d32b54036f4: mov 0xc(%rsi),%edi ; implicit exception: dispatches to 0x00007d32b5403848 ;*arraylength {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@5 (line 31) 0x00007d32b54036f7: mov $0x0,%ebx 0x00007d32b54036fc: mov $0x0,%eax 0x00007d32b5403701: jmp 0x00007d32b5403791 ;*iload {reexecute=0 rethrow=0 
return_oop=0} ; - org.example.App::TestSpeed@10 (line 31) 0x00007d32b5403706: xchg %ax,%ax 0x00007d32b5403708: cmp 0xc(%rsi),%ebx ; implicit exception: dispatches to 0x00007d32b540384d 0x00007d32b540370b: jae 0x00007d32b5403857 0x00007d32b5403711: movslq %ebx,%rdx 0x00007d32b5403714: mov 0x10(%rsi,%rdx,4),%edx ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) 0x00007d32b5403718: cmp $0x64,%edx 0x00007d32b540371b: movabs $0x7d323f400ac8,%rdx ; {metadata(method data for {method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App')} 0x00007d32b5403725: mov $0x158,%rcx 0x00007d32b540372c: jle 0x00007d32b5403739 0x00007d32b5403732: mov $0x168,%rcx 0x00007d32b5403739: mov (%rdx,%rcx,1),%r8 0x00007d32b540373d: lea 0x1(%r8),%r8 0x00007d32b5403741: mov %r8,(%rdx,%rcx,1) 0x00007d32b5403745: jle 0x00007d32b540374d ;*if_icmple {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@26 (line 32) 0x00007d32b540374b: inc %eax 0x00007d32b540374d: inc %ebx 0x00007d32b540374f: movabs $0x7d323f400ac8,%rdx ; {metadata(method data for {method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App')} 0x00007d32b5403759: mov 0xf8(%rdx),%ecx 0x00007d32b540375f: add $0x2,%ecx 0x00007d32b5403762: mov %ecx,0xf8(%rdx) 0x00007d32b5403768: and $0x3ffe,%ecx 0x00007d32b540376e: cmp $0x0,%ecx 0x00007d32b5403771: je 0x00007d32b5403865 ;*goto {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@35 (line 31) 0x00007d32b5403777: mov 0x348(%r15),%r10 ; ImmutableOopMap {rsi=Oop } ;*goto {reexecute=1 rethrow=0 return_oop=0} ; - (reexecute) org.example.App::TestSpeed@35 (line 31) 0x00007d32b540377e: test %eax,(%r10) ; {poll} 0x00007d32b5403781: movabs $0x7d323f400ac8,%rdx ; {metadata(method data for {method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App')} 0x00007d32b540378b: incl 0x178(%rdx) ;*goto {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@35 (line 31) 0x00007d32b5403791: cmp 
%edi,%ebx 0x00007d32b5403793: movabs $0x7d323f400ac8,%rdx ; {metadata(method data for {method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App')} 0x00007d32b540379d: mov $0x148,%rcx 0x00007d32b54037a4: jl 0x00007d32b54037b1 0x00007d32b54037aa: mov $0x138,%rcx 0x00007d32b54037b1: mov (%rdx,%rcx,1),%r8 0x00007d32b54037b5: lea 0x1(%r8),%r8 0x00007d32b54037b9: mov %r8,(%rdx,%rcx,1) 0x00007d32b54037bd: jl 0x00007d32b5403708 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@13 (line 31) 0x00007d32b54037c3: add $0x40,%rsp 0x00007d32b54037c7: pop %rbp 0x00007d32b54037c8: cmp 0x340(%r15),%rsp ; {poll_return} 0x00007d32b54037cf: ja 0x00007d32b5403886 0x00007d32b54037d5: ret ;*ireturn {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@39 (line 34) 0x00007d32b54037d6: mov %eax,-0x14000(%rsp) 0x00007d32b54037dd: push %rbp 0x00007d32b54037de: sub $0x40,%rsp 0x00007d32b54037e2: mov 0x20(%rsi),%ebx 0x00007d32b54037e5: mov 0x18(%rsi),%rax 0x00007d32b54037e9: mov 0x10(%rsi),%edx 0x00007d32b54037ec: mov 0x8(%rsi),%ecx 0x00007d32b54037ef: mov %rsi,%rdi 0x00007d32b54037f2: mov %ebx,0x30(%rsp) 0x00007d32b54037f6: mov %rax,0x28(%rsp) 0x00007d32b54037fb: mov %edx,0x24(%rsp) 0x00007d32b54037ff: mov %ecx,0x20(%rsp) 0x00007d32b5403803: call 0x00007d32d4700830 ; {runtime_call SharedRuntime::OSR_migration_end(long*)} 0x00007d32b5403808: mov 0x20(%rsp),%ecx 0x00007d32b540380c: mov %rcx,%rbx 0x00007d32b540380f: mov 0x24(%rsp),%edx 0x00007d32b5403813: mov %rdx,%rdi 0x00007d32b5403816: mov 0x28(%rsp),%rax 0x00007d32b540381b: mov %rax,%rsi 0x00007d32b540381e: mov 0x30(%rsp),%eax 0x00007d32b5403822: jmp 0x00007d32b5403791 0x00007d32b5403827: movabs $0x7d323f4006c0,%r10 ; {metadata({method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App')} 0x00007d32b5403831: mov %r10,0x8(%rsp) 0x00007d32b5403836: movq $0xffffffffffffffff,(%rsp) 0x00007d32b540383e: call 0x00007d32bca2f280 ; ImmutableOopMap {rsi=Oop } ;*synchronization entry ; - 
org.example.App::TestSpeed@-1 (line 30) ; {runtime_call counter_overflow Runtime1 stub} 0x00007d32b5403843: jmp 0x00007d32b54036f4 0x00007d32b5403848: call 0x00007d32bca295a0 ; ImmutableOopMap {rsi=Oop } ;*arraylength {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@5 (line 31) ; {runtime_call throw_null_pointer_exception Runtime1 stub} 0x00007d32b540384d: call 0x00007d32bca295a0 ; ImmutableOopMap {rsi=Oop } ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) ; {runtime_call throw_null_pointer_exception Runtime1 stub} 0x00007d32b5403852: call 0x00007d32bca295a0 ; ImmutableOopMap {rsi=Oop } ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) ; {runtime_call throw_null_pointer_exception Runtime1 stub} 0x00007d32b5403857: mov %rbx,(%rsp) 0x00007d32b540385b: mov %rsi,0x8(%rsp) 0x00007d32b5403860: call 0x00007d32bca28ca0 ; ImmutableOopMap {rsi=Oop } ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) ; {runtime_call throw_range_check_failed Runtime1 stub} 0x00007d32b5403865: movabs $0x7d323f4006c0,%r10 ; {metadata({method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App')} 0x00007d32b540386f: mov %r10,0x8(%rsp) 0x00007d32b5403874: movq $0x23,(%rsp) 0x00007d32b540387c: call 0x00007d32bca2f280 ; ImmutableOopMap {rsi=Oop } ;*goto {reexecute=1 rethrow=0 return_oop=0} ; - (reexecute) org.example.App::TestSpeed@35 (line 31) ; {runtime_call counter_overflow Runtime1 stub} 0x00007d32b5403881: jmp 0x00007d32b5403777 0x00007d32b5403886: movabs $0x7d32b54037c8,%r10 ; {internal_word} 0x00007d32b5403890: mov %r10,0x358(%r15) 0x00007d32b5403897: jmp 0x00007d32bc98e700 ; {runtime_call SafepointBlob} 0x00007d32b540389c: nop 0x00007d32b540389d: nop 0x00007d32b540389e: mov 0x3d0(%r15),%rax 0x00007d32b54038a5: movq $0x0,0x3d0(%r15) 0x00007d32b54038b0: movq $0x0,0x3d8(%r15) 0x00007d32b54038bb: add $0x40,%rsp 0x00007d32b54038bf: pop %rbp 
0x00007d32b54038c0: jmp 0x00007d32bc99b600 ; {runtime_call unwind_exception Runtime1 stub} 0x00007d32b54038c5: hlt 0x00007d32b54038c6: hlt ... many hlt instructions as padding ... 0x00007d32b54038df: hlt [Exception Handler] 0x00007d32b54038e0: call 0x00007d32bca2b980 ; {no_reloc} 0x00007d32b54038e5: movabs $0x7d32d49cbfc8,%rdi ; {external_word} 0x00007d32b54038ef: and $0xfffffffffffffff0,%rsp 0x00007d32b54038f3: call 0x00007d32d451c6f0 ; {runtime_call MacroAssembler::debug64(char*, long, long*)} 0x00007d32b54038f8: hlt [Deopt Handler Code] 0x00007d32b54038f9: movabs $0x7d32b54038f9,%r10 ; {section_word} 0x00007d32b5403903: push %r10 0x00007d32b5403905: jmp 0x00007d32bc98d9a0 ; {runtime_call DeoptimizationBlob} 0x00007d32b540390a: hlt 0x00007d32b540390b: hlt 0x00007d32b540390c: hlt 0x00007d32b540390d: hlt 0x00007d32b540390e: hlt 0x00007d32b540390f: hlt -------------------------------------------------------------------------------- [/Disassembly]
(This section added by @PeterCordes.)
The C1 code doesn't use any cmovcc (conditional-move) or setcc (condition into 0/1 integer) instructions. It looks like it's just branching over inc instructions.
The C2 code does if-conversion (from branchy source to branchless assembly), avoiding the slowdown of branch mispredictions on your random data. (The question "Why is processing a sorted array faster than processing an unsorted array?" is a very similar problem, also looping over an array and conditionally adding.) C# is probably not doing this, which would explain it being 5x slower.
Java unrolls by 8, and does all 8 loads before any compares. This is excessive; out-of-order exec and hardware prefetch will already do a good job here; it could have avoided saving/restoring so many registers outside the loop by using cmp with memory. (It also costs some I-cache footprint, but the JIT compiler knows this is a long-running very hot loop.)
Once it has values loaded, the if(element > 100) { count++; } is implemented as count = (element>100) ? count+1 : count;, with blocks of asm like this:
# count in EDX, element in EDI mov %edx,%r11d inc %r11d # r11d = edx+1. Could have been an lea 1(%rdx), %r11d cmp $0x64,%edi # compare element against 100 cmovle %edx,%r11d # keep the un-incremented count if 100 >= element # next block uses count = r11d
This has critical path latency of at least 2 cycles from old count to new count (through the mov + inc + cmov). 3 cycles on CPUs that don't do mov-elimination, like Ice Lake (disabled for errata) and AMD Bulldozer-family. (Zen 1 does do mov-elimination on integer and XMM registers). So the bottleneck is one compare+increment per 2 clock cycles, and Zen 1 should achieve that even with ints coming all the way from DRAM, even without loop unrolling.
It would be faster to generate a 0 / 1 integer from the compare and add that. Like cmp $100, %edi ; setg %al / add %eax, %edx. (Assuming EAX is xor-zeroed once per unrolled loop.) 3 single-uop instructions vs. 4, but more importantly the critical path latency through count is only 1 cycle, an add. Out-of-order exec can handle the cmp/setg work as soon as element is ready, not needing count.
If Java had the value-range information to realize this could be an unsigned compare, it could have used cmp $101, %edi ; sbb $-1, %edx (to add 1 if no carry, or add 0 if carry, i.e. when element is unsigned-below 101). Converting from bytes to int on the fly would give the compiler more info, and avoid a separate unpack loop + cache footprint. But HotSpot isn't even looking for LEA as a copy-and-add peephole optimization, so probably won't try to use CF instead of materializing a 0/1 integer.
See also "gcc optimization flag -O3 makes code slower than -O2" for a very similar case of branchless code-gen with a longer critical path than necessary.
TL;DR: Java could be going about twice as fast if it did count += (int)(element > 100) instead of count = (element>100) ? count+1 : count;.
Auto-vectorizing with SIMD could go much faster, like GCC and Clang do.
C2 disassembly:
============================= C2-compiled nmethod ============================== ----------------------------------- Assembly ----------------------------------- Compiled method (c2) 503 21 % 4 org.example.App::TestSpeed @ 10 (40 bytes) total in heap [0x00007d32bcecc090,0x00007d32bcecc6c0] = 1584 relocation [0x00007d32bcecc1f0,0x00007d32bcecc208] = 24 main code [0x00007d32bcecc220,0x00007d32bcecc4a0] = 640 stub code [0x00007d32bcecc4a0,0x00007d32bcecc4b8] = 24 oops [0x00007d32bcecc4b8,0x00007d32bcecc4c0] = 8 metadata [0x00007d32bcecc4c0,0x00007d32bcecc4d0] = 16 scopes data [0x00007d32bcecc4d0,0x00007d32bcecc568] = 152 scopes pcs [0x00007d32bcecc568,0x00007d32bcecc6a8] = 320 dependencies [0x00007d32bcecc6a8,0x00007d32bcecc6b0] = 8 nul chk table [0x00007d32bcecc6b0,0x00007d32bcecc6c0] = 16 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x00007d323f4006c0} 'TestSpeed' '([I)I' in 'org/example/App' 0x00007d32bcecc220: call 0x00007d32d463d440 ; {runtime_call os::breakpoint()} 0x00007d32bcecc225: data16 data16 nopw 0x0(%rax,%rax,1) 0x00007d32bcecc230: mov %eax,-0x14000(%rsp) 0x00007d32bcecc237: push %rbp 0x00007d32bcecc238: sub $0x30,%rsp 0x00007d32bcecc23c: mov 0x18(%rsi),%r14 0x00007d32bcecc240: mov 0x20(%rsi),%ebp 0x00007d32bcecc243: mov 0x10(%rsi),%r13d 0x00007d32bcecc247: mov 0x8(%rsi),%ebx 0x00007d32bcecc24a: mov %rsi,%rdi 0x00007d32bcecc24d: movabs $0x7d32d4700830,%r10 0x00007d32bcecc257: call *%r10 0x00007d32bcecc25a: nopw 0x0(%rax,%rax,1) 0x00007d32bcecc260: mov 0x8(%r14),%r11d ; implicit exception: dispatches to 0x00007d32bcecc47c 0x00007d32bcecc264: cmp $0x6c38,%r11d ; {metadata({type array int})} 0x00007d32bcecc26b: jne 0x00007d32bcecc464 ;*iload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@10 (line 31) 0x00007d32bcecc271: cmp %r13d,%ebx 
0x00007d32bcecc274: jge 0x00007d32bcecc42a ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@13 (line 31) 0x00007d32bcecc27a: mov 0xc(%r14),%r10d 0x00007d32bcecc27e: mov %ebx,%r11d 0x00007d32bcecc281: inc %r11d 0x00007d32bcecc284: movslq %r11d,%r8 0x00007d32bcecc287: xor %r9d,%r9d 0x00007d32bcecc28a: test %r11d,%r11d 0x00007d32bcecc28d: cmovl %r9,%r8 0x00007d32bcecc291: mov %r8d,%r11d 0x00007d32bcecc294: cmp %r13d,%r11d 0x00007d32bcecc297: cmovg %r13d,%r11d ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) 0x00007d32bcecc29b: nopl 0x0(%rax,%rax,1) 0x00007d32bcecc2a0: cmp %r10d,%ebx 0x00007d32bcecc2a3: jae 0x00007d32bcecc444 0x00007d32bcecc2a9: mov 0x10(%r14,%rbx,4),%r9d 0x00007d32bcecc2ae: mov %ebp,%edx 0x00007d32bcecc2b0: inc %edx 0x00007d32bcecc2b2: cmp $0x64,%r9d 0x00007d32bcecc2b6: cmovle %ebp,%edx 0x00007d32bcecc2b9: inc %ebx ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@32 (line 31) 0x00007d32bcecc2bb: nopl 0x0(%rax,%rax,1) 0x00007d32bcecc2c0: cmp %r11d,%ebx 0x00007d32bcecc2c3: jge 0x00007d32bcecc2c9 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@13 (line 31) 0x00007d32bcecc2c5: mov %edx,%ebp 0x00007d32bcecc2c7: jmp 0x00007d32bcecc29b 0x00007d32bcecc2c9: movslq %r10d,%r11 0x00007d32bcecc2cc: movslq %r13d,%r8 0x00007d32bcecc2cf: cmp %r11,%r8 0x00007d32bcecc2d2: cmovl %r8,%r11 0x00007d32bcecc2d6: add $0xfffffffffffffff9,%r11 0x00007d32bcecc2da: mov $0xffffffff80000000,%r8 0x00007d32bcecc2e1: cmp $0xffffffff80000000,%r11 0x00007d32bcecc2e8: cmovl %r8,%r11 0x00007d32bcecc2ec: mov %r11d,%r11d 0x00007d32bcecc2ef: cmp %r11d,%ebx 0x00007d32bcecc2f2: jge 0x00007d32bcecc3fa ;*goto {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@35 (line 31) 0x00007d32bcecc2f8: jmp 0x00007d32bcecc31b 0x00007d32bcecc2fa: mov 0x348(%r15),%r10 ; ImmutableOopMap {r14=Oop } ;*goto {reexecute=1 rethrow=0 return_oop=0} ; - (reexecute) 
org.example.App::TestSpeed@35 (line 31) 0x00007d32bcecc301: test %eax,(%r10) ;*goto {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@35 (line 31) ; {poll} 0x00007d32bcecc304: cmp (%rsp),%ebx 0x00007d32bcecc307: jge 0x00007d32bcecc3f0 0x00007d32bcecc30d: vmovd %xmm0,%r13d 0x00007d32bcecc312: vmovd %xmm1,%r10d 0x00007d32bcecc317: mov (%rsp),%r11d 0x00007d32bcecc31b: mov %r11d,%ebp 0x00007d32bcecc31e: sub %ebx,%ebp 0x00007d32bcecc320: xor %r9d,%r9d 0x00007d32bcecc323: cmp %ebx,%r11d 0x00007d32bcecc326: cmovl %r9d,%ebp 0x00007d32bcecc32a: cmp $0x1f40,%ebp 0x00007d32bcecc330: mov $0x1f40,%r9d 0x00007d32bcecc336: cmova %r9d,%ebp 0x00007d32bcecc33a: add %ebx,%ebp 0x00007d32bcecc33c: vmovd %r13d,%xmm0 0x00007d32bcecc341: vmovd %r10d,%xmm1 0x00007d32bcecc346: mov %r11d,(%rsp) 0x00007d32bcecc34a: nopw 0x0(%rax,%rax,1) ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) 0x00007d32bcecc350: mov 0x10(%r14,%rbx,4),%r10d 0x00007d32bcecc355: movslq %ebx,%rsi 0x00007d32bcecc358: mov 0x2c(%r14,%rsi,4),%r13d 0x00007d32bcecc35d: mov 0x14(%r14,%rsi,4),%r11d 0x00007d32bcecc362: mov 0x28(%r14,%rsi,4),%r9d 0x00007d32bcecc367: mov 0x24(%r14,%rsi,4),%r8d 0x00007d32bcecc36c: mov 0x20(%r14,%rsi,4),%edi 0x00007d32bcecc371: mov 0x1c(%r14,%rsi,4),%ecx 0x00007d32bcecc376: mov 0x18(%r14,%rsi,4),%eax 0x00007d32bcecc37b: mov %edx,%esi 0x00007d32bcecc37d: inc %esi 0x00007d32bcecc37f: cmp $0x64,%r10d 0x00007d32bcecc383: cmovle %edx,%esi 0x00007d32bcecc386: mov %esi,%edx 0x00007d32bcecc388: inc %edx 0x00007d32bcecc38a: cmp $0x64,%r11d 0x00007d32bcecc38e: cmovle %esi,%edx 0x00007d32bcecc391: mov %edx,%r11d 0x00007d32bcecc394: inc %r11d 0x00007d32bcecc397: cmp $0x64,%eax 0x00007d32bcecc39a: cmovle %edx,%r11d 0x00007d32bcecc39e: mov %r11d,%edx 0x00007d32bcecc3a1: inc %edx 0x00007d32bcecc3a3: cmp $0x64,%ecx 0x00007d32bcecc3a6: cmovle %r11d,%edx 0x00007d32bcecc3aa: mov %edx,%r11d 0x00007d32bcecc3ad: inc %r11d 0x00007d32bcecc3b0: cmp $0x64,%edi 
0x00007d32bcecc3b3: cmovle %edx,%r11d 0x00007d32bcecc3b7: mov %r11d,%r10d 0x00007d32bcecc3ba: inc %r10d 0x00007d32bcecc3bd: cmp $0x64,%r8d 0x00007d32bcecc3c1: cmovle %r11d,%r10d 0x00007d32bcecc3c5: mov %r10d,%r8d 0x00007d32bcecc3c8: inc %r8d 0x00007d32bcecc3cb: cmp $0x64,%r9d 0x00007d32bcecc3cf: cmovle %r10d,%r8d 0x00007d32bcecc3d3: mov %r8d,%edx 0x00007d32bcecc3d6: inc %edx 0x00007d32bcecc3d8: cmp $0x64,%r13d 0x00007d32bcecc3dc: cmovle %r8d,%edx 0x00007d32bcecc3e0: add $0x8,%ebx ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@32 (line 31) 0x00007d32bcecc3e3: cmp %ebp,%ebx 0x00007d32bcecc3e5: jl 0x00007d32bcecc350 ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@13 (line 31) 0x00007d32bcecc3eb: jmp 0x00007d32bcecc2fa 0x00007d32bcecc3f0: vmovd %xmm0,%r13d 0x00007d32bcecc3f5: vmovd %xmm1,%r10d 0x00007d32bcecc3fa: nopw 0x0(%rax,%rax,1) 0x00007d32bcecc400: cmp %r13d,%ebx 0x00007d32bcecc403: jge 0x00007d32bcecc460 0x00007d32bcecc409: jmp 0x00007d32bcecc40e 0x00007d32bcecc40b: nop 0x00007d32bcecc40c: mov %ebp,%edx ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) 0x00007d32bcecc40e: cmp %r10d,%ebx 0x00007d32bcecc411: jae 0x00007d32bcecc446 0x00007d32bcecc413: mov 0x10(%r14,%rbx,4),%r11d 0x00007d32bcecc418: mov %edx,%ebp 0x00007d32bcecc41a: inc %ebp 0x00007d32bcecc41c: cmp $0x64,%r11d 0x00007d32bcecc420: cmovle %edx,%ebp 0x00007d32bcecc423: inc %ebx ;*iinc {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@32 (line 31) 0x00007d32bcecc425: cmp %r13d,%ebx 0x00007d32bcecc428: jl 0x00007d32bcecc40c ;*if_icmpge {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@13 (line 31) 0x00007d32bcecc42a: mov $0xffffff45,%esi 0x00007d32bcecc42f: mov %r14,(%rsp) 0x00007d32bcecc433: mov %ebx,0x10(%rsp) 0x00007d32bcecc437: mov %r13d,0x14(%rsp) 0x00007d32bcecc43c: data16 xchg %ax,%ax 0x00007d32bcecc43f: call 0x00007d32bc98d600 ; ImmutableOopMap {[0]=Oop } 
;*if_icmpge {reexecute=1 rethrow=0 return_oop=0} ; - (reexecute) org.example.App::TestSpeed@13 (line 31) ; {runtime_call UncommonTrapBlob} 0x00007d32bcecc444: mov %ebp,%edx 0x00007d32bcecc446: mov $0xffffffe4,%esi 0x00007d32bcecc44b: mov %edx,%ebp 0x00007d32bcecc44d: mov %r13d,0x8(%rsp) 0x00007d32bcecc452: mov %r14,0x10(%rsp) 0x00007d32bcecc457: mov %ebx,0x18(%rsp) 0x00007d32bcecc45b: call 0x00007d32bc98d600 ; ImmutableOopMap {[16]=Oop } ;*iaload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@19 (line 31) ; {runtime_call UncommonTrapBlob} 0x00007d32bcecc460: mov %edx,%ebp 0x00007d32bcecc462: jmp 0x00007d32bcecc42a 0x00007d32bcecc464: mov $0xffffff8d,%esi 0x00007d32bcecc469: mov %r14,(%rsp) 0x00007d32bcecc46d: mov %r13d,0x8(%rsp) 0x00007d32bcecc472: mov %ebx,0xc(%rsp) 0x00007d32bcecc476: nop 0x00007d32bcecc477: call 0x00007d32bc98d600 ; ImmutableOopMap {[0]=Oop } ;*iload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@10 (line 31) ; {runtime_call UncommonTrapBlob} 0x00007d32bcecc47c: mov $0xffffff76,%esi 0x00007d32bcecc481: mov %r13d,(%rsp) 0x00007d32bcecc485: mov %ebx,0x4(%rsp) 0x00007d32bcecc489: xchg %ax,%ax 0x00007d32bcecc48b: call 0x00007d32bc98d600 ; ImmutableOopMap {} ;*iload {reexecute=0 rethrow=0 return_oop=0} ; - org.example.App::TestSpeed@10 (line 31) ; {runtime_call UncommonTrapBlob} 0x00007d32bcecc490: hlt ... 14 more hlt instructions ... 0x00007d32bcecc49f: hlt [Exception Handler] 0x00007d32bcecc4a0: jmp 0x00007d32bc99bc00 ; {no_reloc} [Deopt Handler Code] 0x00007d32bcecc4a5: call 0x00007d32bcecc4aa 0x00007d32bcecc4aa: subq $0x5,(%rsp) 0x00007d32bcecc4af: jmp 0x00007d32bc98d9a0 ; {runtime_call DeoptimizationBlob} 0x00007d32bcecc4b4: hlt 0x00007d32bcecc4b5: hlt 0x00007d32bcecc4b6: hlt 0x00007d32bcecc4b7: hlt -------------------------------------------------------------------------------- [/Disassembly]
Call TestSpeed() once or twice before measuring it. But you should really be using a benchmarking library, especially when measuring programs that have not been compiled to native machine code. System.currentTimeMillis() is not a great way of measuring very short intervals. You should be using System.nanoTime().