I don't have any experience with Parallel, but I whipped up a test with manual threading, and it works perfectly.
/// <summary>
/// Counts how often each byte value occurs in the half-open range
/// [Start, End) of a shared buffer, on its own dedicated thread.
/// Each worker writes only to its own <see cref="Accumulator"/>, so the
/// workers need no locking; the partial histograms are merged by the caller
/// after the threads have been joined.
/// </summary>
private class Worker
{
    public Thread Thread;
    public int[] Accumulator = new int[256];
    public int Start, End;
    public byte[] Data;

    /// <summary>Stores the range and buffer, then immediately starts the counting thread.</summary>
    /// <param name="start">Index of the first byte to count (inclusive).</param>
    /// <param name="end">Index one past the last byte to count (exclusive).</param>
    /// <param name="buf">Buffer to read from; it is only read, never written.</param>
    public Worker(int start, int end, byte[] buf)
    {
        this.Start = start;
        this.End = end;
        this.Data = buf;

        // Create and start the thread last, so every field Func reads is
        // already assigned by the time the thread begins running.
        this.Thread = new Thread(Func);
        this.Thread.Start();
    }

    /// <summary>Thread body: tallies Data[Start..End) into Accumulator.</summary>
    public void Func()
    {
        for (int i = Start; i < End; i++)
        {
            this.Accumulator[this.Data[i]]++;
        }
    }
}

int NumThreads = 8;

// Base slice size per worker. Integer division truncates, so the last worker
// additionally takes the leftover bytes (see below).
int len = buf.Length / NumThreads;

var workers = new Worker[NumThreads];
for (int i = 0; i < NumThreads; i++)
{
    int start = i * len;

    // The final slice runs to the end of the buffer; otherwise the trailing
    // buf.Length % NumThreads bytes would never be counted.
    int end = (i == NumThreads - 1) ? buf.Length : start + len;

    // The constructor starts the worker's thread.
    workers[i] = new Worker(start, end, buf);
}

// Wait for every worker to finish before reading its partial histogram.
foreach (var w in workers)
{
    w.Thread.Join();
}

// Merge the per-thread histograms into the final result.
int[] accumulator = new int[256];
for (int i = 0; i < workers.Length; i++)
{
    for (int j = 0; j < accumulator.Length; j++)
    {
        accumulator[j] += workers[i].Accumulator[j];
    }
}

Results on my 720QMQ720 mobile i7:
Single-threaded time = 5.50s; 4 threads = 1.90s; 8 threads = 1.24s. Looks like it's working to me. And interestingly, even though the hyper-threaded cores share a cache, 8 threads was actually a bit faster than 4.