I have an array of 4 longs that I am wanting to count the number of set bits within a given range. This is the function that I am currently using (where bitcount(uint64_t) is an inline asm function that gives the number of set bits in the argument):
unsigned count_bits(const uint64_t *long_ptr, size_t begin, size_t end) { uint64_t start_mask = ~((1L << (begin & 63)) - 1); uint64_t end_mask = ((1L << (end & 63)) - 1); if (begin >= 0 && begin < 64) { if (end < 64) { return bitcount(long_ptr[0] & start_mask & end_mask); } else if (end < 128) { return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1] & end_mask); } else if (end < 192) { return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1]) + bitcount(long_ptr[2] & end_mask); } else if (end<256) { return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1]) + bitcount(long_ptr[2]) + bitcount(long_ptr[3] & end_mask); } else { return bitcount(long_ptr[0] & start_mask) + bitcount(long_ptr[1]) + bitcount(long_ptr[2]) + bitcount(long_ptr[3]); } } else if (begin >= 64 && begin < 128) { if (end < 128) { return bitcount(long_ptr[1] & start_mask & end_mask); } else if (end < 192) { return bitcount(long_ptr[1] & start_mask) + bitcount(long_ptr[2] & end_mask); } else if (end < 256) { return bitcount(long_ptr[1] & start_mask) + bitcount(long_ptr[2]) + bitcount(long_ptr[3] & end_mask); } else { return bitcount(long_ptr[1] & start_mask) + bitcount(long_ptr[2]) + bitcount(long_ptr[3]); } } else if (begin >= 128 && begin < 192) { if (end < 192) { return bitcount(long_ptr[2] & start_mask & end_mask); } else if (end < 256) { return bitcount(long_ptr[2] & start_mask) + bitcount(long_ptr[3] & end_mask); } else { return bitcount(long_ptr[2] & start_mask) + bitcount(long_ptr[3]); } } else if (begin<256) { if (end < 256) { return bitcount(long_ptr[3] & start_mask & end_mask); } else { return bitcount(long_ptr[3] & start_mask); } } else { return 0; } } I am finding that performance of this code is quite good, but I am wondering if there is anything I can do to make it faster, or if a redesign of the algorithm could result in a performance boost.
1ULL, not1Lassert(begin <= end && end < 256); uint32_t tb = begin / 64; uint32_t te = end / 64; uint32_t ret = 0; uint64_t r; for (r = long_ptr[tb] & start_mask; tb < te ; tb++, r = long_ptr[tb]) { ret += bitcount(r); } return ret + bitcount(r & end_mask);? While OP code avoids the loop, I believe this has no more compares/jumps. And being smaller, may not pollute the code cache as much. Might be worth benchmarking, anyway.