EDITED 2021-12-16 19:07: Show comparison against both uint and uchar, and also show both GCC and Clang. Add -march=native to compiler flags. Now the results seem to show that bool is as good as other integer types, but some compilers produce sub-optimal code.
EDITED 2022-01-11 18:56: After some tests, slightly changing the code can show important performance problems, more likely to be present with _Bool than uint.
For my tests, I chose unsigned types, since that's what the project was using instead of bool, but I expect signed types to behave similarly.
I'll show here the tests with unsigned char, since bool is 1 byte in my system and that reduces the difference in assembly output, and also unsigned int to compare different widths.
I tested storing an integer into one of these types (bool, unsigned char, and unsigned int), using one of these types to control a selection statement, and using one of this types as a parameter of a function.
Source code:
// repeat.h:
#pragma once #define repeat2(e) (e);(e) #define repeat4(e) repeat2(e);repeat2(e) #define repeat8(e) repeat4(e);repeat4(e) #define repeat16(e) repeat8(e);repeat8(e) #define repeat32(e) repeat16(e);repeat16(e) #define repeat64(e) repeat32(e);repeat32(e) #define repeat128(e) repeat64(e);repeat64(e) #define repeat256(e) repeat128(e);repeat128(e) #define repeat512(e) repeat256(e);repeat256(e) #define repeat1024(e) repeat512(e);repeat512(e) #define repeat(e) do \ { \ repeat16(e); \ } while (0)
// store_bool.h:
#pragma once _Bool store_bool(long n, int x);
// store_bool.c:
#include "store_bool.h" #include "repeat.h" _Bool store_bool(long n, volatile int x) { volatile _Bool b; for (long i = 0; i < n; i++) repeat(b = x); return b; }
// store_uchar.h:
#pragma once unsigned char store_uchar(long n, int x);
// store_uchar.c:
#include "store_uchar.h" #include "repeat.h" unsigned char store_uchar(long n, volatile int x) { volatile unsigned char c; for (long i = 0; i < n; i++) repeat(c = x); return c; }
// store_uint.h:
#pragma once unsigned int store_uint(long n, int x);
// store_uint.c:
#include "store_uint.h" #include "repeat.h" unsigned int store_uint(long n, volatile int x) { volatile unsigned int u; for (long i = 0; i < n; i++) repeat(u = x); return u; }
// consume_bool.h:
#pragma once int consume_bool(long n, _Bool b);
// consume_bool.c:
#include "consume_bool.h" #include "repeat.h" int consume_bool(long n, volatile _Bool b) { volatile int x = 5; for (long i = 0; i < n; i++) repeat({if (b) x = 3;}); return x; }
// consume_uchar.h:
#pragma once int consume_uchar(long n, unsigned char u);
// consume_uchar.c:
#include "consume_uchar.h" #include "repeat.h" int consume_uchar(long n, volatile unsigned char c) { volatile int x = 5; for (long i = 0; i < n; i++) repeat({if (c) x = 3;}); return x; }
// consume_uint.h:
#pragma once int consume_uint(long n, unsigned int u);
// consume_uint.c:
#include "consume_uint.h" #include "repeat.h" int consume_uint(long n, volatile unsigned int u) { volatile int x = 5; for (long i = 0; i < n; i++) repeat({if (u) x = 3;}); return x; }
// param_bool_.h:
#pragma once int param_bool_(_Bool x);
// param_bool_.c:
#include "param_bool_.h" int param_bool_(_Bool b) { return b ? 3 : 5; }
// param_bool.h:
#pragma once void param_bool(long n, _Bool b);
// param_bool.c:
#include "param_bool.h" #include "param_bool_.h" #include "repeat.h" void param_bool(long n, volatile _Bool b) { for (long i = 0; i < n; i++) repeat(param_bool_(b)); }
// param_uchar_.h:
#pragma once int param_uchar_(unsigned char c);
// param_uchar_.c:
#include "param_uchar_.h" int param_uchar_(unsigned char c) { return c ? 3 : 5; }
// param_uchar.h:
#pragma once void param_uchar(long n, unsigned char c);
// param_uchar.c:
#include "param_uchar.h" #include "param_uchar_.h" #include "repeat.h" void param_uchar(long n, volatile unsigned char c) { for (long i = 0; i < n; i++) repeat(param_bool_(c)); }
// param_uint_.h:
#pragma once int param_uint_(unsigned int u);
// param_uint_.c:
#include "param_uint_.h" int param_uint_(unsigned int u) { return u ? 3 : 5; }
// param_uint.h:
#pragma once void param_uint(long n, unsigned int u);
// param_uint.c:
#include "param_uint.h" #include "param_uint_.h" #include "repeat.h" void param_uint(long n, volatile unsigned int u) { for (long i = 0; i < n; i++) repeat(param_bool_(u)); }
// main.c:
#include <stdio.h> #include <time.h> #include "store_bool.h" #include "store_uchar.h" #include "store_uint.h" #include "consume_bool.h" #include "consume_uchar.h" #include "consume_uint.h" #include "param_bool.h" #include "param_uchar.h" #include "param_uint.h" #define measure(e) \ ({ \ clock_t t0, t1; \ double t; \ \ t0 = clock(); \ e; \ t1 = clock(); \ \ t = (double) (t1 - t0) / CLOCKS_PER_SEC; \ t; \ }) int main(int argc, char *argv[]) { double sb, sc, su; double cb, cc, cu; double pb, pc, pu; long n; if (argc != 2) exit(2); n = atol(argv[1]); sb = measure(store_bool(n, 1)); sc = measure(store_uchar(n, 1)); su = measure(store_uint(n, 1)); cb = measure(consume_bool(n, 1)); cc = measure(consume_uchar(n, 1)); cu = measure(consume_uint(n, 1)); pb = measure(param_bool(n, 1)); pc = measure(param_uchar(n, 1)); pu = measure(param_uint(n, 1)); printf("n: %li\n", n); putchar('\n'); printf("store bool: %lf\n", sb); printf("store uchar: %lf\n", sc); printf("store uint: %lf\n", su); putchar('\n'); printf("consume bool: %lf\n", cb); printf("consume uchar: %lf\n", cc); printf("consume uint: %lf\n", cu); putchar('\n'); printf("param bool: %lf\n", pb); printf("param uchar: %lf\n", pc); printf("param uint: %lf\n", pu); }
I used volatile for some variables, to avoid the compiler optimizing out the multiple assignments and tests.
Since the compiler will not unroll the loops, as they are huge, I used many (16) repeated expressions in each loop (see the repeat() macro), to reduce the impact of the loop overhead (jump instructions) in the total benchmark time.
Compiling:
$ cc -Wall -Wextra -O3 -march=native -S *.c $ cc -O3 -march=native *.s $
Assembly:
I'll cherry-pick a single one of the 16 repetitions, to simplify. If you want to see full assembly files, you can compile them yourself (I gave here enough instructions).
// store_bool.s (GCC):
movl -20(%rsp), %edx testl %edx, %edx setne %dl movb %dl, -1(%rsp)
// store_bool.s (Clang):
cmpl $0, -4(%rsp) setne -5(%rsp)
// sotre_uchar.s (GCC):
movl -20(%rsp), %edx movb %dl, -1(%rsp)
// store_uchar.s (Clang):
movl -4(%rsp), %ecx movb %cl, -5(%rsp)
// store_uint.s (GCC):
movl -20(%rsp), %edx movl %edx, -4(%rsp)
// store_uint.s (Clang):
movl -4(%rsp), %ecx movl %ecx, -8(%rsp)
From the above, uchar and uint are likely to be the same. bool has two instructions too on Clang, but they are different; that may or may not make a difference. On GCC, it clearly has 2 extra instructions compared to uchar which makes it slower.
// consume_bool.s (GCC):
movzbl -20(%rsp), %edx testb %dl, %dl je .L2 movl $3, -4(%rsp) .L2:
// consume_bool.s (Clang):
.LBB0_5: # in Loop: Header=BB0_1 Depth=1 testb $1, -5(%rsp) jne .LBB0_6 [...] .LBB0_6: # in Loop: Header=BB0_1 Depth=1 movl $3, -4(%rsp) testb $1, -5(%rsp) je .LBB0_9
(LBB0_9 is similar to LBB0_5)
// consume_uchar.s (GCC):
movzbl -20(%rsp), %edx testb %dl, %dl je .L2 movl $3, -4(%rsp) .L2:
// consume_uchar.s (Clang):
cmpb $0, -5(%rsp) je .LBB0_3 # %bb.2: # in Loop: Header=BB0_1 Depth=1 movl $3, -4(%rsp) .LBB0_3: # in Loop: Header=BB0_1 Depth=1
// consume_uint.s (GCC):
movl -20(%rsp), %edx testl %edx, %edx je .L2 movl $3, -4(%rsp) .L2:
// consume_uint.s (Clang):
cmpl $0, -4(%rsp) je .LBB0_3 # %bb.2: # in Loop: Header=BB0_1 Depth=1 movl $3, -8(%rsp) .LBB0_3: # in Loop: Header=BB0_1 Depth=1
In these cases, the assembly produced by GCC is almost identical for the 3 types, so I don't expect any difference. In Clang, bool has different code, but since it's very different, it's hard to predict if it will be faster or slower than the integers.
// param_bool_.s (GCC):
param_bool_: .LFB0: .cfi_startproc cmpb $1, %dil sbbl %eax, %eax andl $2, %eax addl $3, %eax ret .cfi_endproc .LFE0:
// param_bool_.s (Clang):
param_bool_: # @param_bool_ .cfi_startproc # %bb.0: xorb $1, %dil movzbl %dil, %eax addl %eax, %eax addl $3, %eax retq .Lfunc_end0:
// param_bool.s (GCC):
movzbl 12(%rsp), %edi call param_bool_@PLT
// param_bool.s (Clang):
movzbl 15(%rsp), %edi andl $1, %edi callq param_bool_
// param_uchar_.s (GCC):
param_uchar_: .LFB0: .cfi_startproc cmpb $1, %dil sbbl %eax, %eax andl $2, %eax addl $3, %eax ret .cfi_endproc .LFE0:
// param_uchar_.s (Clang):
param_uchar_: # @param_uchar_ .cfi_startproc # %bb.0: xorl %eax, %eax testl %edi, %edi sete %al addl %eax, %eax addl $3, %eax retq .Lfunc_end0:
// param_uchar.s (GCC):
movzbl 12(%rsp), %edi call param_uchar_@PLT
// param_uchar.s (Clang):
movzbl 15(%rsp), %edi callq param_uchar_
// param_uint_.s (GCC):
param_uint_: .LFB0: .cfi_startproc cmpl $1, %edi sbbl %eax, %eax andl $2, %eax addl $3, %eax ret .cfi_endproc .LFE0:
// param_uint_.s (Clang):
param_uint_: # @param_uint_ .cfi_startproc # %bb.0: xorl %eax, %eax testl %edi, %edi sete %al addl %eax, %eax addl $3, %eax retq .Lfunc_end0:
// param_uint.s (GCC):
movl 12(%rsp), %edi call param_uint_@PLT
// param_uint.s (Clang):
movl 12(%rsp), %edi callq param_uint_
In this case, bool should be the same as uchar since the only important thing should be the width, and we might see (or not) a difference with uint. A part from zero extending, there's not much difference. There are slight differences between GCC and Clang, however, Clang producing larger code, so I expect Clang to run slightly slower than GCC.
Timing:
// amd64, gcc-11, i5-5675C:
$ ./a.out 1073741824 store bool: 4.928789 store uchar: 4.795028 store uint: 4.803893 consume bool: 4.795776 consume uchar: 4.794873 consume uint: 4.794079 param bool: 17.713958 param uchar: 17.611229 param uint: 17.688909
// amd64, clang-13, i5-5675C:
$ ./a.out 1073741824 store bool: 4.806418 store uchar: 4.802943 store uint: 4.800172 consume bool: 4.805537 consume uchar: 4.799858 consume uint: 4.799462 param bool: 19.095543 param uchar: 17.708014 param uint: 17.782490
In 'store', as we expected, bool is slower than the other types with GCC (around 1~10%). With Clang, there's no significant difference (I've seen bool consistently be a bit slower than the others, but less than 0.5%).
In 'consume', we see no difference between types or compilers.
In 'param', times vary a lot between runs, and there's no consistency: sometimes bool is slower, and sometimes it's faster. However, GCC is consistently faster than Clang.
Slight changes in the code can lead to compilers missing important optimizations. Using the following code in consume_<type>.c, leads to some important performance loss:
repeat(x = b ? 3 : x);
Note that just by changing an if to a ternary operator, makes the compiler slow down to the following times:
GCC:
$ ./a.out 1073741824 n: 1073741824 ... consume bool: 8.684662 consume uchar: 8.683915 consume uint: 8.086806 ...
Clang:
$ ./a.out 1073741824 n: 1073741824 ... consume bool: 8.161896 consume uchar: 5.422896 consume uint: 5.127165 ...
Clang slows down considerably for _Bool, while maintaining a reasonable speed for other types. GCC seems to generate quite bad code for all the types.
Conclusion:
Programmers should consider a few things:
Performance: Even though _Bool may be theoretically as fast as unsigned int, compilers are far from being ideal, and it is likely that your compiler will miss some optimizations, which in some cases may be quite important.
Maintainability/readability/correctness: Some may argue that _Bool is safer due to autonormalization; others may argue that it is less safe due to autonormalization; just know what you're using, and form your own opinion.
Supporting pre-C99 code: If that's the case, you have no choice but using unsigned int.
bool(which in C is an alias for_Bool) is an integer type. But it does have semantics associated with it that other integer types do not have.booltypes, then use their convention.