I have written the following implementation of the Caesar cipher for an old Linux computer with a Pentium MMX chip. The code has been written with the following design goals in mind:
- the code should perform well
- to avoid timing side-channel attacks, the runtime should be independent of the data used
- the code should run on a Pentium MMX
Please let me know of possible improvements and performance pitfalls I fell in.
# MMX caeasar cipher implementation # for i586 with MMX # signature: # caesar(out, in, len, key) # key is between 0 and 25 .section .text .globl caesar .type caesar,@function .align 16 caesar: push %ebp mov %esp,%ebp push %ebx push %edi push %esi mov 8(%ebp),%edi # edi: destination mov 12(%ebp),%esi # esi: source mov 16(%ebp),%edx # edx: length and $~7,%edx # only process full qwords test %edx,%edx jz 1f xor %ecx,%ecx # ecx: index movd 20(%ebp),%mm5 punpcklbw %mm5,%mm5 punpcklwd %mm5,%mm5 punpckldq %mm5,%mm5 # mm5: key bytes movq %mm5,%mm4 psubb twentysix,%mm4 # mm4: key - 26 movq Amask,%mm6 psubb %mm4,%mm6 # mm6: 'A' - 1 - (key - 26) .align 16 0: movq (%esi,%ecx),%mm0 movq %mm0,%mm1 pand ucmask,%mm1 # mm1: xmm0 in upper case movq %mm1,%mm3 movq %mm1,%mm2 pcmpgtb Amask,%mm2 # mm2: 0xff where 'A' <= buf[i] pcmpgtb Zmask,%mm3 # mm3: 0xff where 'Z' < buf[i] pcmpgtb %mm6,%mm1 # mm1: 0xff where 'A' + key <= buf[i] pandn %mm1,%mm3 pand %mm4,%mm3 # mm3: key-26 where 'A' + (26 - key) <= buf[i] <= 'Z' pandn %mm2,%mm1 pand %mm5,%mm1 # mm1: key where 'A' <= buf[i] < 'A' + (26 - key) por %mm3,%mm1 # mm1: 0/key/key-26 paddb %mm1,%mm0 movq %mm0,(%edi,%ecx) add $8,%ecx cmp %edx,%ecx jb 0b emms add %ecx,%edi add %ecx,%esi 1: mov 16(%ebp),%edx # length mov 20(%ebp),%ecx # key and $7,%edx # loop tail # process remaining bytes add %edx,%edi # end of output buffer add %edx,%esi # end of input buffer neg %edx # index counts up to 0 lea -26(%ecx),%eax # al: key - 26 mov $'A'+26-1,%ah sub %cl,%ah # ah: threshold ('A' - 1 - (key - 26)) .align 16 0: mov (%esi,%edx),%bl and $~0x20,%bl # bl: uppercase c cmp $'Z'+1,%bl # cf: uc <= 'Z' sbb %ch,%ch # ch: 0xff if uc <= 'Z' cmp %bl,%ah # cf: thresh < uc sbb %bh,%bh # bh: 0xff if thresh < uc and %bh,%ch and %al,%ch # ch: key - 26 & isleZ & isgttthres not %bh # bh: ~isgtthresh cmp $'A',%bl cmc # cf: 'A' <= uc sbb %bl,%bl # bl: 0xff if 'A' <= uc and %bh,%bl and %cl,%bl # bl: key & isgeA & ~isgtthresh or %ch,%bl add (%esi,%edx),%bl mov %bl,(%edi,%edx) inc %edx jnz 0b 1: pop %esi pop %edi pop %ebx leave ret .size caesar,.-caesar .section .rodata.cst8,"aM",@progbits,8 .align 8 .type ucmask,@object ucmask: .fill 8, 1, ~0x20 .size ucmask, 8 .type Amask,@object Amask: .fill 8, 1, 'A' - 1 .size Amask, 8 .type Zmask,@object Zmask: .fill 8, 1, 'Z' .size Zmask, 8 .type twentysix,@object twentysix: .fill 8, 1, 26 .size twentysix, 8 You can use this harness to test the code:
#define _POSIX_C_SOURCE 200809L #include <stdio.h> #include <stdlib.h> #include <string.h> extern void caesar(const char *out, char *in, size_t len, int key); extern int main(int argc, char *argv[]) { char *buf = NULL; size_t len = 0; ssize_t count; int key = 13; if (argc > 1) key = atoi(argv[1]) % 26; while (count = getline(&buf, &len, stdin), count != EOF) { caesar(buf, buf, strlen(buf), key); fputs(buf, stdout); } return (EXIT_SUCCESS); }