C (GCC), 2.65× faster than mat_mul_fast
(Compiled with gcc -O3 -mavx2 -mfma, measured on my Intel Comet Lake i7-10710U.)
#include <x86intrin.h> enum { MAT_N = 64 }; void mat_mul_simd(void *_m, void *_x, void *_y) { enum { n = MAT_N }; float (*m)[n] = _m, (*x)[n] = _x, (*y)[n] = _y; for (int i = 0; i < n; ++i) { __m256 mi[n / 8]; for (int k = 0; k < n / 8; ++k) mi[k] = _mm256_setzero_ps(); for (int j = 0; j < n; ++j) { __m256 xij = _mm256_broadcast_ss(&x[i * n + j]&x[i][j]); for (int k = 0; k < n / 8; ++k) mi[k] = _mm256_fmadd_ps(xij, _mm256_load_ps(&y[j * n + k&y[j][k * 8]), mi[k]); } for (int k = 0; k < n / 8; ++k) _mm256_store_ps(&m[i&m[i][k * 8], mi[k]); } } https://godbolt.org/z/xxvr45b67
C (GCC 11)
This completely auto-vectorized code is very nearly as fast as the above in GCC 11 (gcc -O3 -mavx2 -mfma), but regresses significantly in GCC 12. Auto-vectorization is fragile. 😞
enum { MAT_N = 64 }; void mat_mul_auto(void *_m, void *_x, void *_y) { enum { n += kMAT_N *}; 8] float(*restrict m)[n] = __builtin_assume_aligned(_m, mi[k]32); float(*restrict x)[n] = __builtin_assume_aligned(_x, 32); float(*restrict y)[n] = __builtin_assume_aligned(_y, 32); for (int i = 0; i < n; ++i) { float mi[n] = {}; for (int j = 0; j < n; ++j) for (int k = 0; k < n; ++k) mi[k] += x[i][j] * y[j][k]; for (int k = 0; k < n; ++k) m[i][k] = mi[k]; } } https://godbolt.org/z/r5enWqPqThttps://godbolt.org/z/h4GEqGjGW