Skip to main content
Add auto-vectorized answer
Source Link
Anders Kaseorg
  • 40.7k
  • 3
  • 76
  • 149

C (GCC), 2.65× faster than mat_mul_fast

(Compiled with gcc -O3 -mavx2 -mfma, measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

/* m = x * y for MAT_N x MAT_N row-major float matrices, using AVX2 + FMA.
 * _m, _x, _y are treated as float[MAT_N][MAT_N]; all three must be 32-byte
 * aligned (required by the aligned _mm256_load_ps / _mm256_store_ps).
 * One output row is accumulated per outer iteration in 8 YMM registers,
 * broadcasting one element of x at a time and FMA-ing a full row of y.
 *
 * NOTE(review): this line was a diff-merged revision artifact
 * ("&x[i * n + j]&x[i][j]" etc.); reconstructed to the post-edit
 * 2D-array form, which appears verbatim later in the page. */
void mat_mul_simd(void *_m, void *_x, void *_y) {
    enum { n = MAT_N };
    float (*m)[n] = _m, (*x)[n] = _x, (*y)[n] = _y;
    for (int i = 0; i < n; ++i) {
        __m256 mi[n / 8]; /* accumulator: one output row, 8 floats per lane */
        for (int k = 0; k < n / 8; ++k)
            mi[k] = _mm256_setzero_ps();
        for (int j = 0; j < n; ++j) {
            __m256 xij = _mm256_broadcast_ss(&x[i][j]);
            for (int k = 0; k < n / 8; ++k)
                mi[k] = _mm256_fmadd_ps(xij, _mm256_load_ps(&y[j][k * 8]), mi[k]);
        }
        for (int k = 0; k < n / 8; ++k)
            _mm256_store_ps(&m[i][k * 8], mi[k]);
    }
}

https://godbolt.org/z/xxvr45b67

C (GCC 11)

This completely auto-vectorized code is very nearly as fast as the above in GCC 11 (gcc -O3 -mavx2 -mfma), but regresses significantly in GCC 12. Auto-vectorization is fragile. 😞

enum { MAT_N = 64 };

/* m = x * y for MAT_N x MAT_N row-major float matrices, written so that
 * GCC's auto-vectorizer turns the inner loops into AVX2/FMA code.
 * _m, _x, _y are treated as float[MAT_N][MAT_N]; the 32-byte alignment
 * promise (__builtin_assume_aligned) lets the compiler emit aligned
 * vector loads/stores, and restrict rules out aliasing between them.
 *
 * NOTE(review): this line was a diff-merged revision artifact
 * ("enum { n += kMAT_N *};  8]" etc.); reconstructed to the post-edit
 * form, which appears verbatim later in the page. Also spelled the
 * accumulator initializer {0} instead of the GNU-extension {} so it is
 * valid under -std=c11. */
void mat_mul_auto(void *_m, void *_x, void *_y) {
    enum { n = MAT_N };
    float (*restrict m)[n] = __builtin_assume_aligned(_m, 32);
    float (*restrict x)[n] = __builtin_assume_aligned(_x, 32);
    float (*restrict y)[n] = __builtin_assume_aligned(_y, 32);
    for (int i = 0; i < n; ++i) {
        float mi[n] = {0}; /* accumulator for output row i */
        for (int j = 0; j < n; ++j)
            for (int k = 0; k < n; ++k)
                mi[k] += x[i][j] * y[j][k];
        for (int k = 0; k < n; ++k)
            m[i][k] = mi[k];
    }
}

https://godbolt.org/z/h4GEqGjGW

C (gcc), 2.65× faster than mat_mul_fast

(Compiled with gcc -O3 -mavx2 -mfma, measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

/* Computes m = x * y for MAT_N x MAT_N row-major float matrices with
 * AVX2 + FMA intrinsics. The buffers are addressed as flat float arrays
 * and must be 32-byte aligned for the aligned vector load/store used here.
 * Each outer iteration accumulates one full output row in eight __m256
 * registers: a scalar of x is broadcast, then FMA'd against a row of y. */
void mat_mul_simd(void *_m, void *_x, void *_y) {
    enum { n = MAT_N, vecs = n / 8 };
    float *m = _m, *x = _x, *y = _y;
    for (int row = 0; row < n; ++row) {
        __m256 acc[vecs];
        for (int v = 0; v < vecs; ++v)
            acc[v] = _mm256_setzero_ps();
        for (int col = 0; col < n; ++col) {
            __m256 xval = _mm256_broadcast_ss(&x[row * n + col]);
            for (int v = 0; v < vecs; ++v)
                acc[v] = _mm256_fmadd_ps(xval,
                                         _mm256_load_ps(&y[col * n + v * 8]),
                                         acc[v]);
        }
        for (int v = 0; v < vecs; ++v)
            _mm256_store_ps(&m[row * n + v * 8], acc[v]);
    }
}

https://godbolt.org/z/r5enWqPqT

C (GCC), 2.65× faster than mat_mul_fast

(Compiled with gcc -O3 -mavx2 -mfma, measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

/* Computes m = x * y for MAT_N x MAT_N float matrices using AVX2 + FMA.
 * The void* arguments are viewed as float[MAT_N][MAT_N]; all three must
 * be 32-byte aligned because the aligned _mm256_load_ps/_mm256_store_ps
 * variants are used. One output row lives in eight __m256 accumulators
 * while a broadcast element of x is FMA'd against each row of y. */
void mat_mul_simd(void *_m, void *_x, void *_y) {
    enum { n = MAT_N, vecs = n / 8 };
    float (*m)[n] = _m, (*x)[n] = _x, (*y)[n] = _y;
    for (int row = 0; row < n; ++row) {
        __m256 acc[vecs];
        for (int v = 0; v < vecs; ++v)
            acc[v] = _mm256_setzero_ps();
        for (int col = 0; col < n; ++col) {
            __m256 xval = _mm256_broadcast_ss(&x[row][col]);
            for (int v = 0; v < vecs; ++v)
                acc[v] = _mm256_fmadd_ps(xval,
                                         _mm256_load_ps(&y[col][v * 8]),
                                         acc[v]);
        }
        for (int v = 0; v < vecs; ++v)
            _mm256_store_ps(&m[row][v * 8], acc[v]);
    }
}

https://godbolt.org/z/xxvr45b67

C (GCC 11)

This completely auto-vectorized code is very nearly as fast as the above in GCC 11 (gcc -O3 -mavx2 -mfma), but regresses significantly in GCC 12. Auto-vectorization is fragile. 😞

enum { MAT_N = 64 };

/* m = x * y for MAT_N x MAT_N row-major float matrices, structured so the
 * compiler's auto-vectorizer can do the SIMD work itself. The arguments
 * are viewed as float[MAT_N][MAT_N]; __builtin_assume_aligned promises
 * 32-byte alignment (enables aligned AVX loads/stores) and restrict
 * promises the three matrices do not overlap. */
void mat_mul_auto(void *_m, void *_x, void *_y) {
    enum { n = MAT_N };
    float (*restrict m)[n] = __builtin_assume_aligned(_m, 32);
    float (*restrict x)[n] = __builtin_assume_aligned(_x, 32);
    float (*restrict y)[n] = __builtin_assume_aligned(_y, 32);
    for (int row = 0; row < n; ++row) {
        float acc[n] = {0}; /* running sums for output row `row` */
        for (int col = 0; col < n; ++col) {
            float xv = x[row][col];
            for (int k = 0; k < n; ++k)
                acc[k] += xv * y[col][k];
        }
        for (int k = 0; k < n; ++k)
            m[row][k] = acc[k];
    }
}

https://godbolt.org/z/h4GEqGjGW

added 38 characters in body
Source Link
Anders Kaseorg
  • 40.7k
  • 3
  • 76
  • 149

C (gcc), 2.65× faster than mat_mul_fast

(Compiled with gcc -O3 -mavx2 -mfma, measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

/* AVX2 + FMA kernel computing m = x * y for MAT_N x MAT_N row-major
 * float matrices addressed as flat arrays. Requires 32-byte-aligned
 * buffers (aligned vector load/store). A whole output row is kept in
 * eight __m256 accumulators across the inner loops. */
void mat_mul_simd(void *_m, void *_x, void *_y) {
    enum { n = MAT_N, lanes = n / 8 };
    float *m = _m, *x = _x, *y = _y;
    for (int r = 0; r < n; ++r) {
        __m256 sum[lanes];
        for (int v = 0; v < lanes; ++v)
            sum[v] = _mm256_setzero_ps();
        for (int c = 0; c < n; ++c) {
            __m256 bcast = _mm256_broadcast_ss(&x[r * n + c]);
            for (int v = 0; v < lanes; ++v)
                sum[v] = _mm256_fmadd_ps(bcast,
                                         _mm256_load_ps(&y[c * n + v * 8]),
                                         sum[v]);
        }
        for (int v = 0; v < lanes; ++v)
            _mm256_store_ps(&m[r * n + v * 8], sum[v]);
    }
}

https://godbolt.org/z/r5enWqPqT

C (gcc), 2.65× faster than mat_mul_fast

(Measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

/* m = x * y for MAT_N x MAT_N row-major float matrices via AVX2 + FMA
 * intrinsics on flat float buffers; all pointers must be 32-byte aligned.
 * Strategy: per output row, broadcast one x element at a time and FMA it
 * against the matching row of y, held in eight vector accumulators. */
void mat_mul_simd(void *_m, void *_x, void *_y) {
    enum { n = MAT_N, lanes = n / 8 };
    float *m = _m, *x = _x, *y = _y;
    for (int r = 0; r < n; ++r) {
        __m256 sum[lanes];
        for (int v = 0; v < lanes; ++v)
            sum[v] = _mm256_setzero_ps();
        for (int c = 0; c < n; ++c) {
            __m256 bcast = _mm256_broadcast_ss(&x[r * n + c]);
            for (int v = 0; v < lanes; ++v)
                sum[v] = _mm256_fmadd_ps(bcast,
                                         _mm256_load_ps(&y[c * n + v * 8]),
                                         sum[v]);
        }
        for (int v = 0; v < lanes; ++v)
            _mm256_store_ps(&m[r * n + v * 8], sum[v]);
    }
}

https://godbolt.org/z/r5enWqPqT

C (gcc), 2.65× faster than mat_mul_fast

(Compiled with gcc -O3 -mavx2 -mfma, measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

/* Hand-vectorized m = x * y for MAT_N x MAT_N row-major float matrices
 * (AVX2 + FMA, flat-pointer addressing, 32-byte-aligned buffers). Each
 * outer iteration builds one output row in eight 8-float registers. */
void mat_mul_simd(void *_m, void *_x, void *_y) {
    enum { n = MAT_N, lanes = n / 8 };
    float *m = _m, *x = _x, *y = _y;
    for (int r = 0; r < n; ++r) {
        __m256 sum[lanes];
        for (int v = 0; v < lanes; ++v)
            sum[v] = _mm256_setzero_ps();
        for (int c = 0; c < n; ++c) {
            __m256 bcast = _mm256_broadcast_ss(&x[r * n + c]);
            for (int v = 0; v < lanes; ++v)
                sum[v] = _mm256_fmadd_ps(bcast,
                                         _mm256_load_ps(&y[c * n + v * 8]),
                                         sum[v]);
        }
        for (int v = 0; v < lanes; ++v)
            _mm256_store_ps(&m[r * n + v * 8], sum[v]);
    }
}

https://godbolt.org/z/r5enWqPqT

Source Link
Anders Kaseorg
  • 40.7k
  • 3
  • 76
  • 149

C (gcc), 2.65× faster than mat_mul_fast

(Measured on my Intel Comet Lake i7-10710U.)

#include <x86intrin.h>

enum { MAT_N = 64 };

/* Original revision: m = x * y for MAT_N x MAT_N row-major float
 * matrices using AVX2 + FMA on flat, 32-byte-aligned float buffers.
 * One row of the product is accumulated in eight __m256 registers. */
void mat_mul_simd(void *_m, void *_x, void *_y) {
    enum { n = MAT_N, lanes = n / 8 };
    float *m = _m, *x = _x, *y = _y;
    for (int r = 0; r < n; ++r) {
        __m256 sum[lanes];
        for (int v = 0; v < lanes; ++v)
            sum[v] = _mm256_setzero_ps();
        for (int c = 0; c < n; ++c) {
            __m256 bcast = _mm256_broadcast_ss(&x[r * n + c]);
            for (int v = 0; v < lanes; ++v)
                sum[v] = _mm256_fmadd_ps(bcast,
                                         _mm256_load_ps(&y[c * n + v * 8]),
                                         sum[v]);
        }
        for (int v = 0; v < lanes; ++v)
            _mm256_store_ps(&m[r * n + v * 8], sum[v]);
    }
}

https://godbolt.org/z/r5enWqPqT