Optimizing multiplication of square matrices for full CPU utilization

Question

Problem

I am learning about HPC and code optimization. I attempt to replicate the results in Goto's seminal matrix multiplication paper. Despite my best efforts, I cannot get over ~50% maximum theoretical CPU performance.

Background

See related issues here, including info about my hardware.

What I have attempted

This related paper has a good description of Goto's algorithmic structure. I provide my source code below.

My question

I am asking for general help. I have been working on this for far too long, have tried many different algorithms, inline assembly, inner kernels of various sizes (2x2, 4x4, 2x8, ..., mxn with m and n large), yet I cannot seem to break 50% CPU GFLOPS. This is purely for education purposes and not a homework.

Compile Options

On 32 bit GCC:

gcc -std=c99 -O3 -msse3 -ffast-math -march=nocona -mtune=nocona -funroll-loops -fomit-frame-pointer -masm=intel

Source Code

I set up the macro structure (for loops) as described in the 2nd paper above. I pack the matrices as discussed in either paper. My inner kernel computes 2x8 blocks, as this seems to be the optimal computation for Nehalem architecture (see GotoBLAS source code - kernels). The inner kernel is based on the concept of calculating rank-1 updates as described here.

#include <stdio.h> #include <time.h> #include <stdlib.h> #include <string.h> #include <x86intrin.h> #include <math.h> #include <omp.h> #include <stdint.h> // define some prefetch functions #define PREFETCHNTA(addr,nrOfBytesAhead) \ _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_NTA) #define PREFETCHT0(addr,nrOfBytesAhead) \ _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0) #define PREFETCHT1(addr,nrOfBytesAhead) \ _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T1) #define PREFETCHT2(addr,nrOfBytesAhead) \ _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T2) // define a min function #ifndef min #define min( a, b ) ( ((a) < (b)) ? (a) : (b) ) #endif // zero a matrix void zeromat(double *C, int n) { int i = n; while (i--) { int j = n; while (j--) { *(C + i*n + j) = 0.0; } } } // compute a 2x8 block from (2 x kc) x (kc x 8) matrices inline void __attribute__ ((gnu_inline)) __attribute__ ((aligned(64))) dgemm_2x8_sse( int k, const double* restrict a1, const int cs_a, const double* restrict b1, const int rs_b, double* restrict c11, const int rs_c ) { register __m128d xmm1, xmm4, // r8, r9, r10, r11, r12, r13, r14, r15; // accumulators // 10 registers declared here r8 = _mm_xor_pd(r8,r8); // ab r9 = _mm_xor_pd(r9,r9); r10 = _mm_xor_pd(r10,r10); r11 = _mm_xor_pd(r11,r11); r12 = _mm_xor_pd(r12,r12); // ab + 8 r13 = _mm_xor_pd(r13,r13); r14 = _mm_xor_pd(r14,r14); r15 = _mm_xor_pd(r15,r15); // PREFETCHT2(b1,0); // PREFETCHT2(b1,64); //int l = k; while (k--) { //PREFETCHT0(a1,0); // fetch 64 bytes from a1 // i = 0 xmm1 = _mm_load1_pd(a1); xmm4 = _mm_load_pd(b1); xmm4 = _mm_mul_pd(xmm1,xmm4); r8 = _mm_add_pd(r8,xmm4); xmm4 = _mm_load_pd(b1 + 2); xmm4 = _mm_mul_pd(xmm1,xmm4); r9 = _mm_add_pd(r9,xmm4); xmm4 = _mm_load_pd(b1 + 4); xmm4 = _mm_mul_pd(xmm1,xmm4); r10 = _mm_add_pd(r10,xmm4); xmm4 = _mm_load_pd(b1 + 6); xmm4 = _mm_mul_pd(xmm1,xmm4); r11 = _mm_add_pd(r11,xmm4); // // i = 1 xmm1 = _mm_load1_pd(a1 + 1); xmm4 = _mm_load_pd(b1); xmm4 = _mm_mul_pd(xmm1,xmm4); r12 = _mm_add_pd(r12,xmm4); xmm4 = _mm_load_pd(b1 + 2); xmm4 = _mm_mul_pd(xmm1,xmm4); r13 = _mm_add_pd(r13,xmm4); xmm4 = _mm_load_pd(b1 + 4); xmm4 = _mm_mul_pd(xmm1,xmm4); r14 = _mm_add_pd(r14,xmm4); xmm4 = _mm_load_pd(b1 + 6); xmm4 = _mm_mul_pd(xmm1,xmm4); r15 = _mm_add_pd(r15,xmm4); a1 += cs_a; b1 += rs_b; //PREFETCHT2(b1,0); //PREFETCHT2(b1,64); } // copy result into C PREFETCHT0(c11,0); xmm1 = _mm_load_pd(c11); xmm1 = _mm_add_pd(xmm1,r8); _mm_store_pd(c11,xmm1); xmm1 = _mm_load_pd(c11 + 2); xmm1 = _mm_add_pd(xmm1,r9); _mm_store_pd(c11 + 2,xmm1); xmm1 = _mm_load_pd(c11 + 4); xmm1 = _mm_add_pd(xmm1,r10); _mm_store_pd(c11 + 4,xmm1); xmm1 = _mm_load_pd(c11 + 6); xmm1 = _mm_add_pd(xmm1,r11); _mm_store_pd(c11 + 6,xmm1); c11 += rs_c; PREFETCHT0(c11,0); xmm1 = _mm_load_pd(c11); xmm1 = _mm_add_pd(xmm1,r12); _mm_store_pd(c11,xmm1); xmm1 = _mm_load_pd(c11 + 2); xmm1 = _mm_add_pd(xmm1,r13); _mm_store_pd(c11 + 2,xmm1); xmm1 = _mm_load_pd(c11 + 4); xmm1 = _mm_add_pd(xmm1,r14); _mm_store_pd(c11 + 4,xmm1); xmm1 = _mm_load_pd(c11 + 6); xmm1 = _mm_add_pd(xmm1,r15); _mm_store_pd(c11 + 6,xmm1); } // packs a matrix into rows of slivers inline void __attribute__ ((gnu_inline)) __attribute__ ((aligned(64))) rpack( double* restrict dst, const double* restrict src, const int kc, const int mc, const int mr, const int n) { double tmp[mc*kc] __attribute__ ((aligned(64))); double* restrict ptr = &tmp[0]; for (int i = 0; i < mc; ++i) for (int j = 0; j < kc; ++j) *ptr++ = *(src + i*n + j); ptr = &tmp[0]; //const int inc_dst = mr*kc; for (int k = 0; k < mc; k+=mr) for (int j = 0; j < kc; ++j) for (int i = 0; i < mr*kc; i+=kc) *dst++ = *(ptr + k*kc + j + i); } // packs a matrix into columns of slivers inline void __attribute__ ((gnu_inline)) __attribute__ ((aligned(64))) cpack(double* restrict dst, const double* restrict src, const int nc, const int kc, const int nr, const int n) { double tmp[kc*nc] __attribute__ ((aligned(64))); double* restrict ptr = &tmp[0]; for (int i = 0; i < kc; ++i) for (int j = 0; j < nc; ++j) *ptr++ = *(src + i*n + j); ptr = &tmp[0]; // const int inc_k = nc/nr; for (int k = 0; k < nc; k+=nr) for (int j = 0; j < kc*nc; j+=nc) for (int i = 0; i < nr; ++i) *dst++ = *(ptr + k + i + j); } void blis_dgemm_ref( const int n, const double* restrict A, const double* restrict B, double* restrict C, const int mc, const int nc, const int kc ) { int mr = 2; int nr = 8; double locA[mc*kc] __attribute__ ((aligned(64))); double locB[kc*nc] __attribute__ ((aligned(64))); int ii,jj,kk,i,j; #pragma omp parallel num_threads(4) shared(A,B,C) private(ii,jj,kk,i,j,locA,locB) {//use all threads in parallel #pragma omp for // partitions C and B into wide column panels for ( jj = 0; jj < n; jj+=nc) { // A and the current column of B are partitioned into col and row panels for ( kk = 0; kk < n; kk+=kc) { cpack(locB, B + kk*n + jj, nc, kc, nr, n); // partition current panel of A into blocks for ( ii = 0; ii < n; ii+=mc) { rpack(locA, A + ii*n + kk, kc, mc, mr, n); for ( i = 0; i < min(n-ii,mc); i+=mr) { for ( j = 0; j < min(n-jj,nc); j+=nr) { // inner kernel that compues 2 x 8 block dgemm_2x8_sse( kc, locA + i*kc , mr, locB + j*kc , nr, C + (i+ii)*n + (j+jj), n ); } } } } } } } double compute_gflops(const double time, const int n) { // computes the gigaflops for a square matrix-matrix multiplication double gflops; gflops = (double) (2.0*n*n*n)/time/1.0e9; return(gflops); } // ******* MAIN ********// void main() { clock_t time1, time2; double time3; double gflops; const int trials = 10; int nmax = 4096; printf("%10s %10s\n","N","Gflops/s"); int mc = 128; int kc = 256; int nc = 128; for (int n = kc; n <= nmax; n+=kc) { //assuming kc is the max dim double *A = NULL; double *B = NULL; double *C = NULL; A = _mm_malloc (n*n * sizeof(*A),64); B = _mm_malloc (n*n * sizeof(*B),64); C = _mm_malloc (n*n * sizeof(*C),64); srand(time(NULL)); // Create the matrices for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { A[i*n + j] = (double) rand()/RAND_MAX; B[i*n + j] = (double) rand()/RAND_MAX; //D[j*n + i] = B[i*n + j]; // Transpose C[i*n + j] = 0.0; } } // warmup zeromat(C,n); blis_dgemm_ref(n,A,B,C,mc,nc,kc); zeromat(C,n); time2 = 0; for (int count = 0; count < trials; count++){// iterations per experiment here time1 = clock(); blis_dgemm_ref(n,A,B,C,mc,nc,kc); time2 += clock() - time1; zeromat(C,n); } time3 = (double)(time2)/CLOCKS_PER_SEC/trials; gflops = compute_gflops(time3, n); printf("%10d %10f\n",n,gflops); _mm_free(A); _mm_free(B); _mm_free(C); } printf("tests are done\n"); }

i5 540M. 2 cores with HT turned off. More details here: cpu-world.com/CPUs/Core_i5/… — user43443
– user43443, Commented Jun 1, 2014 at 3:20

Community · Accepted Answer · 2017-05-23 12:40:52Z

6

Without addressing performance concerns, some trivial observations:

#include <omp.h> is unnecessary. (You use OpenMP, but don't call any OpenMP functions.)
The return type of main() should be int, not void.
The code also compiles with clang (LLVM), if you omit the -masm=intel option.
zeromat() could simply be memset(C, 0, n * sizeof(double)).

When compiling with -Wall, the code in dgemm_2x8_sse() to zero some registers causes spurious warnings:

matmul.c:56:21: warning: variable 'r8' is uninitialized when used here [-Wuninitialized] r8 = _mm_xor_pd(r8,r8); // ab ^~ matmul.c:52:5: note: variable 'r8' is declared here register __m128d xmm1, xmm4, // ^

I recommend disabling the warnings with a pair of pragmas:

#pragma GCC diagnostic ignored "-Wuninitialized" r8 = _mm_xor_pd(r8,r8); // ab r9 = _mm_xor_pd(r9,r9); r10 = _mm_xor_pd(r10,r10); r11 = _mm_xor_pd(r11,r11); r12 = _mm_xor_pd(r12,r12); // ab + 8 r13 = _mm_xor_pd(r13,r13); r14 = _mm_xor_pd(r14,r14); r15 = _mm_xor_pd(r15,r15); #pragma GCC diagnostic warning "-Wuninitialized"

You should also discard the confusing and useless comment that precedes that code:

// 10 registers declared here

"Gflops/s" is redundant and incorrect terminology (unless you are talking about acceleration, not speed!)

edited May 23, 2017 at 12:40

CommunityBot

1

answered May 31, 2014 at 8:46

200_success

146k22 gold badges191 silver badges481 bronze badges

\$\begingroup\$ Regarding compiler flags, I'd argue that -march=nocona -mtune=nocona and probably -msse3 too, unless you want to force GCC to use nothing newer, are redundant. march will generate code optimized only for that CPU, while mtune for that CPU and its family. Moreover, specifying -march=cpu-type implies -mtune=cpu-type.. And march will take also care of sse. \$\endgroup\$

edmz
– edmz

2014-05-31 15:20:21 +00:00
Commented May 31, 2014 at 15:20
\$\begingroup\$ The warning on r8 is interesting, I wonder what causes it. EDIT The comment about the registers was actually important but unclear. I coded my dgemm_2x8_sse() directly in assembly (inline gcc assembly, intel syntax). I thought I would be able to use 16 xmm registers, but apparently I can only access 8. So my little note was to inform me of how many registers I declared in the C code. I was unable to reduce it to 8, so I could not code it as efficiently as I hoped in assembly. \$\endgroup\$

user43443
– user43443

2014-05-31 18:51:10 +00:00
Commented May 31, 2014 at 18:51
\$\begingroup\$ Getting compilers to do things that don't depend on the input is tricky. Older gcc will often zero out variables before you use them for something like _mm_cmpeq_si128(x,x) to generate all-ones. Instead, use _mm_setzero_pd() to generate a zeroing xorpd or equivalent with the compiler's choice of register. Similarly, you actually get better code from _mm_set1_epi32(-1) than from trying to do it directly with v=_mm_uninitialized_si128(); ones = _mm_cmpeq_epi32(v,v); See stackoverflow.com/a/32422471/224132 and the comments \$\endgroup\$

Peter Cordes
– Peter Cordes

2015-09-19 16:31:12 +00:00
Commented Sep 19, 2015 at 16:31
\$\begingroup\$ @black: -march=native doesn't fully imply -mtune=native, unfortunately. I can't find an example right now, but yesterday I was looking at g++ output that didn't put cmp/jcc next to each other with -march=native, but it did with -march=native -mtune=native. (On godbolt, where native=haswell). \$\endgroup\$

Peter Cordes
– Peter Cordes

2015-09-19 16:40:58 +00:00
Commented Sep 19, 2015 at 16:40
\$\begingroup\$ @user43443: No, the warning on r8 is the same you'd get if you did int a; int b = 2*a;. The compiler doesn't warn you if it has to spill registers to the stack. Use 64bit; the 32bit ABI is obsolete, and passes float/double on the stack, and returns them in x87 registers. Also, as you saw, you only get 8 registers in 32bit mode, regardless of ABI. (No REX prefix to give an extra bit to select the other 8 src/dest registers.) \$\endgroup\$

Peter Cordes
– Peter Cordes

2015-09-19 16:47:27 +00:00
Commented Sep 19, 2015 at 16:47

Add a comment |

Stack Exchange Network

Optimizing multiplication of square matrices for full CPU utilization

1 Answer 1

You must log in to answer this question.

Hot Network Questions

Optimizing multiplication of square matrices for full CPU utilization

1 Answer 1

You must log in to answer this question.

Related

Hot Network Questions