Cross-post from stack overflow since this is very code intensive

**Problem**

I am learning about HPC and code optimization. I attempt to replicate the results in Goto's seminal matrix multiplication paper (Google "goto anatomy of a high performance"). Despite my best efforts, I cannot get over ~50% maximum theoretical CPU performance.

**Background**

See related issues here (stack overflow title: Optimized 2x2 matrix multiplication: Slow assembly versus fast SIMD), including info about my hardware

**What I have attempted**

This related paper (http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf) has a good description of Goto's algorithmic structure. I provide my source code below.

**My question**

I am asking for general help. I have been working on this for far too long, have tried many different algorithms, inline assembly, inner kernels of various sizes (2x2, 4x4, 2x8, ..., mxn with m and n large), yet I cannot seem to break 50% CPU Gflops. This is purely for education purposes and not a homework.

**Compile Options**

On 32 bit GCC:

`gcc -std=c99 -O3 -msse3 -ffast-math -march=nocona -mtune=nocona -funroll-loops -fomit-frame-pointer -masm=intel`

**Source Code**

Hopefully is understandable. Please ask if not. I set up the macro structure (for loops) as described in the 2nd paper above. I pack the matrices as discussed in either paper. My inner kernel computes 2x8 blocks, as this seems to be the optimal computation for Nehalem architecture (see GotoBLAS source code - kernels). The inner kernel is based on the concept of calculating rank-1 updates as described here (http://code.google.com/p/blis/source/browse/config/template/kernels/3/bli_gemm_opt_mxn.c)

 #include <stdio.h>
 #include <time.h>
 #include <stdlib.h>
 #include <string.h>
 #include <x86intrin.h>
 #include <math.h>
 #include <omp.h>
 #include <stdint.h>
 
 
 // define some prefetch functions
 #define PREFETCHNTA(addr,nrOfBytesAhead) \
 _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_NTA)
 
 #define PREFETCHT0(addr,nrOfBytesAhead) \
 _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
 
 #define PREFETCHT1(addr,nrOfBytesAhead) \
 _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T1)
 
 #define PREFETCHT2(addr,nrOfBytesAhead) \
 _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T2)
 
 // define a min function
 #ifndef min
 #define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
 #endif
 
 // zero a matrix
 void zeromat(double *C, int n)
 {
 int i = n;
 while (i--) {
 int j = n;
 while (j--) {
 *(C + i*n + j) = 0.0;
 }
 }
 }
 
 // compute a 2x8 block from (2 x kc) x (kc x 8) matrices
 inline void 
 __attribute__ ((gnu_inline)) 
 __attribute__ ((aligned(64))) dgemm_2x8_sse(
 int k,
 const double* restrict a1, const int cs_a,
 const double* restrict b1, const int rs_b,
 double* restrict c11, const int rs_c
 )
 {
 
 register __m128d xmm1, xmm4, //
 r8, r9, r10, r11, r12, r13, r14, r15; // accumulators
 
 // 10 registers declared here
 
 r8 = _mm_xor_pd(r8,r8); // ab
 r9 = _mm_xor_pd(r9,r9);
 r10 = _mm_xor_pd(r10,r10);
 r11 = _mm_xor_pd(r11,r11);
 
 r12 = _mm_xor_pd(r12,r12); // ab + 8
 r13 = _mm_xor_pd(r13,r13);
 r14 = _mm_xor_pd(r14,r14);
 r15 = _mm_xor_pd(r15,r15);
 
 // PREFETCHT2(b1,0);
 // PREFETCHT2(b1,64);
 
 
 
 //int l = k;
 while (k--) {
 
 //PREFETCHT0(a1,0); // fetch 64 bytes from a1
 
 // i = 0
 xmm1 = _mm_load1_pd(a1);
 
 xmm4 = _mm_load_pd(b1);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r8 = _mm_add_pd(r8,xmm4);
 
 xmm4 = _mm_load_pd(b1 + 2);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r9 = _mm_add_pd(r9,xmm4);
 
 xmm4 = _mm_load_pd(b1 + 4);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r10 = _mm_add_pd(r10,xmm4);
 
 xmm4 = _mm_load_pd(b1 + 6);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r11 = _mm_add_pd(r11,xmm4);
 
 //
 // i = 1
 xmm1 = _mm_load1_pd(a1 + 1);
 
 xmm4 = _mm_load_pd(b1);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r12 = _mm_add_pd(r12,xmm4);
 
 xmm4 = _mm_load_pd(b1 + 2);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r13 = _mm_add_pd(r13,xmm4);
 
 xmm4 = _mm_load_pd(b1 + 4);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r14 = _mm_add_pd(r14,xmm4);
 
 xmm4 = _mm_load_pd(b1 + 6);
 xmm4 = _mm_mul_pd(xmm1,xmm4);
 r15 = _mm_add_pd(r15,xmm4);
 
 a1 += cs_a;
 b1 += rs_b;
 
 //PREFETCHT2(b1,0);
 //PREFETCHT2(b1,64);
 
 }
 
 // copy result into C
 
 PREFETCHT0(c11,0);
 xmm1 = _mm_load_pd(c11);
 xmm1 = _mm_add_pd(xmm1,r8);
 _mm_store_pd(c11,xmm1);
 
 xmm1 = _mm_load_pd(c11 + 2);
 xmm1 = _mm_add_pd(xmm1,r9);
 _mm_store_pd(c11 + 2,xmm1);
 
 xmm1 = _mm_load_pd(c11 + 4);
 xmm1 = _mm_add_pd(xmm1,r10);
 _mm_store_pd(c11 + 4,xmm1);
 
 xmm1 = _mm_load_pd(c11 + 6);
 xmm1 = _mm_add_pd(xmm1,r11);
 _mm_store_pd(c11 + 6,xmm1);
 
 c11 += rs_c;
 
 PREFETCHT0(c11,0);
 xmm1 = _mm_load_pd(c11);
 xmm1 = _mm_add_pd(xmm1,r12);
 _mm_store_pd(c11,xmm1);
 
 xmm1 = _mm_load_pd(c11 + 2);
 xmm1 = _mm_add_pd(xmm1,r13);
 _mm_store_pd(c11 + 2,xmm1);
 
 xmm1 = _mm_load_pd(c11 + 4);
 xmm1 = _mm_add_pd(xmm1,r14);
 _mm_store_pd(c11 + 4,xmm1);
 
 xmm1 = _mm_load_pd(c11 + 6);
 xmm1 = _mm_add_pd(xmm1,r15);
 _mm_store_pd(c11 + 6,xmm1);
 
 }
 
 // packs a matrix into rows of slivers
 inline void 
 __attribute__ ((gnu_inline)) 
 __attribute__ ((aligned(64))) rpack( double* restrict dst, 
 const double* restrict src, 
 const int kc, const int mc, const int mr, const int n)
 {
 double tmp[mc*kc] __attribute__ ((aligned(64)));
 double* restrict ptr = &tmp[0];
 
 for (int i = 0; i < mc; ++i)
 for (int j = 0; j < kc; ++j)
 *ptr++ = *(src + i*n + j);
 
 ptr = &tmp[0];
 
 //const int inc_dst = mr*kc;
 for (int k = 0; k < mc; k+=mr)
 for (int j = 0; j < kc; ++j)
 for (int i = 0; i < mr*kc; i+=kc)
 *dst++ = *(ptr + k*kc + j + i);
 
 }
 
 // packs a matrix into columns of slivers
 inline void 
 __attribute__ ((gnu_inline)) 
 __attribute__ ((aligned(64))) cpack(double* restrict dst, 
 const double* restrict src, 
 const int nc, 
 const int kc, 
 const int nr, 
 const int n)
 {
 double tmp[kc*nc] __attribute__ ((aligned(64)));
 double* restrict ptr = &tmp[0];
 
 for (int i = 0; i < kc; ++i)
 for (int j = 0; j < nc; ++j)
 *ptr++ = *(src + i*n + j);
 
 ptr = &tmp[0];
 
 // const int inc_k = nc/nr;
 for (int k = 0; k < nc; k+=nr)
 for (int j = 0; j < kc*nc; j+=nc)
 for (int i = 0; i < nr; ++i)
 *dst++ = *(ptr + k + i + j);
 
 }
 
 void blis_dgemm_ref(
 const int n,
 const double* restrict A,
 const double* restrict B,
 double* restrict C,
 const int mc,
 const int nc,
 const int kc
 )
 {
 int mr = 2;
 int nr = 8;
 double locA[mc*kc] __attribute__ ((aligned(64)));
 double locB[kc*nc] __attribute__ ((aligned(64)));
 int ii,jj,kk,i,j;
 #pragma omp parallel num_threads(4) shared(A,B,C) private(ii,jj,kk,i,j,locA,locB)
 {//use all threads in parallel
 #pragma omp for
 // partitions C and B into wide column panels
 for ( jj = 0; jj < n; jj+=nc) {
 // A and the current column of B are partitioned into col and row panels
 for ( kk = 0; kk < n; kk+=kc) {
 cpack(locB, B + kk*n + jj, nc, kc, nr, n);
 // partition current panel of A into blocks
 for ( ii = 0; ii < n; ii+=mc) {
 rpack(locA, A + ii*n + kk, kc, mc, mr, n);
 for ( i = 0; i < min(n-ii,mc); i+=mr) {
 for ( j = 0; j < min(n-jj,nc); j+=nr) {
 // inner kernel that compues 2 x 8 block
 dgemm_2x8_sse( kc,
 locA + i*kc , mr,
 locB + j*kc , nr,
 C + (i+ii)*n + (j+jj), n );
 }
 }
 }
 }
 }
 }
 }
 
 double compute_gflops(const double time, const int n)
 {
 // computes the gigaflops for a square matrix-matrix multiplication
 double gflops;
 gflops = (double) (2.0*n*n*n)/time/1.0e9;
 return(gflops);
 }
 
 // ******* MAIN ********//
 void main() {
 clock_t time1, time2;
 double time3;
 double gflops;
 const int trials = 10;
 
 int nmax = 4096;
 printf("%10s %10s\n","N","Gflops/s");
 
 int mc = 128;
 int kc = 256;
 int nc = 128;
 
 for (int n = kc; n <= nmax; n+=kc) { //assuming kc is the max dim
 double *A = NULL;
 double *B = NULL;
 double *C = NULL;
 
 A = _mm_malloc (n*n * sizeof(*A),64);
 B = _mm_malloc (n*n * sizeof(*B),64);
 C = _mm_malloc (n*n * sizeof(*C),64);
 
 srand(time(NULL));
 
 // Create the matrices
 for (int i = 0; i < n; i++) {
 for (int j = 0; j < n; j++) {
 A[i*n + j] = (double) rand()/RAND_MAX;
 B[i*n + j] = (double) rand()/RAND_MAX;
 //D[j*n + i] = B[i*n + j]; // Transpose
 C[i*n + j] = 0.0;
 }
 }
 
 // warmup
 zeromat(C,n);
 blis_dgemm_ref(n,A,B,C,mc,nc,kc);
 zeromat(C,n);
 time2 = 0;
 for (int count = 0; count < trials; count++){// iterations per experiment here
 time1 = clock();
 blis_dgemm_ref(n,A,B,C,mc,nc,kc);
 time2 += clock() - time1;
 zeromat(C,n);
 }
 time3 = (double)(time2)/CLOCKS_PER_SEC/trials;
 gflops = compute_gflops(time3, n);
 printf("%10d %10f\n",n,gflops);
 
 _mm_free(A);
 _mm_free(B);
 _mm_free(C);
 
 }
 
 printf("tests are done\n");
 }