A follow-up to my comment on 『プロセッサの性能を引き出すのは大変だよというお話(行列積チューニング)』

Posted at 2017-02-04

This is a continuation of my comment on the article 『プロセッサの性能を引き出すのは大変だよというお話(行列積チューニング)』 written by Kanbayashi Ryo.

In that article I left the following comment:

I think the use of SIMD instructions, which are tedious to work with, should be considered only after checking how much performance the compiler's auto-vectorization already delivers.

Here I put that suggestion to the test.
Let me look at how SIMD instructions can be used effectively in code 1.5.
As noted earlier, code 1.5 compiles down to

   100401600:   c4 c2 7d 19 77 c8       vbroadcastsd -0x38(%r15),%ymm6
   100401606:   c4 c1 4d 59 ba c0 24    vmulpd -0xdb40(%r10),%ymm6,%ymm7
   10040160d:   ff ff 
   10040160f:   c5 d5 58 ef             vaddpd %ymm7,%ymm5,%ymm5
   100401613:   c4 c2 7d 19 7f d0       vbroadcastsd -0x30(%r15),%ymm7
   100401619:   c4 41 45 59 82 00 44    vmulpd -0xbc00(%r10),%ymm7,%ymm8
   100401620:   ff ff 
   100401622:   c4 c1 55 58 e8          vaddpd %ymm8,%ymm5,%ymm5
   100401627:   c4 42 7d 19 47 d8       vbroadcastsd -0x28(%r15),%ymm8
   10040162d:   c4 41 3d 59 8a 40 63    vmulpd -0x9cc0(%r10),%ymm8,%ymm9
   100401634:   ff ff 
   100401636:   c4 c1 55 58 e9          vaddpd %ymm9,%ymm5,%ymm5
   10040163b:   c4 42 7d 19 4f e0       vbroadcastsd -0x20(%r15),%ymm9
   100401641:   c4 41 35 59 92 80 82    vmulpd -0x7d80(%r10),%ymm9,%ymm10
   100401648:   ff ff 
   10040164a:   c4 c1 55 58 ea          vaddpd %ymm10,%ymm5,%ymm5
   10040164f:   c4 42 7d 19 57 e8       vbroadcastsd -0x18(%r15),%ymm10
   100401655:   c4 41 2d 59 9a c0 a1    vmulpd -0x5e40(%r10),%ymm10,%ymm11
   10040165c:   ff ff 
   10040165e:   c4 c1 55 58 eb          vaddpd %ymm11,%ymm5,%ymm5

that is, a stream of separate multiply (vmulpd) and add (vaddpd) instructions.
The Core i7-5500U I used here is based on the Broadwell microarchitecture and supports fused multiply-add (FMA), so I replaced those multiply and add pairs with FMA instructions; the full change is shown in the diff below.
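
The core of that replacement is the pattern below (a minimal sketch of my own, assuming AVX2/FMA and a compile line along the lines of gcc -O3 -mavx2 -mfma; fma_step is a name invented for illustration): one element of a is broadcast into a vector register and folded into four partial sums at once with a single fused multiply-add.

#include <immintrin.h>

/* Scalar view: four partial sums  s += a[ii][kk] * b[kk][jj+0..3]  done in one step */
static inline void fma_step(__m256d *acc, double a_ik, const double *b_row)
{
    __m256d va = _mm256_set1_pd(a_ik);                        /* vbroadcastsd */
    *acc = _mm256_fmadd_pd(va, _mm256_loadu_pd(b_row), *acc); /* vfmadd231pd  */
}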

--- code1.5.c   
+++ code1.6.c   
@@ -2,6 +2,7 @@
 #include <stdlib.h>
 #include <sys/time.h>
 #include <math.h>
+#include <immintrin.h>      /* AVX operations */

 #define N 1000
 #define BLOCK_SIZE 40
@@ -47,12 +48,13 @@ double compare(double a[N][N],double b[N

 int main(void)
 {
-  static double a[N][N], b[N][N], c[N][N];
+  static double a[N][N] __attribute__ ((aligned(256)));
+  static double b[N][N] __attribute__ ((aligned(256)));
+  static double c[N][N] __attribute__ ((aligned(256)));

   int i, ii,j,jj,k,kk;
   double start,elapsed_time;
-  double s0,s1,s2,s3,s4,s5,s6,s7,s8,s9;
-  double s10,s11,s12,s13,s14,s15,s16,s17,s18,s19;
+  __m256d c0, c1, c2, c3, c4;

   for (i = 0; i < N; i++) {
     for (j = 0; j < N; j++) {
@@ -74,227 +76,66 @@ int main(void)
       for (k = 0; k < N; k+=BLOCK_SIZE) {
    for(ii=i;ii < (i + BLOCK_SIZE);ii++){
      for(jj=j;jj < (j + BLOCK_SIZE);jj+=20){
-       s0 = 0.0;
-       s1 = 0.0;
-       s2 = 0.0;
-       s3 = 0.0;
-       s4 = 0.0;
-       s5 = 0.0;
-       s6 = 0.0;
-       s7 = 0.0;
-       s8 = 0.0;
-       s9 = 0.0;
-       s10 = 0.0;
-       s11 = 0.0;
-       s12 = 0.0;
-       s13 = 0.0;
-       s14 = 0.0;
-       s15 = 0.0;
-       s16 = 0.0;
-       s17 = 0.0;
-       s18 = 0.0;
-       s19 = 0.0;
+       c0 = _mm256_setzero_pd();
+       c1 = _mm256_setzero_pd();
+       c2 = _mm256_setzero_pd();
+       c3 = _mm256_setzero_pd();
+       c4 = _mm256_setzero_pd();
        for (kk = k; kk < (k + BLOCK_SIZE); kk+=8) {
-         s0 += a[ii][kk] * b[kk][jj];
-         s0 += a[ii][kk+1] * b[kk+1][jj];
-         s0 += a[ii][kk+2] * b[kk+2][jj];
-         s0 += a[ii][kk+3] * b[kk+3][jj];
-         s0 += a[ii][kk+4] * b[kk+4][jj];
-         s0 += a[ii][kk+5] * b[kk+5][jj];
-         s0 += a[ii][kk+6] * b[kk+6][jj];
-         s0 += a[ii][kk+7] * b[kk+7][jj];
-
-         s1 += a[ii][kk] * b[kk][jj+1];
-         s1 += a[ii][kk+1] * b[kk+1][jj+1];
-         s1 += a[ii][kk+2] * b[kk+2][jj+1];
-         s1 += a[ii][kk+3] * b[kk+3][jj+1];
-         s1 += a[ii][kk+4] * b[kk+4][jj+1];
-         s1 += a[ii][kk+5] * b[kk+5][jj+1];
-         s1 += a[ii][kk+6] * b[kk+6][jj+1];
-         s1 += a[ii][kk+7] * b[kk+7][jj+1];
-
-         s2 += a[ii][kk] * b[kk][jj+2];
-         s2 += a[ii][kk+1] * b[kk+1][jj+2];
-         s2 += a[ii][kk+2] * b[kk+2][jj+2];
-         s2 += a[ii][kk+3] * b[kk+3][jj+2];
-         s2 += a[ii][kk+4] * b[kk+4][jj+2];
-         s2 += a[ii][kk+5] * b[kk+5][jj+2];
-         s2 += a[ii][kk+6] * b[kk+6][jj+2];
-         s2 += a[ii][kk+7] * b[kk+7][jj+2];
-
-         s3 += a[ii][kk] * b[kk][jj+3];
-         s3 += a[ii][kk+1] * b[kk+1][jj+3];
-         s3 += a[ii][kk+2] * b[kk+2][jj+3];
-         s3 += a[ii][kk+3] * b[kk+3][jj+3];
-         s3 += a[ii][kk+4] * b[kk+4][jj+3];
-         s3 += a[ii][kk+5] * b[kk+5][jj+3];
-         s3 += a[ii][kk+6] * b[kk+6][jj+3];
-         s3 += a[ii][kk+7] * b[kk+7][jj+3];
-
-         s4 += a[ii][kk] * b[kk][jj+4];
-         s4 += a[ii][kk+1] * b[kk+1][jj+4];
-         s4 += a[ii][kk+2] * b[kk+2][jj+4];
-         s4 += a[ii][kk+3] * b[kk+3][jj+4];
-         s4 += a[ii][kk+4] * b[kk+4][jj+4];
-         s4 += a[ii][kk+5] * b[kk+5][jj+4];
-         s4 += a[ii][kk+6] * b[kk+6][jj+4];
-         s4 += a[ii][kk+7] * b[kk+7][jj+4];
-
-         s5 += a[ii][kk] * b[kk][jj+5];
-         s5 += a[ii][kk+1] * b[kk+1][jj+5];
-         s5 += a[ii][kk+2] * b[kk+2][jj+5];
-         s5 += a[ii][kk+3] * b[kk+3][jj+5];
-         s5 += a[ii][kk+4] * b[kk+4][jj+5];
-         s5 += a[ii][kk+5] * b[kk+5][jj+5];
-         s5 += a[ii][kk+6] * b[kk+6][jj+5];
-         s5 += a[ii][kk+7] * b[kk+7][jj+5];
-
-         s6 += a[ii][kk] * b[kk][jj+6];
-         s6 += a[ii][kk+1] * b[kk+1][jj+6];
-         s6 += a[ii][kk+2] * b[kk+2][jj+6];
-         s6 += a[ii][kk+3] * b[kk+3][jj+6];
-         s6 += a[ii][kk+4] * b[kk+4][jj+6];
-         s6 += a[ii][kk+5] * b[kk+5][jj+6];
-         s6 += a[ii][kk+6] * b[kk+6][jj+6];
-         s6 += a[ii][kk+7] * b[kk+7][jj+6];
-
-         s7 += a[ii][kk] * b[kk][jj+7];
-         s7 += a[ii][kk+1] * b[kk+1][jj+7];
-         s7 += a[ii][kk+2] * b[kk+2][jj+7];
-         s7 += a[ii][kk+3] * b[kk+3][jj+7];
-         s7 += a[ii][kk+4] * b[kk+4][jj+7];
-         s7 += a[ii][kk+5] * b[kk+5][jj+7];
-         s7 += a[ii][kk+6] * b[kk+6][jj+7];
-         s7 += a[ii][kk+7] * b[kk+7][jj+7];
-
-         s8 += a[ii][kk] * b[kk][jj+8];
-         s8 += a[ii][kk+1] * b[kk+1][jj+8];
-         s8 += a[ii][kk+2] * b[kk+2][jj+8];
-         s8 += a[ii][kk+3] * b[kk+3][jj+8];
-         s8 += a[ii][kk+4] * b[kk+4][jj+8];
-         s8 += a[ii][kk+5] * b[kk+5][jj+8];
-         s8 += a[ii][kk+6] * b[kk+6][jj+8];
-         s8 += a[ii][kk+7] * b[kk+7][jj+8];
-
-         s9 += a[ii][kk] * b[kk][jj+9];
-         s9 += a[ii][kk+1] * b[kk+1][jj+9];
-         s9 += a[ii][kk+2] * b[kk+2][jj+9];
-         s9 += a[ii][kk+3] * b[kk+3][jj+9];
-         s9 += a[ii][kk+4] * b[kk+4][jj+9];
-         s9 += a[ii][kk+5] * b[kk+5][jj+9];
-         s9 += a[ii][kk+6] * b[kk+6][jj+9];
-         s9 += a[ii][kk+7] * b[kk+7][jj+9];
-
-         s10 += a[ii][kk] * b[kk][jj+10];
-         s10 += a[ii][kk+1] * b[kk+1][jj+10];
-         s10 += a[ii][kk+2] * b[kk+2][jj+10];
-         s10 += a[ii][kk+3] * b[kk+3][jj+10];
-         s10 += a[ii][kk+4] * b[kk+4][jj+10];
-         s10 += a[ii][kk+5] * b[kk+5][jj+10];
-         s10 += a[ii][kk+6] * b[kk+6][jj+10];
-         s10 += a[ii][kk+7] * b[kk+7][jj+10];
-
-         s11 += a[ii][kk] * b[kk][jj+11];
-         s11 += a[ii][kk+1] * b[kk+1][jj+11];
-         s11 += a[ii][kk+2] * b[kk+2][jj+11];
-         s11 += a[ii][kk+3] * b[kk+3][jj+11];
-         s11 += a[ii][kk+4] * b[kk+4][jj+11];
-         s11 += a[ii][kk+5] * b[kk+5][jj+11];
-         s11 += a[ii][kk+6] * b[kk+6][jj+11];
-         s11 += a[ii][kk+7] * b[kk+7][jj+11];
-
-         s12 += a[ii][kk] * b[kk][jj+12];
-         s12 += a[ii][kk+1] * b[kk+1][jj+12];
-         s12 += a[ii][kk+2] * b[kk+2][jj+12];
-         s12 += a[ii][kk+3] * b[kk+3][jj+12];
-         s12 += a[ii][kk+4] * b[kk+4][jj+12];
-         s12 += a[ii][kk+5] * b[kk+5][jj+12];
-         s12 += a[ii][kk+6] * b[kk+6][jj+12];
-         s12 += a[ii][kk+7] * b[kk+7][jj+12];
-
-         s13 += a[ii][kk] * b[kk][jj+13];
-         s13 += a[ii][kk+1] * b[kk+1][jj+13];
-         s13 += a[ii][kk+2] * b[kk+2][jj+13];
-         s13 += a[ii][kk+3] * b[kk+3][jj+13];
-         s13 += a[ii][kk+4] * b[kk+4][jj+13];
-         s13 += a[ii][kk+5] * b[kk+5][jj+13];
-         s13 += a[ii][kk+6] * b[kk+6][jj+13];
-         s13 += a[ii][kk+7] * b[kk+7][jj+13];
-
-         s14 += a[ii][kk] * b[kk][jj+14];
-         s14 += a[ii][kk+1] * b[kk+1][jj+14];
-         s14 += a[ii][kk+2] * b[kk+2][jj+14];
-         s14 += a[ii][kk+3] * b[kk+3][jj+14];
-         s14 += a[ii][kk+4] * b[kk+4][jj+14];
-         s14 += a[ii][kk+5] * b[kk+5][jj+14];
-         s14 += a[ii][kk+6] * b[kk+6][jj+14];
-         s14 += a[ii][kk+7] * b[kk+7][jj+14];
-
-         s15 += a[ii][kk] * b[kk][jj+15];
-         s15 += a[ii][kk+1] * b[kk+1][jj+15];
-         s15 += a[ii][kk+2] * b[kk+2][jj+15];
-         s15 += a[ii][kk+3] * b[kk+3][jj+15];
-         s15 += a[ii][kk+4] * b[kk+4][jj+15];
-         s15 += a[ii][kk+5] * b[kk+5][jj+15];
-         s15 += a[ii][kk+6] * b[kk+6][jj+15];
-         s15 += a[ii][kk+7] * b[kk+7][jj+15];
-
-         s16 += a[ii][kk] * b[kk][jj+16];
-         s16 += a[ii][kk+1] * b[kk+1][jj+16];
-         s16 += a[ii][kk+2] * b[kk+2][jj+16];
-         s16 += a[ii][kk+3] * b[kk+3][jj+16];
-         s16 += a[ii][kk+4] * b[kk+4][jj+16];
-         s16 += a[ii][kk+5] * b[kk+5][jj+16];
-         s16 += a[ii][kk+6] * b[kk+6][jj+16];
-         s16 += a[ii][kk+7] * b[kk+7][jj+16];
-
-         s17 += a[ii][kk] * b[kk][jj+17];
-         s17 += a[ii][kk+1] * b[kk+1][jj+17];
-         s17 += a[ii][kk+2] * b[kk+2][jj+17];
-         s17 += a[ii][kk+3] * b[kk+3][jj+17];
-         s17 += a[ii][kk+4] * b[kk+4][jj+17];
-         s17 += a[ii][kk+5] * b[kk+5][jj+17];
-         s17 += a[ii][kk+6] * b[kk+6][jj+17];
-         s17 += a[ii][kk+7] * b[kk+7][jj+17];
-
-         s18 += a[ii][kk] * b[kk][jj+18];
-         s18 += a[ii][kk+1] * b[kk+1][jj+18];
-         s18 += a[ii][kk+2] * b[kk+2][jj+18];
-         s18 += a[ii][kk+3] * b[kk+3][jj+18];
-         s18 += a[ii][kk+4] * b[kk+4][jj+18];
-         s18 += a[ii][kk+5] * b[kk+5][jj+18];
-         s18 += a[ii][kk+6] * b[kk+6][jj+18];
-         s18 += a[ii][kk+7] * b[kk+7][jj+18];
-
-         s19 += a[ii][kk] * b[kk][jj+19];
-         s19 += a[ii][kk+1] * b[kk+1][jj+19];
-         s19 += a[ii][kk+2] * b[kk+2][jj+19];
-         s19 += a[ii][kk+3] * b[kk+3][jj+19];
-         s19 += a[ii][kk+4] * b[kk+4][jj+19];
-         s19 += a[ii][kk+5] * b[kk+5][jj+19];
-         s19 += a[ii][kk+6] * b[kk+6][jj+19];
-         s19 += a[ii][kk+7] * b[kk+7][jj+19];
+         __m256d a0 = _mm256_set1_pd(a[ii+0][kk+0]);
+         c0 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+16]), c4);
+         __m256d a1 = _mm256_set1_pd(a[ii+0][kk+1]);
+         c0 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+16]), c4);
+         __m256d a2 = _mm256_set1_pd(a[ii+0][kk+2]);
+         c0 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+16]), c4);
+         __m256d a3 = _mm256_set1_pd(a[ii+0][kk+3]);
+         c0 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+16]), c4);
+         __m256d a4 = _mm256_set1_pd(a[ii+0][kk+4]);
+         c0 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+16]), c4);
+         __m256d a5 = _mm256_set1_pd(a[ii+0][kk+5]);
+         c0 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+16]), c4);
+         __m256d a6 = _mm256_set1_pd(a[ii+0][kk+6]);
+         c0 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+16]), c4);
+         __m256d a7 = _mm256_set1_pd(a[ii+0][kk+7]);
+         c0 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 0]), c0);
+         c1 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 4]), c1);
+         c2 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 8]), c2);
+         c3 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+12]), c3);
+         c4 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+16]), c4);
        }
-       c[ii][jj] += s0;
-       c[ii][jj+1] += s1;
-       c[ii][jj+2] += s2;
-       c[ii][jj+3] += s3;
-       c[ii][jj+4] += s4;
-       c[ii][jj+5] += s5;
-       c[ii][jj+6] += s6;
-       c[ii][jj+7] += s7;
-       c[ii][jj+8] += s8;
-       c[ii][jj+9] += s9;
-       c[ii][jj+10] += s10;
-       c[ii][jj+11] += s11;
-       c[ii][jj+12] += s12;
-       c[ii][jj+13] += s13;
-       c[ii][jj+14] += s14;
-       c[ii][jj+15] += s15;
-       c[ii][jj+16] += s16;
-       c[ii][jj+17] += s17;
-       c[ii][jj+18] += s18;
-       c[ii][jj+19] += s19;
+       _mm256_store_pd(&c[ii][jj+ 0], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 0]), c0));
+       _mm256_store_pd(&c[ii][jj+ 4], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 4]), c1));
+       _mm256_store_pd(&c[ii][jj+ 8], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 8]), c2));
+       _mm256_store_pd(&c[ii][jj+12], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+12]), c3));
+       _mm256_store_pd(&c[ii][jj+16], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+16]), c4));
      }
    }
       }

Running this gives

$ ./code1.6
elapsed 0.101164 sec
diff is 0.000000

which is about a 12% speedup over code 1.5. Replacing the separate multiply + add pairs with fused multiply-adds should roughly halve the cost of the floating-point operations, so the fact that the overall gain is much smaller suggests that memory access is the bottleneck.
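
A rough back-of-the-envelope check of that claim (my own estimate, not part of the original article): an N = 1000 double-precision matrix product performs 2N^3 = 2×10^9 floating-point operations, so 0.101164 sec corresponds to roughly 20 GFLOPS. A Broadwell core with two 256-bit FMA units can in principle retire 16 double-precision FLOPs per cycle, on the order of 40 GFLOPS at this CPU's clock, so the kernel is running at about half of peak, which fits the memory-bottleneck explanation.

/* Back-of-the-envelope estimate (not part of the measured programs):
   achieved FLOP rate of the code 1.6 run shown above. */
#include <stdio.h>

int main(void)
{
    double n = 1000.0, elapsed = 0.101164;                     /* from ./code1.6 */
    printf("%.1f GFLOPS\n", 2.0 * n * n * n / elapsed / 1e9);  /* ~19.8          */
    return 0;
}
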
The innermost loop of code 1.6 was compiled to

   1004015a0:   c4 c2 7d 19 70 c8       vbroadcastsd -0x38(%r8),%ymm6
   1004015a6:   c4 c2 cd b8 91 c0 24    vfmadd231pd -0xdb40(%r9),%ymm6,%ymm2
   1004015ad:   ff ff
   1004015af:   c4 c2 cd b8 a1 e0 24    vfmadd231pd -0xdb20(%r9),%ymm6,%ymm4
   1004015b6:   ff ff
   1004015b8:   c4 c2 cd b8 a9 00 25    vfmadd231pd -0xdb00(%r9),%ymm6,%ymm5
   1004015bf:   ff ff
   1004015c1:   c4 c2 cd b8 99 20 25    vfmadd231pd -0xdae0(%r9),%ymm6,%ymm3
   1004015c8:   ff ff
   1004015ca:   c4 c2 cd b8 89 40 25    vfmadd231pd -0xdac0(%r9),%ymm6,%ymm1
   1004015d1:   ff ff
   1004015d3:   c4 c2 7d 19 70 d0       vbroadcastsd -0x30(%r8),%ymm6
   1004015d9:   c4 c2 cd b8 91 00 44    vfmadd231pd -0xbc00(%r9),%ymm6,%ymm2
   1004015e0:   ff ff
   1004015e2:   c4 c2 cd b8 a1 20 44    vfmadd231pd -0xbbe0(%r9),%ymm6,%ymm4
   1004015e9:   ff ff
   1004015eb:   c4 c2 cd b8 a9 40 44    vfmadd231pd -0xbbc0(%r9),%ymm6,%ymm5
   1004015f2:   ff ff
   1004015f4:   c4 c2 cd b8 99 60 44    vfmadd231pd -0xbba0(%r9),%ymm6,%ymm3
   1004015fb:   ff ff
   1004015fd:   c4 c2 cd b8 89 80 44    vfmadd231pd -0xbb80(%r9),%ymm6,%ymm1
   100401604:   ff ff
   100401606:   c4 c2 7d 19 70 d8       vbroadcastsd -0x28(%r8),%ymm6
   10040160c:   c4 c2 cd b8 91 40 63    vfmadd231pd -0x9cc0(%r9),%ymm6,%ymm2
   100401613:   ff ff
   100401615:   c4 c2 cd b8 a1 60 63    vfmadd231pd -0x9ca0(%r9),%ymm6,%ymm4
   10040161c:   ff ff
   10040161e:   c4 c2 cd b8 a9 80 63    vfmadd231pd -0x9c80(%r9),%ymm6,%ymm5
   100401625:   ff ff
   100401627:   c4 c2 cd b8 99 a0 63    vfmadd231pd -0x9c60(%r9),%ymm6,%ymm3
   10040162e:   ff ff
   100401630:   c4 c2 cd b8 89 c0 63    vfmadd231pd -0x9c40(%r9),%ymm6,%ymm1
   100401637:   ff ff
   100401639:   c4 c2 7d 19 70 e0       vbroadcastsd -0x20(%r8),%ymm6
   10040163f:   c4 c2 cd b8 91 80 82    vfmadd231pd -0x7d80(%r9),%ymm6,%ymm2
   100401646:   ff ff
   100401648:   c4 c2 cd b8 a1 a0 82    vfmadd231pd -0x7d60(%r9),%ymm6,%ymm4
   10040164f:   ff ff
   100401651:   c4 c2 cd b8 a9 c0 82    vfmadd231pd -0x7d40(%r9),%ymm6,%ymm5
   100401658:   ff ff
   10040165a:   c4 c2 cd b8 99 e0 82    vfmadd231pd -0x7d20(%r9),%ymm6,%ymm3
   100401661:   ff ff
   100401663:   c4 c2 cd b8 89 00 83    vfmadd231pd -0x7d00(%r9),%ymm6,%ymm1
   10040166a:   ff ff
   10040166c:   c4 c2 7d 19 70 e8       vbroadcastsd -0x18(%r8),%ymm6
   100401672:   c4 c2 cd b8 91 c0 a1    vfmadd231pd -0x5e40(%r9),%ymm6,%ymm2
   100401679:   ff ff
   10040167b:   c4 c2 cd b8 a1 e0 a1    vfmadd231pd -0x5e20(%r9),%ymm6,%ymm4
   100401682:   ff ff
   100401684:   c4 c2 cd b8 a9 00 a2    vfmadd231pd -0x5e00(%r9),%ymm6,%ymm5
   10040168b:   ff ff
   10040168d:   c4 c2 cd b8 99 20 a2    vfmadd231pd -0x5de0(%r9),%ymm6,%ymm3
   100401694:   ff ff
   100401696:   c4 c2 cd b8 89 40 a2    vfmadd231pd -0x5dc0(%r9),%ymm6,%ymm1
   10040169d:   ff ff
   10040169f:   c4 c2 7d 19 70 f0       vbroadcastsd -0x10(%r8),%ymm6
   1004016a5:   c4 c2 cd b8 91 00 c1    vfmadd231pd -0x3f00(%r9),%ymm6,%ymm2
   1004016ac:   ff ff
   1004016ae:   c4 c2 cd b8 a1 20 c1    vfmadd231pd -0x3ee0(%r9),%ymm6,%ymm4
   1004016b5:   ff ff
   1004016b7:   c4 c2 cd b8 a9 40 c1    vfmadd231pd -0x3ec0(%r9),%ymm6,%ymm5
   1004016be:   ff ff
   1004016c0:   c4 c2 cd b8 99 60 c1    vfmadd231pd -0x3ea0(%r9),%ymm6,%ymm3
   1004016c7:   ff ff
   1004016c9:   c4 c2 cd b8 89 80 c1    vfmadd231pd -0x3e80(%r9),%ymm6,%ymm1
   1004016d0:   ff ff
   1004016d2:   c4 c2 7d 19 70 f8       vbroadcastsd -0x8(%r8),%ymm6
   1004016d8:   c4 c2 cd b8 91 40 e0    vfmadd231pd -0x1fc0(%r9),%ymm6,%ymm2
   1004016df:   ff ff
   1004016e1:   c4 c2 cd b8 a1 60 e0    vfmadd231pd -0x1fa0(%r9),%ymm6,%ymm4
   1004016e8:   ff ff
   1004016ea:   c4 c2 cd b8 a9 80 e0    vfmadd231pd -0x1f80(%r9),%ymm6,%ymm5
   1004016f1:   ff ff
   1004016f3:   c4 c2 cd b8 99 a0 e0    vfmadd231pd -0x1f60(%r9),%ymm6,%ymm3
   1004016fa:   ff ff
   1004016fc:   c4 c2 cd b8 89 c0 e0    vfmadd231pd -0x1f40(%r9),%ymm6,%ymm1
   100401703:   ff ff
   100401705:   c4 c2 7d 19 30          vbroadcastsd (%r8),%ymm6
   10040170a:   c4 c2 cd b8 51 80       vfmadd231pd -0x80(%r9),%ymm6,%ymm2
   100401710:   c4 c2 cd b8 61 a0       vfmadd231pd -0x60(%r9),%ymm6,%ymm4
   100401716:   c4 c2 cd b8 69 c0       vfmadd231pd -0x40(%r9),%ymm6,%ymm5
   10040171c:   c4 c2 cd b8 59 e0       vfmadd231pd -0x20(%r9),%ymm6,%ymm3
   100401722:   c4 c2 cd b8 09          vfmadd231pd (%r9),%ymm6,%ymm1
   100401727:   48 83 c0 08             add    $0x8,%rax
   10040172b:   49 83 c0 40             add    $0x40,%r8
   10040172f:   49 81 c1 00 fa 00 00    add    $0xfa00,%r9
   100401736:   4c 39 f0                cmp    %r14,%rax
   100401739:   0f 8c 61 fe ff ff       jl     1004015a0 <main+0x1d0>

That is the code that was generated, but only %ymm0 through %ymm6 of the 16 SIMD registers are in use, so there is still plenty of register headroom; let me push the unrolling of the jj loop further.

--- code1.6.c   
+++ code1.7.c   
@@ -54,7 +54,7 @@ int main(void)

   int i, ii,j,jj,k,kk;
   double start,elapsed_time;
-  __m256d c0, c1, c2, c3, c4;
+  __m256d c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;

   for (i = 0; i < N; i++) {
     for (j = 0; j < N; j++) {
@@ -75,12 +75,17 @@ int main(void)
     for (j = 0; j < N; j+=BLOCK_SIZE) {
       for (k = 0; k < N; k+=BLOCK_SIZE) {
    for(ii=i;ii < (i + BLOCK_SIZE);ii++){
-     for(jj=j;jj < (j + BLOCK_SIZE);jj+=20){
+     jj=j;{
        c0 = _mm256_setzero_pd();
        c1 = _mm256_setzero_pd();
        c2 = _mm256_setzero_pd();
        c3 = _mm256_setzero_pd();
        c4 = _mm256_setzero_pd();
+       c5 = _mm256_setzero_pd();
+       c6 = _mm256_setzero_pd();
+       c7 = _mm256_setzero_pd();
+       c8 = _mm256_setzero_pd();
+       c9 = _mm256_setzero_pd();
        for (kk = k; kk < (k + BLOCK_SIZE); kk+=8) {
          __m256d a0 = _mm256_set1_pd(a[ii+0][kk+0]);
          c0 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 0]), c0);
@@ -88,54 +93,99 @@ int main(void)
          c2 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+36]), c9);
          __m256d a1 = _mm256_set1_pd(a[ii+0][kk+1]);
          c0 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 0]), c0);
          c1 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 4]), c1);
          c2 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+36]), c9);
          __m256d a2 = _mm256_set1_pd(a[ii+0][kk+2]);
          c0 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 0]), c0);
          c1 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 4]), c1);
          c2 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+36]), c9);
          __m256d a3 = _mm256_set1_pd(a[ii+0][kk+3]);
          c0 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 0]), c0);
          c1 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 4]), c1);
          c2 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+36]), c9);
          __m256d a4 = _mm256_set1_pd(a[ii+0][kk+4]);
          c0 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 0]), c0);
          c1 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 4]), c1);
          c2 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+36]), c9);
          __m256d a5 = _mm256_set1_pd(a[ii+0][kk+5]);
          c0 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 0]), c0);
          c1 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 4]), c1);
          c2 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+36]), c9);
          __m256d a6 = _mm256_set1_pd(a[ii+0][kk+6]);
          c0 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 0]), c0);
          c1 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 4]), c1);
          c2 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+36]), c9);
          __m256d a7 = _mm256_set1_pd(a[ii+0][kk+7]);
          c0 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 0]), c0);
          c1 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 4]), c1);
          c2 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 8]), c2);
          c3 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+12]), c3);
          c4 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+16]), c4);
+         c5 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+20]), c5);
+         c6 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+24]), c6);
+         c7 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+28]), c7);
+         c8 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+32]), c8);
+         c9 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+36]), c9);
        }
        _mm256_store_pd(&c[ii][jj+ 0], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 0]), c0));
        _mm256_store_pd(&c[ii][jj+ 4], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 4]), c1));
        _mm256_store_pd(&c[ii][jj+ 8], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 8]), c2));
        _mm256_store_pd(&c[ii][jj+12], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+12]), c3));
        _mm256_store_pd(&c[ii][jj+16], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+16]), c4));
+       _mm256_store_pd(&c[ii][jj+20], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+20]), c5));
+       _mm256_store_pd(&c[ii][jj+24], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+24]), c6));
+       _mm256_store_pd(&c[ii][jj+28], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+28]), c7));
+       _mm256_store_pd(&c[ii][jj+32], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+32]), c8));
+       _mm256_store_pd(&c[ii][jj+36], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+36]), c9));
      }
    }
       }

Running this gives the following result.

$ ./code1.7
elapsed 0.077831 sec
diff is 0.000000

This is roughly a 23% speedup over code 1.6. Looking at the generated code,

   1004015e0:   c4 42 7d 19 5d c8       vbroadcastsd -0x38(%r13),%ymm11
   1004015e6:   c4 c2 a5 b8 af 20 24    vfmadd231pd -0xdbe0(%r15),%ymm11,%ymm5
   1004015ed:   ff ff
   1004015ef:   c4 42 a5 b8 87 40 24    vfmadd231pd -0xdbc0(%r15),%ymm11,%ymm8
   1004015f6:   ff ff
   1004015f8:   c4 c2 a5 b8 bf 60 24    vfmadd231pd -0xdba0(%r15),%ymm11,%ymm7
   1004015ff:   ff ff
   100401601:   c4 42 a5 b8 97 80 24    vfmadd231pd -0xdb80(%r15),%ymm11,%ymm10
   100401608:   ff ff
   10040160a:   c4 42 a5 b8 8f a0 24    vfmadd231pd -0xdb60(%r15),%ymm11,%ymm9
   100401611:   ff ff
   100401613:   c4 c2 a5 b8 b7 c0 24    vfmadd231pd -0xdb40(%r15),%ymm11,%ymm6
   10040161a:   ff ff
   10040161c:   c4 c2 a5 b8 a7 e0 24    vfmadd231pd -0xdb20(%r15),%ymm11,%ymm4
   100401623:   ff ff
   100401625:   c4 c2 a5 b8 9f 00 25    vfmadd231pd -0xdb00(%r15),%ymm11,%ymm3
   10040162c:   ff ff
   10040162e:   c4 c2 a5 b8 97 20 25    vfmadd231pd -0xdae0(%r15),%ymm11,%ymm2
   100401635:   ff ff
   100401637:   c4 c2 a5 b8 8f 40 25    vfmadd231pd -0xdac0(%r15),%ymm11,%ymm1
   10040163e:   ff ff
   100401640:   c4 42 7d 19 5d d0       vbroadcastsd -0x30(%r13),%ymm11
   100401646:   c4 c2 a5 b8 af 60 43    vfmadd231pd -0xbca0(%r15),%ymm11,%ymm5
   10040164d:   ff ff
   10040164f:   c4 42 a5 b8 87 80 43    vfmadd231pd -0xbc80(%r15),%ymm11,%ymm8
   100401656:   ff ff
   100401658:   c4 c2 a5 b8 bf a0 43    vfmadd231pd -0xbc60(%r15),%ymm11,%ymm7
   10040165f:   ff ff
   100401661:   c4 42 a5 b8 97 c0 43    vfmadd231pd -0xbc40(%r15),%ymm11,%ymm10
   100401668:   ff ff
   10040166a:   c4 42 a5 b8 8f e0 43    vfmadd231pd -0xbc20(%r15),%ymm11,%ymm9
   100401671:   ff ff
   100401673:   c4 c2 a5 b8 b7 00 44    vfmadd231pd -0xbc00(%r15),%ymm11,%ymm6
   10040167a:   ff ff
   10040167c:   c4 c2 a5 b8 a7 20 44    vfmadd231pd -0xbbe0(%r15),%ymm11,%ymm4
   100401683:   ff ff
   100401685:   c4 c2 a5 b8 9f 40 44    vfmadd231pd -0xbbc0(%r15),%ymm11,%ymm3
   10040168c:   ff ff
   10040168e:   c4 c2 a5 b8 97 60 44    vfmadd231pd -0xbba0(%r15),%ymm11,%ymm2
   100401695:   ff ff
   100401697:   c4 c2 a5 b8 8f 80 44    vfmadd231pd -0xbb80(%r15),%ymm11,%ymm1
   10040169e:   ff ff
   1004016a0:   c4 42 7d 19 5d d8       vbroadcastsd -0x28(%r13),%ymm11
   1004016a6:   c4 c2 a5 b8 af a0 62    vfmadd231pd -0x9d60(%r15),%ymm11,%ymm5
   1004016ad:   ff ff
   1004016af:   c4 42 a5 b8 87 c0 62    vfmadd231pd -0x9d40(%r15),%ymm11,%ymm8
   1004016b6:   ff ff
   1004016b8:   c4 c2 a5 b8 bf e0 62    vfmadd231pd -0x9d20(%r15),%ymm11,%ymm7
   1004016bf:   ff ff
   1004016c1:   c4 42 a5 b8 97 00 63    vfmadd231pd -0x9d00(%r15),%ymm11,%ymm10
   1004016c8:   ff ff
   1004016ca:   c4 42 a5 b8 8f 20 63    vfmadd231pd -0x9ce0(%r15),%ymm11,%ymm9
   1004016d1:   ff ff
   1004016d3:   c4 c2 a5 b8 b7 40 63    vfmadd231pd -0x9cc0(%r15),%ymm11,%ymm6
   1004016da:   ff ff
   1004016dc:   c4 c2 a5 b8 a7 60 63    vfmadd231pd -0x9ca0(%r15),%ymm11,%ymm4
   1004016e3:   ff ff
   1004016e5:   c4 c2 a5 b8 9f 80 63    vfmadd231pd -0x9c80(%r15),%ymm11,%ymm3
   1004016ec:   ff ff
   1004016ee:   c4 c2 a5 b8 97 a0 63    vfmadd231pd -0x9c60(%r15),%ymm11,%ymm2
   1004016f5:   ff ff
   1004016f7:   c4 c2 a5 b8 8f c0 63    vfmadd231pd -0x9c40(%r15),%ymm11,%ymm1
   1004016fe:   ff ff
   100401700:   c4 42 7d 19 5d e0       vbroadcastsd -0x20(%r13),%ymm11
   100401706:   c4 c2 a5 b8 af e0 81    vfmadd231pd -0x7e20(%r15),%ymm11,%ymm5
   10040170d:   ff ff
   10040170f:   c4 42 a5 b8 87 00 82    vfmadd231pd -0x7e00(%r15),%ymm11,%ymm8
   100401716:   ff ff
   100401718:   c4 c2 a5 b8 bf 20 82    vfmadd231pd -0x7de0(%r15),%ymm11,%ymm7
   10040171f:   ff ff
   100401721:   c4 42 a5 b8 97 40 82    vfmadd231pd -0x7dc0(%r15),%ymm11,%ymm10
   100401728:   ff ff
   10040172a:   c4 42 a5 b8 8f 60 82    vfmadd231pd -0x7da0(%r15),%ymm11,%ymm9
   100401731:   ff ff
   100401733:   c4 c2 a5 b8 b7 80 82    vfmadd231pd -0x7d80(%r15),%ymm11,%ymm6
   10040173a:   ff ff
   10040173c:   c4 c2 a5 b8 a7 a0 82    vfmadd231pd -0x7d60(%r15),%ymm11,%ymm4
   100401743:   ff ff
   100401745:   c4 c2 a5 b8 9f c0 82    vfmadd231pd -0x7d40(%r15),%ymm11,%ymm3
   10040174c:   ff ff
   10040174e:   c4 c2 a5 b8 97 e0 82    vfmadd231pd -0x7d20(%r15),%ymm11,%ymm2
   100401755:   ff ff
   100401757:   c4 c2 a5 b8 8f 00 83    vfmadd231pd -0x7d00(%r15),%ymm11,%ymm1
   10040175e:   ff ff
   100401760:   c4 42 7d 19 5d e8       vbroadcastsd -0x18(%r13),%ymm11
   100401766:   c4 c2 a5 b8 af 20 a1    vfmadd231pd -0x5ee0(%r15),%ymm11,%ymm5
   10040176d:   ff ff
   10040176f:   c4 42 a5 b8 87 40 a1    vfmadd231pd -0x5ec0(%r15),%ymm11,%ymm8
   100401776:   ff ff
   100401778:   c4 c2 a5 b8 bf 60 a1    vfmadd231pd -0x5ea0(%r15),%ymm11,%ymm7
   10040177f:   ff ff
   100401781:   c4 42 a5 b8 97 80 a1    vfmadd231pd -0x5e80(%r15),%ymm11,%ymm10
   100401788:   ff ff
   10040178a:   c4 42 a5 b8 8f a0 a1    vfmadd231pd -0x5e60(%r15),%ymm11,%ymm9
   100401791:   ff ff
   100401793:   c4 c2 a5 b8 b7 c0 a1    vfmadd231pd -0x5e40(%r15),%ymm11,%ymm6
   10040179a:   ff ff
   10040179c:   c4 c2 a5 b8 a7 e0 a1    vfmadd231pd -0x5e20(%r15),%ymm11,%ymm4
   1004017a3:   ff ff
   1004017a5:   c4 c2 a5 b8 9f 00 a2    vfmadd231pd -0x5e00(%r15),%ymm11,%ymm3
   1004017ac:   ff ff
   1004017ae:   c4 c2 a5 b8 97 20 a2    vfmadd231pd -0x5de0(%r15),%ymm11,%ymm2
   1004017b5:   ff ff
   1004017b7:   c4 c2 a5 b8 8f 40 a2    vfmadd231pd -0x5dc0(%r15),%ymm11,%ymm1
   1004017be:   ff ff
   1004017c0:   c4 42 7d 19 5d f0       vbroadcastsd -0x10(%r13),%ymm11
   1004017c6:   c4 c2 a5 b8 af 60 c0    vfmadd231pd -0x3fa0(%r15),%ymm11,%ymm5
   1004017cd:   ff ff
   1004017cf:   c4 42 a5 b8 87 80 c0    vfmadd231pd -0x3f80(%r15),%ymm11,%ymm8
   1004017d6:   ff ff
   1004017d8:   c4 c2 a5 b8 bf a0 c0    vfmadd231pd -0x3f60(%r15),%ymm11,%ymm7
   1004017df:   ff ff
   1004017e1:   c4 42 a5 b8 97 c0 c0    vfmadd231pd -0x3f40(%r15),%ymm11,%ymm10
   1004017e8:   ff ff
   1004017ea:   c4 42 a5 b8 8f e0 c0    vfmadd231pd -0x3f20(%r15),%ymm11,%ymm9
   1004017f1:   ff ff
   1004017f3:   c4 c2 a5 b8 b7 00 c1    vfmadd231pd -0x3f00(%r15),%ymm11,%ymm6
   1004017fa:   ff ff
   1004017fc:   c4 c2 a5 b8 a7 20 c1    vfmadd231pd -0x3ee0(%r15),%ymm11,%ymm4
   100401803:   ff ff
   100401805:   c4 c2 a5 b8 9f 40 c1    vfmadd231pd -0x3ec0(%r15),%ymm11,%ymm3
   10040180c:   ff ff
   10040180e:   c4 c2 a5 b8 97 60 c1    vfmadd231pd -0x3ea0(%r15),%ymm11,%ymm2
   100401815:   ff ff
   100401817:   c4 c2 a5 b8 8f 80 c1    vfmadd231pd -0x3e80(%r15),%ymm11,%ymm1
   10040181e:   ff ff
   100401820:   c4 42 7d 19 5d f8       vbroadcastsd -0x8(%r13),%ymm11
   100401826:   c4 c2 a5 b8 af a0 df    vfmadd231pd -0x2060(%r15),%ymm11,%ymm5
   10040182d:   ff ff
   10040182f:   c4 42 a5 b8 87 c0 df    vfmadd231pd -0x2040(%r15),%ymm11,%ymm8
   100401836:   ff ff
   100401838:   c4 c2 a5 b8 bf e0 df    vfmadd231pd -0x2020(%r15),%ymm11,%ymm7
   10040183f:   ff ff
   100401841:   c4 42 a5 b8 97 00 e0    vfmadd231pd -0x2000(%r15),%ymm11,%ymm10
   100401848:   ff ff
   10040184a:   c4 42 a5 b8 8f 20 e0    vfmadd231pd -0x1fe0(%r15),%ymm11,%ymm9
   100401851:   ff ff
   100401853:   c4 c2 a5 b8 b7 40 e0    vfmadd231pd -0x1fc0(%r15),%ymm11,%ymm6
   10040185a:   ff ff
   10040185c:   c4 c2 a5 b8 a7 60 e0    vfmadd231pd -0x1fa0(%r15),%ymm11,%ymm4
   100401863:   ff ff
   100401865:   c4 c2 a5 b8 9f 80 e0    vfmadd231pd -0x1f80(%r15),%ymm11,%ymm3
   10040186c:   ff ff
   10040186e:   c4 c2 a5 b8 97 a0 e0    vfmadd231pd -0x1f60(%r15),%ymm11,%ymm2
   100401875:   ff ff
   100401877:   c4 c2 a5 b8 8f c0 e0    vfmadd231pd -0x1f40(%r15),%ymm11,%ymm1
   10040187e:   ff ff
   100401880:   c4 42 7d 19 5d 00       vbroadcastsd 0x0(%r13),%ymm11
   100401886:   c4 c2 a5 b8 af e0 fe    vfmadd231pd -0x120(%r15),%ymm11,%ymm5
   10040188d:   ff ff
   10040188f:   c4 42 a5 b8 87 00 ff    vfmadd231pd -0x100(%r15),%ymm11,%ymm8
   100401896:   ff ff
   100401898:   c4 c2 a5 b8 bf 20 ff    vfmadd231pd -0xe0(%r15),%ymm11,%ymm7
   10040189f:   ff ff
   1004018a1:   c4 42 a5 b8 97 40 ff    vfmadd231pd -0xc0(%r15),%ymm11,%ymm10
   1004018a8:   ff ff
   1004018aa:   c4 42 a5 b8 8f 60 ff    vfmadd231pd -0xa0(%r15),%ymm11,%ymm9
   1004018b1:   ff ff
   1004018b3:   c4 c2 a5 b8 77 80       vfmadd231pd -0x80(%r15),%ymm11,%ymm6
   1004018b9:   c4 c2 a5 b8 67 a0       vfmadd231pd -0x60(%r15),%ymm11,%ymm4
   1004018bf:   c4 c2 a5 b8 5f c0       vfmadd231pd -0x40(%r15),%ymm11,%ymm3
   1004018c5:   c4 c2 a5 b8 57 e0       vfmadd231pd -0x20(%r15),%ymm11,%ymm2
   1004018cb:   c4 c2 a5 b8 0f          vfmadd231pd (%r15),%ymm11,%ymm1
   1004018d0:   48 83 c0 08             add    $0x8,%rax
   1004018d4:   49 83 c5 40             add    $0x40,%r13
   1004018d8:   49 81 c7 00 fa 00 00    add    $0xfa00,%r15
   1004018df:   4c 39 e0                cmp    %r12,%rax
   1004018e2:   0f 8c f8 fc ff ff       jl     1004015e0 <main+0x210>

the inner loop is now essentially an uninterrupted run of FMA instructions. Registers up through %ymm11 are also used now, so the efficiency of the floating-point execution should have improved. To push the speed any further, I expect the memory access pattern would have to be reconsidered.
Using SIMD intrinsics directly may not work in every environment, and it can also become a drag when more powerful instruction sets appear or compilers improve, so their use should be weighed against maintainability and related concerns.
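
One common way to soften that portability concern (a sketch of my own, not something from the original article) is to check the CPU features at run time and fall back to a plain C kernel when AVX2/FMA are unavailable; gcc and clang provide __builtin_cpu_supports for exactly this:

#include <stdio.h>

int main(void)
{
    __builtin_cpu_init();   /* initialize the CPU feature data (gcc/clang built-in) */
    if (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma"))
        puts("using the hand-written FMA kernel");
    else
        puts("falling back to the plain C kernel and the compiler's auto-vectorization");
    return 0;
}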

The results so far are summarized below.

Code       Result         Change
Code 1     0.473149 sec
Code 1.1   0.386477 sec   Unrolled the jj loop instead of the ii loop, taking into account that adjacent memory accesses are more efficient
Code 1.2   0.197167 sec   Increased the unroll factor from 2 iterations to 4
Code 1.3   0.190590 sec   Changed the kk loop unrolling from 5 iterations to 4
Code 1.4   0.143134 sec   Increased the jj and kk loop unrolling from 4 iterations each to 8
Code 1.5   0.115160 sec   Increased the jj loop unrolling to 20 iterations
Code 1.6   0.101164 sec   Replaced the multiply + add sequences with fused multiply-add instructions
Code 1.7   0.077831 sec   Fully unrolled the jj loop (40 iterations)
Code 2     1.064653 sec
Code 3     1.192261 sec
Code 4     2.136027 sec
Code 5     0.323239 sec