チューニング
SIMD
AVX
行列積
FMA

『プロセッサの性能を引き出すのは大変だよというお話(行列積チューニング)』へのコメント続き

More than 1 year has passed since last update.

Kanbayashi Ryo氏の書かれた記事『プロセッサの性能を引き出すのは大変だよというお話(行列積チューニング)』へのコメントの続きです。

先の記事に私が書いたコメント


コンパイラによる自動ベクトル化でどの程度性能が上がるかを確認した上で、作業的に煩雑な SIMD 命令の使用を検討すべきと思います。


について検証してみました。

コード1.5 に対する SIMD 命令の効果的な使用を考えてみます。

コード1.5 の内容は先にも書いたように

   100401600:   c4 c2 7d 19 77 c8       vbroadcastsd -0x38(%r15),%ymm6

100401606: c4 c1 4d 59 ba c0 24 vmulpd -0xdb40(%r10),%ymm6,%ymm7
10040160d: ff ff
10040160f: c5 d5 58 ef vaddpd %ymm7,%ymm5,%ymm5
100401613: c4 c2 7d 19 7f d0 vbroadcastsd -0x30(%r15),%ymm7
100401619: c4 41 45 59 82 00 44 vmulpd -0xbc00(%r10),%ymm7,%ymm8
100401620: ff ff
100401622: c4 c1 55 58 e8 vaddpd %ymm8,%ymm5,%ymm5
100401627: c4 42 7d 19 47 d8 vbroadcastsd -0x28(%r15),%ymm8
10040162d: c4 41 3d 59 8a 40 63 vmulpd -0x9cc0(%r10),%ymm8,%ymm9
100401634: ff ff
100401636: c4 c1 55 58 e9 vaddpd %ymm9,%ymm5,%ymm5
10040163b: c4 42 7d 19 4f e0 vbroadcastsd -0x20(%r15),%ymm9
100401641: c4 41 35 59 92 80 82 vmulpd -0x7d80(%r10),%ymm9,%ymm10
100401648: ff ff
10040164a: c4 c1 55 58 ea vaddpd %ymm10,%ymm5,%ymm5
10040164f: c4 42 7d 19 57 e8 vbroadcastsd -0x18(%r15),%ymm10
100401655: c4 41 2d 59 9a c0 a1 vmulpd -0x5e40(%r10),%ymm10,%ymm11
10040165c: ff ff
10040165e: c4 c1 55 58 eb vaddpd %ymm11,%ymm5,%ymm5

乗算と加算が連続するコードが出力されています。

今回私が使用した Core i7-5500U はBroadwellマイクロアーキテクチャであり、FMA命令による積和演算が使用できるので、これで先の乗算と加算を置き換えてみました。

--- code1.5.c   

+++ code1.6.c
@@ -2,6 +2,7 @@
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>
+#include <immintrin.h> /* AVX operations */

#define N 1000
#define BLOCK_SIZE 40
@@ -47,12 +48,13 @@ double compare(double a[N][N],double b[N

int main(void)
{
- static double a[N][N], b[N][N], c[N][N];
+ static double a[N][N] __attribute__ ((aligned(256)));
+ static double b[N][N] __attribute__ ((aligned(256)));
+ static double c[N][N] __attribute__ ((aligned(256)));

int i, ii,j,jj,k,kk;
double start,elapsed_time;
- double s0,s1,s2,s3,s4,s5,s6,s7,s8,s9;
- double s10,s11,s12,s13,s14,s15,s16,s17,s18,s19;
+ __m256d c0, c1, c2, c3, c4;

for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
@@ -74,227 +76,66 @@ int main(void)
for (k = 0; k < N; k+=BLOCK_SIZE) {
for(ii=i;ii < (i + BLOCK_SIZE);ii++){
for(jj=j;jj < (j + BLOCK_SIZE);jj+=20){
- s0 = 0.0;
- s1 = 0.0;
- s2 = 0.0;
- s3 = 0.0;
- s4 = 0.0;
- s5 = 0.0;
- s6 = 0.0;
- s7 = 0.0;
- s8 = 0.0;
- s9 = 0.0;
- s10 = 0.0;
- s11 = 0.0;
- s12 = 0.0;
- s13 = 0.0;
- s14 = 0.0;
- s15 = 0.0;
- s16 = 0.0;
- s17 = 0.0;
- s18 = 0.0;
- s19 = 0.0;
+ c0 = _mm256_setzero_pd();
+ c1 = _mm256_setzero_pd();
+ c2 = _mm256_setzero_pd();
+ c3 = _mm256_setzero_pd();
+ c4 = _mm256_setzero_pd();
for (kk = k; kk < (k + BLOCK_SIZE); kk+=8) {
- s0 += a[ii][kk] * b[kk][jj];
- s0 += a[ii][kk+1] * b[kk+1][jj];
- s0 += a[ii][kk+2] * b[kk+2][jj];
- s0 += a[ii][kk+3] * b[kk+3][jj];
- s0 += a[ii][kk+4] * b[kk+4][jj];
- s0 += a[ii][kk+5] * b[kk+5][jj];
- s0 += a[ii][kk+6] * b[kk+6][jj];
- s0 += a[ii][kk+7] * b[kk+7][jj];
-
- s1 += a[ii][kk] * b[kk][jj+1];
- s1 += a[ii][kk+1] * b[kk+1][jj+1];
- s1 += a[ii][kk+2] * b[kk+2][jj+1];
- s1 += a[ii][kk+3] * b[kk+3][jj+1];
- s1 += a[ii][kk+4] * b[kk+4][jj+1];
- s1 += a[ii][kk+5] * b[kk+5][jj+1];
- s1 += a[ii][kk+6] * b[kk+6][jj+1];
- s1 += a[ii][kk+7] * b[kk+7][jj+1];
-
- s2 += a[ii][kk] * b[kk][jj+2];
- s2 += a[ii][kk+1] * b[kk+1][jj+2];
- s2 += a[ii][kk+2] * b[kk+2][jj+2];
- s2 += a[ii][kk+3] * b[kk+3][jj+2];
- s2 += a[ii][kk+4] * b[kk+4][jj+2];
- s2 += a[ii][kk+5] * b[kk+5][jj+2];
- s2 += a[ii][kk+6] * b[kk+6][jj+2];
- s2 += a[ii][kk+7] * b[kk+7][jj+2];
-
- s3 += a[ii][kk] * b[kk][jj+3];
- s3 += a[ii][kk+1] * b[kk+1][jj+3];
- s3 += a[ii][kk+2] * b[kk+2][jj+3];
- s3 += a[ii][kk+3] * b[kk+3][jj+3];
- s3 += a[ii][kk+4] * b[kk+4][jj+3];
- s3 += a[ii][kk+5] * b[kk+5][jj+3];
- s3 += a[ii][kk+6] * b[kk+6][jj+3];
- s3 += a[ii][kk+7] * b[kk+7][jj+3];
-
- s4 += a[ii][kk] * b[kk][jj+4];
- s4 += a[ii][kk+1] * b[kk+1][jj+4];
- s4 += a[ii][kk+2] * b[kk+2][jj+4];
- s4 += a[ii][kk+3] * b[kk+3][jj+4];
- s4 += a[ii][kk+4] * b[kk+4][jj+4];
- s4 += a[ii][kk+5] * b[kk+5][jj+4];
- s4 += a[ii][kk+6] * b[kk+6][jj+4];
- s4 += a[ii][kk+7] * b[kk+7][jj+4];
-
- s5 += a[ii][kk] * b[kk][jj+5];
- s5 += a[ii][kk+1] * b[kk+1][jj+5];
- s5 += a[ii][kk+2] * b[kk+2][jj+5];
- s5 += a[ii][kk+3] * b[kk+3][jj+5];
- s5 += a[ii][kk+4] * b[kk+4][jj+5];
- s5 += a[ii][kk+5] * b[kk+5][jj+5];
- s5 += a[ii][kk+6] * b[kk+6][jj+5];
- s5 += a[ii][kk+7] * b[kk+7][jj+5];
-
- s6 += a[ii][kk] * b[kk][jj+6];
- s6 += a[ii][kk+1] * b[kk+1][jj+6];
- s6 += a[ii][kk+2] * b[kk+2][jj+6];
- s6 += a[ii][kk+3] * b[kk+3][jj+6];
- s6 += a[ii][kk+4] * b[kk+4][jj+6];
- s6 += a[ii][kk+5] * b[kk+5][jj+6];
- s6 += a[ii][kk+6] * b[kk+6][jj+6];
- s6 += a[ii][kk+7] * b[kk+7][jj+6];
-
- s7 += a[ii][kk] * b[kk][jj+7];
- s7 += a[ii][kk+1] * b[kk+1][jj+7];
- s7 += a[ii][kk+2] * b[kk+2][jj+7];
- s7 += a[ii][kk+3] * b[kk+3][jj+7];
- s7 += a[ii][kk+4] * b[kk+4][jj+7];
- s7 += a[ii][kk+5] * b[kk+5][jj+7];
- s7 += a[ii][kk+6] * b[kk+6][jj+7];
- s7 += a[ii][kk+7] * b[kk+7][jj+7];
-
- s8 += a[ii][kk] * b[kk][jj+8];
- s8 += a[ii][kk+1] * b[kk+1][jj+8];
- s8 += a[ii][kk+2] * b[kk+2][jj+8];
- s8 += a[ii][kk+3] * b[kk+3][jj+8];
- s8 += a[ii][kk+4] * b[kk+4][jj+8];
- s8 += a[ii][kk+5] * b[kk+5][jj+8];
- s8 += a[ii][kk+6] * b[kk+6][jj+8];
- s8 += a[ii][kk+7] * b[kk+7][jj+8];
-
- s9 += a[ii][kk] * b[kk][jj+9];
- s9 += a[ii][kk+1] * b[kk+1][jj+9];
- s9 += a[ii][kk+2] * b[kk+2][jj+9];
- s9 += a[ii][kk+3] * b[kk+3][jj+9];
- s9 += a[ii][kk+4] * b[kk+4][jj+9];
- s9 += a[ii][kk+5] * b[kk+5][jj+9];
- s9 += a[ii][kk+6] * b[kk+6][jj+9];
- s9 += a[ii][kk+7] * b[kk+7][jj+9];
-
- s10 += a[ii][kk] * b[kk][jj+10];
- s10 += a[ii][kk+1] * b[kk+1][jj+10];
- s10 += a[ii][kk+2] * b[kk+2][jj+10];
- s10 += a[ii][kk+3] * b[kk+3][jj+10];
- s10 += a[ii][kk+4] * b[kk+4][jj+10];
- s10 += a[ii][kk+5] * b[kk+5][jj+10];
- s10 += a[ii][kk+6] * b[kk+6][jj+10];
- s10 += a[ii][kk+7] * b[kk+7][jj+10];
-
- s11 += a[ii][kk] * b[kk][jj+11];
- s11 += a[ii][kk+1] * b[kk+1][jj+11];
- s11 += a[ii][kk+2] * b[kk+2][jj+11];
- s11 += a[ii][kk+3] * b[kk+3][jj+11];
- s11 += a[ii][kk+4] * b[kk+4][jj+11];
- s11 += a[ii][kk+5] * b[kk+5][jj+11];
- s11 += a[ii][kk+6] * b[kk+6][jj+11];
- s11 += a[ii][kk+7] * b[kk+7][jj+11];
-
- s12 += a[ii][kk] * b[kk][jj+12];
- s12 += a[ii][kk+1] * b[kk+1][jj+12];
- s12 += a[ii][kk+2] * b[kk+2][jj+12];
- s12 += a[ii][kk+3] * b[kk+3][jj+12];
- s12 += a[ii][kk+4] * b[kk+4][jj+12];
- s12 += a[ii][kk+5] * b[kk+5][jj+12];
- s12 += a[ii][kk+6] * b[kk+6][jj+12];
- s12 += a[ii][kk+7] * b[kk+7][jj+12];
-
- s13 += a[ii][kk] * b[kk][jj+13];
- s13 += a[ii][kk+1] * b[kk+1][jj+13];
- s13 += a[ii][kk+2] * b[kk+2][jj+13];
- s13 += a[ii][kk+3] * b[kk+3][jj+13];
- s13 += a[ii][kk+4] * b[kk+4][jj+13];
- s13 += a[ii][kk+5] * b[kk+5][jj+13];
- s13 += a[ii][kk+6] * b[kk+6][jj+13];
- s13 += a[ii][kk+7] * b[kk+7][jj+13];
-
- s14 += a[ii][kk] * b[kk][jj+14];
- s14 += a[ii][kk+1] * b[kk+1][jj+14];
- s14 += a[ii][kk+2] * b[kk+2][jj+14];
- s14 += a[ii][kk+3] * b[kk+3][jj+14];
- s14 += a[ii][kk+4] * b[kk+4][jj+14];
- s14 += a[ii][kk+5] * b[kk+5][jj+14];
- s14 += a[ii][kk+6] * b[kk+6][jj+14];
- s14 += a[ii][kk+7] * b[kk+7][jj+14];
-
- s15 += a[ii][kk] * b[kk][jj+15];
- s15 += a[ii][kk+1] * b[kk+1][jj+15];
- s15 += a[ii][kk+2] * b[kk+2][jj+15];
- s15 += a[ii][kk+3] * b[kk+3][jj+15];
- s15 += a[ii][kk+4] * b[kk+4][jj+15];
- s15 += a[ii][kk+5] * b[kk+5][jj+15];
- s15 += a[ii][kk+6] * b[kk+6][jj+15];
- s15 += a[ii][kk+7] * b[kk+7][jj+15];
-
- s16 += a[ii][kk] * b[kk][jj+16];
- s16 += a[ii][kk+1] * b[kk+1][jj+16];
- s16 += a[ii][kk+2] * b[kk+2][jj+16];
- s16 += a[ii][kk+3] * b[kk+3][jj+16];
- s16 += a[ii][kk+4] * b[kk+4][jj+16];
- s16 += a[ii][kk+5] * b[kk+5][jj+16];
- s16 += a[ii][kk+6] * b[kk+6][jj+16];
- s16 += a[ii][kk+7] * b[kk+7][jj+16];
-
- s17 += a[ii][kk] * b[kk][jj+17];
- s17 += a[ii][kk+1] * b[kk+1][jj+17];
- s17 += a[ii][kk+2] * b[kk+2][jj+17];
- s17 += a[ii][kk+3] * b[kk+3][jj+17];
- s17 += a[ii][kk+4] * b[kk+4][jj+17];
- s17 += a[ii][kk+5] * b[kk+5][jj+17];
- s17 += a[ii][kk+6] * b[kk+6][jj+17];
- s17 += a[ii][kk+7] * b[kk+7][jj+17];
-
- s18 += a[ii][kk] * b[kk][jj+18];
- s18 += a[ii][kk+1] * b[kk+1][jj+18];
- s18 += a[ii][kk+2] * b[kk+2][jj+18];
- s18 += a[ii][kk+3] * b[kk+3][jj+18];
- s18 += a[ii][kk+4] * b[kk+4][jj+18];
- s18 += a[ii][kk+5] * b[kk+5][jj+18];
- s18 += a[ii][kk+6] * b[kk+6][jj+18];
- s18 += a[ii][kk+7] * b[kk+7][jj+18];
-
- s19 += a[ii][kk] * b[kk][jj+19];
- s19 += a[ii][kk+1] * b[kk+1][jj+19];
- s19 += a[ii][kk+2] * b[kk+2][jj+19];
- s19 += a[ii][kk+3] * b[kk+3][jj+19];
- s19 += a[ii][kk+4] * b[kk+4][jj+19];
- s19 += a[ii][kk+5] * b[kk+5][jj+19];
- s19 += a[ii][kk+6] * b[kk+6][jj+19];
- s19 += a[ii][kk+7] * b[kk+7][jj+19];
+ __m256d a0 = _mm256_set1_pd(a[ii+0][kk+0]);
+ c0 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+16]), c4);
+ __m256d a1 = _mm256_set1_pd(a[ii+0][kk+1]);
+ c0 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+16]), c4);
+ __m256d a2 = _mm256_set1_pd(a[ii+0][kk+2]);
+ c0 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+16]), c4);
+ __m256d a3 = _mm256_set1_pd(a[ii+0][kk+3]);
+ c0 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+16]), c4);
+ __m256d a4 = _mm256_set1_pd(a[ii+0][kk+4]);
+ c0 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+16]), c4);
+ __m256d a5 = _mm256_set1_pd(a[ii+0][kk+5]);
+ c0 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+16]), c4);
+ __m256d a6 = _mm256_set1_pd(a[ii+0][kk+6]);
+ c0 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+16]), c4);
+ __m256d a7 = _mm256_set1_pd(a[ii+0][kk+7]);
+ c0 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 0]), c0);
+ c1 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 4]), c1);
+ c2 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 8]), c2);
+ c3 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+12]), c3);
+ c4 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+16]), c4);
}
- c[ii][jj] += s0;
- c[ii][jj+1] += s1;
- c[ii][jj+2] += s2;
- c[ii][jj+3] += s3;
- c[ii][jj+4] += s4;
- c[ii][jj+5] += s5;
- c[ii][jj+6] += s6;
- c[ii][jj+7] += s7;
- c[ii][jj+8] += s8;
- c[ii][jj+9] += s9;
- c[ii][jj+10] += s10;
- c[ii][jj+11] += s11;
- c[ii][jj+12] += s12;
- c[ii][jj+13] += s13;
- c[ii][jj+14] += s14;
- c[ii][jj+15] += s15;
- c[ii][jj+16] += s16;
- c[ii][jj+17] += s17;
- c[ii][jj+18] += s18;
- c[ii][jj+19] += s19;
+ _mm256_store_pd(&c[ii][jj+ 0], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 0]), c0));
+ _mm256_store_pd(&c[ii][jj+ 4], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 4]), c1));
+ _mm256_store_pd(&c[ii][jj+ 8], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 8]), c2));
+ _mm256_store_pd(&c[ii][jj+12], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+12]), c3));
+ _mm256_store_pd(&c[ii][jj+16], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+16]), c4));
}
}
}

これを実行した結果が

$ ./code1.6

elapsed 0.101164 sec
diff is 0.000000

であり、コード1.5 と較べて 12% 程の高速化となりました。乗算 + 加算 を積和演算に置き換えたことで浮動小数点演算のコストはおよそ半分になっている筈ですが、全体としてそれほど高速化が果たせていないのはメモリアクセスがボトルネックとなっているためと思われます。

コード1.6 の最内側ループ部分は

   1004015a0:   c4 c2 7d 19 70 c8       vbroadcastsd -0x38(%r8),%ymm6

1004015a6: c4 c2 cd b8 91 c0 24 vfmadd231pd -0xdb40(%r9),%ymm6,%ymm2
1004015ad: ff ff
1004015af: c4 c2 cd b8 a1 e0 24 vfmadd231pd -0xdb20(%r9),%ymm6,%ymm4
1004015b6: ff ff
1004015b8: c4 c2 cd b8 a9 00 25 vfmadd231pd -0xdb00(%r9),%ymm6,%ymm5
1004015bf: ff ff
1004015c1: c4 c2 cd b8 99 20 25 vfmadd231pd -0xdae0(%r9),%ymm6,%ymm3
1004015c8: ff ff
1004015ca: c4 c2 cd b8 89 40 25 vfmadd231pd -0xdac0(%r9),%ymm6,%ymm1
1004015d1: ff ff
1004015d3: c4 c2 7d 19 70 d0 vbroadcastsd -0x30(%r8),%ymm6
1004015d9: c4 c2 cd b8 91 00 44 vfmadd231pd -0xbc00(%r9),%ymm6,%ymm2
1004015e0: ff ff
1004015e2: c4 c2 cd b8 a1 20 44 vfmadd231pd -0xbbe0(%r9),%ymm6,%ymm4
1004015e9: ff ff
1004015eb: c4 c2 cd b8 a9 40 44 vfmadd231pd -0xbbc0(%r9),%ymm6,%ymm5
1004015f2: ff ff
1004015f4: c4 c2 cd b8 99 60 44 vfmadd231pd -0xbba0(%r9),%ymm6,%ymm3
1004015fb: ff ff
1004015fd: c4 c2 cd b8 89 80 44 vfmadd231pd -0xbb80(%r9),%ymm6,%ymm1
100401604: ff ff
100401606: c4 c2 7d 19 70 d8 vbroadcastsd -0x28(%r8),%ymm6
10040160c: c4 c2 cd b8 91 40 63 vfmadd231pd -0x9cc0(%r9),%ymm6,%ymm2
100401613: ff ff
100401615: c4 c2 cd b8 a1 60 63 vfmadd231pd -0x9ca0(%r9),%ymm6,%ymm4
10040161c: ff ff
10040161e: c4 c2 cd b8 a9 80 63 vfmadd231pd -0x9c80(%r9),%ymm6,%ymm5
100401625: ff ff
100401627: c4 c2 cd b8 99 a0 63 vfmadd231pd -0x9c60(%r9),%ymm6,%ymm3
10040162e: ff ff
100401630: c4 c2 cd b8 89 c0 63 vfmadd231pd -0x9c40(%r9),%ymm6,%ymm1
100401637: ff ff
100401639: c4 c2 7d 19 70 e0 vbroadcastsd -0x20(%r8),%ymm6
10040163f: c4 c2 cd b8 91 80 82 vfmadd231pd -0x7d80(%r9),%ymm6,%ymm2
100401646: ff ff
100401648: c4 c2 cd b8 a1 a0 82 vfmadd231pd -0x7d60(%r9),%ymm6,%ymm4
10040164f: ff ff
100401651: c4 c2 cd b8 a9 c0 82 vfmadd231pd -0x7d40(%r9),%ymm6,%ymm5
100401658: ff ff
10040165a: c4 c2 cd b8 99 e0 82 vfmadd231pd -0x7d20(%r9),%ymm6,%ymm3
100401661: ff ff
100401663: c4 c2 cd b8 89 00 83 vfmadd231pd -0x7d00(%r9),%ymm6,%ymm1
10040166a: ff ff
10040166c: c4 c2 7d 19 70 e8 vbroadcastsd -0x18(%r8),%ymm6
100401672: c4 c2 cd b8 91 c0 a1 vfmadd231pd -0x5e40(%r9),%ymm6,%ymm2
100401679: ff ff
10040167b: c4 c2 cd b8 a1 e0 a1 vfmadd231pd -0x5e20(%r9),%ymm6,%ymm4
100401682: ff ff
100401684: c4 c2 cd b8 a9 00 a2 vfmadd231pd -0x5e00(%r9),%ymm6,%ymm5
10040168b: ff ff
10040168d: c4 c2 cd b8 99 20 a2 vfmadd231pd -0x5de0(%r9),%ymm6,%ymm3
100401694: ff ff
100401696: c4 c2 cd b8 89 40 a2 vfmadd231pd -0x5dc0(%r9),%ymm6,%ymm1
10040169d: ff ff
10040169f: c4 c2 7d 19 70 f0 vbroadcastsd -0x10(%r8),%ymm6
1004016a5: c4 c2 cd b8 91 00 c1 vfmadd231pd -0x3f00(%r9),%ymm6,%ymm2
1004016ac: ff ff
1004016ae: c4 c2 cd b8 a1 20 c1 vfmadd231pd -0x3ee0(%r9),%ymm6,%ymm4
1004016b5: ff ff
1004016b7: c4 c2 cd b8 a9 40 c1 vfmadd231pd -0x3ec0(%r9),%ymm6,%ymm5
1004016be: ff ff
1004016c0: c4 c2 cd b8 99 60 c1 vfmadd231pd -0x3ea0(%r9),%ymm6,%ymm3
1004016c7: ff ff
1004016c9: c4 c2 cd b8 89 80 c1 vfmadd231pd -0x3e80(%r9),%ymm6,%ymm1
1004016d0: ff ff
1004016d2: c4 c2 7d 19 70 f8 vbroadcastsd -0x8(%r8),%ymm6
1004016d8: c4 c2 cd b8 91 40 e0 vfmadd231pd -0x1fc0(%r9),%ymm6,%ymm2
1004016df: ff ff
1004016e1: c4 c2 cd b8 a1 60 e0 vfmadd231pd -0x1fa0(%r9),%ymm6,%ymm4
1004016e8: ff ff
1004016ea: c4 c2 cd b8 a9 80 e0 vfmadd231pd -0x1f80(%r9),%ymm6,%ymm5
1004016f1: ff ff
1004016f3: c4 c2 cd b8 99 a0 e0 vfmadd231pd -0x1f60(%r9),%ymm6,%ymm3
1004016fa: ff ff
1004016fc: c4 c2 cd b8 89 c0 e0 vfmadd231pd -0x1f40(%r9),%ymm6,%ymm1
100401703: ff ff
100401705: c4 c2 7d 19 30 vbroadcastsd (%r8),%ymm6
10040170a: c4 c2 cd b8 51 80 vfmadd231pd -0x80(%r9),%ymm6,%ymm2
100401710: c4 c2 cd b8 61 a0 vfmadd231pd -0x60(%r9),%ymm6,%ymm4
100401716: c4 c2 cd b8 69 c0 vfmadd231pd -0x40(%r9),%ymm6,%ymm5
10040171c: c4 c2 cd b8 59 e0 vfmadd231pd -0x20(%r9),%ymm6,%ymm3
100401722: c4 c2 cd b8 09 vfmadd231pd (%r9),%ymm6,%ymm1
100401727: 48 83 c0 08 add $0x8,%rax
10040172b: 49 83 c0 40 add $0x40,%r8
10040172f: 49 81 c1 00 fa 00 00 add $0xfa00,%r9
100401736: 4c 39 f0 cmp %r14,%rax
100401739: 0f 8c 61 fe ff ff jl 1004015a0 <main+0x1d0>

というコードとなっていましたが、16本ある SIMD レジスタが %ymm0 から %ymm6 までしか使用されておらず、レジスタの数にはまだ余裕があるので jj ループのアンロールを更に進めてみます。

--- code1.6.c   

+++ code1.7.c
@@ -54,7 +54,7 @@ int main(void)

int i, ii,j,jj,k,kk;
double start,elapsed_time;
- __m256d c0, c1, c2, c3, c4;
+ __m256d c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;

for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
@@ -75,12 +75,17 @@ int main(void)
for (j = 0; j < N; j+=BLOCK_SIZE) {
for (k = 0; k < N; k+=BLOCK_SIZE) {
for(ii=i;ii < (i + BLOCK_SIZE);ii++){
- for(jj=j;jj < (j + BLOCK_SIZE);jj+=20){
+ jj=j;{
c0 = _mm256_setzero_pd();
c1 = _mm256_setzero_pd();
c2 = _mm256_setzero_pd();
c3 = _mm256_setzero_pd();
c4 = _mm256_setzero_pd();
+ c5 = _mm256_setzero_pd();
+ c6 = _mm256_setzero_pd();
+ c7 = _mm256_setzero_pd();
+ c8 = _mm256_setzero_pd();
+ c9 = _mm256_setzero_pd();
for (kk = k; kk < (k + BLOCK_SIZE); kk+=8) {
__m256d a0 = _mm256_set1_pd(a[ii+0][kk+0]);
c0 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 0]), c0);
@@ -88,54 +93,99 @@ int main(void)
c2 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+12]), c3);
c4 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a0, _mm256_load_pd(&b[kk+0][jj+36]), c9);
__m256d a1 = _mm256_set1_pd(a[ii+0][kk+1]);
c0 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 0]), c0);
c1 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 4]), c1);
c2 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+12]), c3);
c4 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a1, _mm256_load_pd(&b[kk+1][jj+36]), c9);
__m256d a2 = _mm256_set1_pd(a[ii+0][kk+2]);
c0 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 0]), c0);
c1 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 4]), c1);
c2 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+12]), c3);
c4 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a2, _mm256_load_pd(&b[kk+2][jj+36]), c9);
__m256d a3 = _mm256_set1_pd(a[ii+0][kk+3]);
c0 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 0]), c0);
c1 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 4]), c1);
c2 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+12]), c3);
c4 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a3, _mm256_load_pd(&b[kk+3][jj+36]), c9);
__m256d a4 = _mm256_set1_pd(a[ii+0][kk+4]);
c0 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 0]), c0);
c1 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 4]), c1);
c2 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+12]), c3);
c4 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a4, _mm256_load_pd(&b[kk+4][jj+36]), c9);
__m256d a5 = _mm256_set1_pd(a[ii+0][kk+5]);
c0 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 0]), c0);
c1 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 4]), c1);
c2 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+12]), c3);
c4 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a5, _mm256_load_pd(&b[kk+5][jj+36]), c9);
__m256d a6 = _mm256_set1_pd(a[ii+0][kk+6]);
c0 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 0]), c0);
c1 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 4]), c1);
c2 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+12]), c3);
c4 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a6, _mm256_load_pd(&b[kk+6][jj+36]), c9);
__m256d a7 = _mm256_set1_pd(a[ii+0][kk+7]);
c0 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 0]), c0);
c1 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 4]), c1);
c2 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+ 8]), c2);
c3 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+12]), c3);
c4 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+16]), c4);
+ c5 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+20]), c5);
+ c6 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+24]), c6);
+ c7 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+28]), c7);
+ c8 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+32]), c8);
+ c9 = _mm256_fmadd_pd(a7, _mm256_load_pd(&b[kk+7][jj+36]), c9);
}
_mm256_store_pd(&c[ii][jj+ 0], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 0]), c0));
_mm256_store_pd(&c[ii][jj+ 4], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 4]), c1));
_mm256_store_pd(&c[ii][jj+ 8], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+ 8]), c2));
_mm256_store_pd(&c[ii][jj+12], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+12]), c3));
_mm256_store_pd(&c[ii][jj+16], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+16]), c4));
+ _mm256_store_pd(&c[ii][jj+20], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+20]), c5));
+ _mm256_store_pd(&c[ii][jj+24], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+24]), c6));
+ _mm256_store_pd(&c[ii][jj+28], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+28]), c7));
+ _mm256_store_pd(&c[ii][jj+32], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+32]), c8));
+ _mm256_store_pd(&c[ii][jj+36], _mm256_add_pd(_mm256_load_pd(&c[ii][jj+36]), c9));
}
}
}

これを実行した結果が以下となります。

$ ./code1.7

elapsed 0.077831 sec
diff is 0.000000

コード1.6 と較べて約 23% の高速化となりました。コードを見てみると

   1004015e0:   c4 42 7d 19 5d c8       vbroadcastsd -0x38(%r13),%ymm11

1004015e6: c4 c2 a5 b8 af 20 24 vfmadd231pd -0xdbe0(%r15),%ymm11,%ymm5
1004015ed: ff ff
1004015ef: c4 42 a5 b8 87 40 24 vfmadd231pd -0xdbc0(%r15),%ymm11,%ymm8
1004015f6: ff ff
1004015f8: c4 c2 a5 b8 bf 60 24 vfmadd231pd -0xdba0(%r15),%ymm11,%ymm7
1004015ff: ff ff
100401601: c4 42 a5 b8 97 80 24 vfmadd231pd -0xdb80(%r15),%ymm11,%ymm10
100401608: ff ff
10040160a: c4 42 a5 b8 8f a0 24 vfmadd231pd -0xdb60(%r15),%ymm11,%ymm9
100401611: ff ff
100401613: c4 c2 a5 b8 b7 c0 24 vfmadd231pd -0xdb40(%r15),%ymm11,%ymm6
10040161a: ff ff
10040161c: c4 c2 a5 b8 a7 e0 24 vfmadd231pd -0xdb20(%r15),%ymm11,%ymm4
100401623: ff ff
100401625: c4 c2 a5 b8 9f 00 25 vfmadd231pd -0xdb00(%r15),%ymm11,%ymm3
10040162c: ff ff
10040162e: c4 c2 a5 b8 97 20 25 vfmadd231pd -0xdae0(%r15),%ymm11,%ymm2
100401635: ff ff
100401637: c4 c2 a5 b8 8f 40 25 vfmadd231pd -0xdac0(%r15),%ymm11,%ymm1
10040163e: ff ff
100401640: c4 42 7d 19 5d d0 vbroadcastsd -0x30(%r13),%ymm11
100401646: c4 c2 a5 b8 af 60 43 vfmadd231pd -0xbca0(%r15),%ymm11,%ymm5
10040164d: ff ff
10040164f: c4 42 a5 b8 87 80 43 vfmadd231pd -0xbc80(%r15),%ymm11,%ymm8
100401656: ff ff
100401658: c4 c2 a5 b8 bf a0 43 vfmadd231pd -0xbc60(%r15),%ymm11,%ymm7
10040165f: ff ff
100401661: c4 42 a5 b8 97 c0 43 vfmadd231pd -0xbc40(%r15),%ymm11,%ymm10
100401668: ff ff
10040166a: c4 42 a5 b8 8f e0 43 vfmadd231pd -0xbc20(%r15),%ymm11,%ymm9
100401671: ff ff
100401673: c4 c2 a5 b8 b7 00 44 vfmadd231pd -0xbc00(%r15),%ymm11,%ymm6
10040167a: ff ff
10040167c: c4 c2 a5 b8 a7 20 44 vfmadd231pd -0xbbe0(%r15),%ymm11,%ymm4
100401683: ff ff
100401685: c4 c2 a5 b8 9f 40 44 vfmadd231pd -0xbbc0(%r15),%ymm11,%ymm3
10040168c: ff ff
10040168e: c4 c2 a5 b8 97 60 44 vfmadd231pd -0xbba0(%r15),%ymm11,%ymm2
100401695: ff ff
100401697: c4 c2 a5 b8 8f 80 44 vfmadd231pd -0xbb80(%r15),%ymm11,%ymm1
10040169e: ff ff
1004016a0: c4 42 7d 19 5d d8 vbroadcastsd -0x28(%r13),%ymm11
1004016a6: c4 c2 a5 b8 af a0 62 vfmadd231pd -0x9d60(%r15),%ymm11,%ymm5
1004016ad: ff ff
1004016af: c4 42 a5 b8 87 c0 62 vfmadd231pd -0x9d40(%r15),%ymm11,%ymm8
1004016b6: ff ff
1004016b8: c4 c2 a5 b8 bf e0 62 vfmadd231pd -0x9d20(%r15),%ymm11,%ymm7
1004016bf: ff ff
1004016c1: c4 42 a5 b8 97 00 63 vfmadd231pd -0x9d00(%r15),%ymm11,%ymm10
1004016c8: ff ff
1004016ca: c4 42 a5 b8 8f 20 63 vfmadd231pd -0x9ce0(%r15),%ymm11,%ymm9
1004016d1: ff ff
1004016d3: c4 c2 a5 b8 b7 40 63 vfmadd231pd -0x9cc0(%r15),%ymm11,%ymm6
1004016da: ff ff
1004016dc: c4 c2 a5 b8 a7 60 63 vfmadd231pd -0x9ca0(%r15),%ymm11,%ymm4
1004016e3: ff ff
1004016e5: c4 c2 a5 b8 9f 80 63 vfmadd231pd -0x9c80(%r15),%ymm11,%ymm3
1004016ec: ff ff
1004016ee: c4 c2 a5 b8 97 a0 63 vfmadd231pd -0x9c60(%r15),%ymm11,%ymm2
1004016f5: ff ff
1004016f7: c4 c2 a5 b8 8f c0 63 vfmadd231pd -0x9c40(%r15),%ymm11,%ymm1
1004016fe: ff ff
100401700: c4 42 7d 19 5d e0 vbroadcastsd -0x20(%r13),%ymm11
100401706: c4 c2 a5 b8 af e0 81 vfmadd231pd -0x7e20(%r15),%ymm11,%ymm5
10040170d: ff ff
10040170f: c4 42 a5 b8 87 00 82 vfmadd231pd -0x7e00(%r15),%ymm11,%ymm8
100401716: ff ff
100401718: c4 c2 a5 b8 bf 20 82 vfmadd231pd -0x7de0(%r15),%ymm11,%ymm7
10040171f: ff ff
100401721: c4 42 a5 b8 97 40 82 vfmadd231pd -0x7dc0(%r15),%ymm11,%ymm10
100401728: ff ff
10040172a: c4 42 a5 b8 8f 60 82 vfmadd231pd -0x7da0(%r15),%ymm11,%ymm9
100401731: ff ff
100401733: c4 c2 a5 b8 b7 80 82 vfmadd231pd -0x7d80(%r15),%ymm11,%ymm6
10040173a: ff ff
10040173c: c4 c2 a5 b8 a7 a0 82 vfmadd231pd -0x7d60(%r15),%ymm11,%ymm4
100401743: ff ff
100401745: c4 c2 a5 b8 9f c0 82 vfmadd231pd -0x7d40(%r15),%ymm11,%ymm3
10040174c: ff ff
10040174e: c4 c2 a5 b8 97 e0 82 vfmadd231pd -0x7d20(%r15),%ymm11,%ymm2
100401755: ff ff
100401757: c4 c2 a5 b8 8f 00 83 vfmadd231pd -0x7d00(%r15),%ymm11,%ymm1
10040175e: ff ff
100401760: c4 42 7d 19 5d e8 vbroadcastsd -0x18(%r13),%ymm11
100401766: c4 c2 a5 b8 af 20 a1 vfmadd231pd -0x5ee0(%r15),%ymm11,%ymm5
10040176d: ff ff
10040176f: c4 42 a5 b8 87 40 a1 vfmadd231pd -0x5ec0(%r15),%ymm11,%ymm8
100401776: ff ff
100401778: c4 c2 a5 b8 bf 60 a1 vfmadd231pd -0x5ea0(%r15),%ymm11,%ymm7
10040177f: ff ff
100401781: c4 42 a5 b8 97 80 a1 vfmadd231pd -0x5e80(%r15),%ymm11,%ymm10
100401788: ff ff
10040178a: c4 42 a5 b8 8f a0 a1 vfmadd231pd -0x5e60(%r15),%ymm11,%ymm9
100401791: ff ff
100401793: c4 c2 a5 b8 b7 c0 a1 vfmadd231pd -0x5e40(%r15),%ymm11,%ymm6
10040179a: ff ff
10040179c: c4 c2 a5 b8 a7 e0 a1 vfmadd231pd -0x5e20(%r15),%ymm11,%ymm4
1004017a3: ff ff
1004017a5: c4 c2 a5 b8 9f 00 a2 vfmadd231pd -0x5e00(%r15),%ymm11,%ymm3
1004017ac: ff ff
1004017ae: c4 c2 a5 b8 97 20 a2 vfmadd231pd -0x5de0(%r15),%ymm11,%ymm2
1004017b5: ff ff
1004017b7: c4 c2 a5 b8 8f 40 a2 vfmadd231pd -0x5dc0(%r15),%ymm11,%ymm1
1004017be: ff ff
1004017c0: c4 42 7d 19 5d f0 vbroadcastsd -0x10(%r13),%ymm11
1004017c6: c4 c2 a5 b8 af 60 c0 vfmadd231pd -0x3fa0(%r15),%ymm11,%ymm5
1004017cd: ff ff
1004017cf: c4 42 a5 b8 87 80 c0 vfmadd231pd -0x3f80(%r15),%ymm11,%ymm8
1004017d6: ff ff
1004017d8: c4 c2 a5 b8 bf a0 c0 vfmadd231pd -0x3f60(%r15),%ymm11,%ymm7
1004017df: ff ff
1004017e1: c4 42 a5 b8 97 c0 c0 vfmadd231pd -0x3f40(%r15),%ymm11,%ymm10
1004017e8: ff ff
1004017ea: c4 42 a5 b8 8f e0 c0 vfmadd231pd -0x3f20(%r15),%ymm11,%ymm9
1004017f1: ff ff
1004017f3: c4 c2 a5 b8 b7 00 c1 vfmadd231pd -0x3f00(%r15),%ymm11,%ymm6
1004017fa: ff ff
1004017fc: c4 c2 a5 b8 a7 20 c1 vfmadd231pd -0x3ee0(%r15),%ymm11,%ymm4
100401803: ff ff
100401805: c4 c2 a5 b8 9f 40 c1 vfmadd231pd -0x3ec0(%r15),%ymm11,%ymm3
10040180c: ff ff
10040180e: c4 c2 a5 b8 97 60 c1 vfmadd231pd -0x3ea0(%r15),%ymm11,%ymm2
100401815: ff ff
100401817: c4 c2 a5 b8 8f 80 c1 vfmadd231pd -0x3e80(%r15),%ymm11,%ymm1
10040181e: ff ff
100401820: c4 42 7d 19 5d f8 vbroadcastsd -0x8(%r13),%ymm11
100401826: c4 c2 a5 b8 af a0 df vfmadd231pd -0x2060(%r15),%ymm11,%ymm5
10040182d: ff ff
10040182f: c4 42 a5 b8 87 c0 df vfmadd231pd -0x2040(%r15),%ymm11,%ymm8
100401836: ff ff
100401838: c4 c2 a5 b8 bf e0 df vfmadd231pd -0x2020(%r15),%ymm11,%ymm7
10040183f: ff ff
100401841: c4 42 a5 b8 97 00 e0 vfmadd231pd -0x2000(%r15),%ymm11,%ymm10
100401848: ff ff
10040184a: c4 42 a5 b8 8f 20 e0 vfmadd231pd -0x1fe0(%r15),%ymm11,%ymm9
100401851: ff ff
100401853: c4 c2 a5 b8 b7 40 e0 vfmadd231pd -0x1fc0(%r15),%ymm11,%ymm6
10040185a: ff ff
10040185c: c4 c2 a5 b8 a7 60 e0 vfmadd231pd -0x1fa0(%r15),%ymm11,%ymm4
100401863: ff ff
100401865: c4 c2 a5 b8 9f 80 e0 vfmadd231pd -0x1f80(%r15),%ymm11,%ymm3
10040186c: ff ff
10040186e: c4 c2 a5 b8 97 a0 e0 vfmadd231pd -0x1f60(%r15),%ymm11,%ymm2
100401875: ff ff
100401877: c4 c2 a5 b8 8f c0 e0 vfmadd231pd -0x1f40(%r15),%ymm11,%ymm1
10040187e: ff ff
100401880: c4 42 7d 19 5d 00 vbroadcastsd 0x0(%r13),%ymm11
100401886: c4 c2 a5 b8 af e0 fe vfmadd231pd -0x120(%r15),%ymm11,%ymm5
10040188d: ff ff
10040188f: c4 42 a5 b8 87 00 ff vfmadd231pd -0x100(%r15),%ymm11,%ymm8
100401896: ff ff
100401898: c4 c2 a5 b8 bf 20 ff vfmadd231pd -0xe0(%r15),%ymm11,%ymm7
10040189f: ff ff
1004018a1: c4 42 a5 b8 97 40 ff vfmadd231pd -0xc0(%r15),%ymm11,%ymm10
1004018a8: ff ff
1004018aa: c4 42 a5 b8 8f 60 ff vfmadd231pd -0xa0(%r15),%ymm11,%ymm9
1004018b1: ff ff
1004018b3: c4 c2 a5 b8 77 80 vfmadd231pd -0x80(%r15),%ymm11,%ymm6
1004018b9: c4 c2 a5 b8 67 a0 vfmadd231pd -0x60(%r15),%ymm11,%ymm4
1004018bf: c4 c2 a5 b8 5f c0 vfmadd231pd -0x40(%r15),%ymm11,%ymm3
1004018c5: c4 c2 a5 b8 57 e0 vfmadd231pd -0x20(%r15),%ymm11,%ymm2
1004018cb: c4 c2 a5 b8 0f vfmadd231pd (%r15),%ymm11,%ymm1
1004018d0: 48 83 c0 08 add $0x8,%rax
1004018d4: 49 83 c5 40 add $0x40,%r13
1004018d8: 49 81 c7 00 fa 00 00 add $0xfa00,%r15
1004018df: 4c 39 e0 cmp %r12,%rax
1004018e2: 0f 8c f8 fc ff ff jl 1004015e0 <main+0x210>

ということであり、おおよそ FMA 命令の積和演算が繰り返される内容となりました。レジスタも %ymm11 まで使用されるようになったので、浮動小数点演算の実行効率は向上したと思います。これ以上の速度向上を目指すにはメモリアクセスの方法を再度検討することになると思われます。

SIMD 命令の直接の使用は、環境によっては動作しなかったり、将来的に更に強力な命令セットの登場やコンパイラの改良に対しては足かせとなってしまうことも考えられるため、保守性等も考慮した上で使用は検討すべきと思います。

これまでの結果を以下に纏めます。

コード
結果
変更内容

コード1
0.473149 sec

コード1.1
0.386477 sec
隣接するメモリアクセスが効率的であることを考慮し、展開するループを ii から jj に変更

コード1.2
0.197167 sec
ループの展開を 2ループ分 から 4ループ分に変更

コード1.3
0.190590 sec
kk のループを 5ループ分展開から 4ループ分展開に変更

コード1.4
0.143134 sec
jj と kk のループ展開をそれぞれ 4ループ分から 8ループ分に変更

コード1.5
0.115160 sec
jj のループ展開を 20ループ分に変更

コード1.6
0.101164 sec
乗算 + 加算の処理を積和演算命令を使用するよう変更

コード1.7
0.077831 sec
jj のループをすべて(40ループ分)展開

コード2
1.064653 sec

コード3
1.192261 sec

コード4
2.136027 sec

コード5
0.323239 sec