C
C++
Optimization
AVX
FMA

Speeding up 4x4 matrix-matrix multiplication



Let's try to speed up multiplication of two 4x4 matrices.

Since each multiplication is such a small unit of work, this may not be worth much in practice, but let's see how fast it can go.

4x4 matrices show up all over 3D processing, for example in OpenGL and Direct3D.


Prerequisites


  • The 4x4 matrices are stored in OpenGL order (column-major).

  • Everything is compiled with Visual Studio 2015. (I have no idea how gcc etc. would behave.) The intrinsics below also assume a CPU with AVX/FMA support; see the sketch right after this list.
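As a minimal detection sketch using __cpuid (my addition, not part of the measured code; a complete check would also verify OS support for saving the YMM state via OSXSAVE and _xgetbv):

#include <intrin.h>
#include <stdio.h>

int main( void )
{
    int info[4];
    __cpuid( info, 1 );                     // CPUID leaf 1 : feature flags in ECX/EDX

    int hasFMA = ( info[2] >> 12 ) & 1;     // ECX bit 12 : FMA3
    int hasAVX = ( info[2] >> 28 ) & 1;     // ECX bit 28 : AVX

    printf( "FMA:%d AVX:%d\n", hasFMA, hasAVX );
    return 0;
}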


Common

#include <Windows.h>

#include <intrin.h>

// Element (row, col) of a column-major 4x4 matrix
#define M44(base,row,col) base[(col)*4+(row)]
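As a quick sanity check of the column-major convention (a throwaway snippet, not part of the benchmark): element (row, col) of a matrix m is stored at m[col*4 + row].

#include <stdio.h>

#define M44(base,row,col) base[(col)*4+(row)]

int main( void )
{
    double m[16];
    for( int i = 0; i < 16; i++ ) m[i] = (double)i;     // m[i] == i

    // Column-major: (row=1, col=2) maps to index 2*4+1 = 9.
    printf( "%.0f\n", M44(m, 1, 2) );                   // prints 9
    return 0;
}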


1. Plain version

This is roughly the code you would write if you did it the obvious way.

Even here, though, performance differs considerably depending on whether the running sum is kept in a local variable or accumulated directly into the output (see the sketch after the function).


MultMatrix_For

void    MultMatrix_For( double result[16], const double base[16], const double mult[16] )
{
    for( int r = 0; r < 4; r++ )
    {
        for( int c = 0; c < 4; c++ )
        {
            double temp = 0;

            for( int i = 0; i < 4; i++ )
            {
                temp += M44(base,r,i) * M44(mult,i,c);
            }

            M44(result,r,c) = temp;
        }
    }
}
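For reference, here is the variant that accumulates straight into the output instead of a local variable (a sketch for comparison only; the function name is mine, and the table below reports the local-variable version). Writing through the result pointer on every step tends to keep the compiler from holding the running sum in a register:

void    MultMatrix_For_NoTemp( double result[16], const double base[16], const double mult[16] )
{
    for( int r = 0; r < 4; r++ )
    {
        for( int c = 0; c < 4; c++ )
        {
            M44(result,r,c) = 0;

            for( int i = 0; i < 4; i++ )
            {
                // The compiler must assume result may alias base/mult, so the
                // partial sum is typically reloaded and stored on every iteration.
                M44(result,r,c) += M44(base,r,i) * M44(mult,i,c);
            }
        }
    }
}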



2. Unrolling the for loops


MultMatrix_ExR

void    MultMatrix_ExR( double result[16], const double base[16], const double mult[16] )
{
    M44(result,0,0) = M44(base,0,0) * M44(mult,0,0) + M44(base,0,1) * M44(mult,1,0) + M44(base,0,2) * M44(mult,2,0) + M44(base,0,3) * M44(mult,3,0);
    M44(result,1,0) = M44(base,1,0) * M44(mult,0,0) + M44(base,1,1) * M44(mult,1,0) + M44(base,1,2) * M44(mult,2,0) + M44(base,1,3) * M44(mult,3,0);
    M44(result,2,0) = M44(base,2,0) * M44(mult,0,0) + M44(base,2,1) * M44(mult,1,0) + M44(base,2,2) * M44(mult,2,0) + M44(base,2,3) * M44(mult,3,0);
    M44(result,3,0) = M44(base,3,0) * M44(mult,0,0) + M44(base,3,1) * M44(mult,1,0) + M44(base,3,2) * M44(mult,2,0) + M44(base,3,3) * M44(mult,3,0);

    M44(result,0,1) = M44(base,0,0) * M44(mult,0,1) + M44(base,0,1) * M44(mult,1,1) + M44(base,0,2) * M44(mult,2,1) + M44(base,0,3) * M44(mult,3,1);
    M44(result,1,1) = M44(base,1,0) * M44(mult,0,1) + M44(base,1,1) * M44(mult,1,1) + M44(base,1,2) * M44(mult,2,1) + M44(base,1,3) * M44(mult,3,1);
    M44(result,2,1) = M44(base,2,0) * M44(mult,0,1) + M44(base,2,1) * M44(mult,1,1) + M44(base,2,2) * M44(mult,2,1) + M44(base,2,3) * M44(mult,3,1);
    M44(result,3,1) = M44(base,3,0) * M44(mult,0,1) + M44(base,3,1) * M44(mult,1,1) + M44(base,3,2) * M44(mult,2,1) + M44(base,3,3) * M44(mult,3,1);

    M44(result,0,2) = M44(base,0,0) * M44(mult,0,2) + M44(base,0,1) * M44(mult,1,2) + M44(base,0,2) * M44(mult,2,2) + M44(base,0,3) * M44(mult,3,2);
    M44(result,1,2) = M44(base,1,0) * M44(mult,0,2) + M44(base,1,1) * M44(mult,1,2) + M44(base,1,2) * M44(mult,2,2) + M44(base,1,3) * M44(mult,3,2);
    M44(result,2,2) = M44(base,2,0) * M44(mult,0,2) + M44(base,2,1) * M44(mult,1,2) + M44(base,2,2) * M44(mult,2,2) + M44(base,2,3) * M44(mult,3,2);
    M44(result,3,2) = M44(base,3,0) * M44(mult,0,2) + M44(base,3,1) * M44(mult,1,2) + M44(base,3,2) * M44(mult,2,2) + M44(base,3,3) * M44(mult,3,2);

    M44(result,0,3) = M44(base,0,0) * M44(mult,0,3) + M44(base,0,1) * M44(mult,1,3) + M44(base,0,2) * M44(mult,2,3) + M44(base,0,3) * M44(mult,3,3);
    M44(result,1,3) = M44(base,1,0) * M44(mult,0,3) + M44(base,1,1) * M44(mult,1,3) + M44(base,1,2) * M44(mult,2,3) + M44(base,1,3) * M44(mult,3,3);
    M44(result,2,3) = M44(base,2,0) * M44(mult,0,3) + M44(base,2,1) * M44(mult,1,3) + M44(base,2,2) * M44(mult,2,3) + M44(base,2,3) * M44(mult,3,3);
    M44(result,3,3) = M44(base,3,0) * M44(mult,0,3) + M44(base,3,1) * M44(mult,1,3) + M44(base,3,2) * M44(mult,2,3) + M44(base,3,3) * M44(mult,3,3);
}



3. Unrolling the for loops, type 2

Let's change the order of the assignments.

In the version above, the stores into result are sequential; what happens when they become strided (effectively random) accesses instead?


MultMatrix_ExC

void    MultMatrix_ExC( double result[16], const double base[16], const double mult[16] )
{
    M44(result,0,0) = M44(base,0,0) * M44(mult,0,0) + M44(base,0,1) * M44(mult,1,0) + M44(base,0,2) * M44(mult,2,0) + M44(base,0,3) * M44(mult,3,0);
    M44(result,0,1) = M44(base,0,0) * M44(mult,0,1) + M44(base,0,1) * M44(mult,1,1) + M44(base,0,2) * M44(mult,2,1) + M44(base,0,3) * M44(mult,3,1);
    M44(result,0,2) = M44(base,0,0) * M44(mult,0,2) + M44(base,0,1) * M44(mult,1,2) + M44(base,0,2) * M44(mult,2,2) + M44(base,0,3) * M44(mult,3,2);
    M44(result,0,3) = M44(base,0,0) * M44(mult,0,3) + M44(base,0,1) * M44(mult,1,3) + M44(base,0,2) * M44(mult,2,3) + M44(base,0,3) * M44(mult,3,3);

    M44(result,1,0) = M44(base,1,0) * M44(mult,0,0) + M44(base,1,1) * M44(mult,1,0) + M44(base,1,2) * M44(mult,2,0) + M44(base,1,3) * M44(mult,3,0);
    M44(result,1,1) = M44(base,1,0) * M44(mult,0,1) + M44(base,1,1) * M44(mult,1,1) + M44(base,1,2) * M44(mult,2,1) + M44(base,1,3) * M44(mult,3,1);
    M44(result,1,2) = M44(base,1,0) * M44(mult,0,2) + M44(base,1,1) * M44(mult,1,2) + M44(base,1,2) * M44(mult,2,2) + M44(base,1,3) * M44(mult,3,2);
    M44(result,1,3) = M44(base,1,0) * M44(mult,0,3) + M44(base,1,1) * M44(mult,1,3) + M44(base,1,2) * M44(mult,2,3) + M44(base,1,3) * M44(mult,3,3);

    M44(result,2,0) = M44(base,2,0) * M44(mult,0,0) + M44(base,2,1) * M44(mult,1,0) + M44(base,2,2) * M44(mult,2,0) + M44(base,2,3) * M44(mult,3,0);
    M44(result,2,1) = M44(base,2,0) * M44(mult,0,1) + M44(base,2,1) * M44(mult,1,1) + M44(base,2,2) * M44(mult,2,1) + M44(base,2,3) * M44(mult,3,1);
    M44(result,2,2) = M44(base,2,0) * M44(mult,0,2) + M44(base,2,1) * M44(mult,1,2) + M44(base,2,2) * M44(mult,2,2) + M44(base,2,3) * M44(mult,3,2);
    M44(result,2,3) = M44(base,2,0) * M44(mult,0,3) + M44(base,2,1) * M44(mult,1,3) + M44(base,2,2) * M44(mult,2,3) + M44(base,2,3) * M44(mult,3,3);

    M44(result,3,0) = M44(base,3,0) * M44(mult,0,0) + M44(base,3,1) * M44(mult,1,0) + M44(base,3,2) * M44(mult,2,0) + M44(base,3,3) * M44(mult,3,0);
    M44(result,3,1) = M44(base,3,0) * M44(mult,0,1) + M44(base,3,1) * M44(mult,1,1) + M44(base,3,2) * M44(mult,2,1) + M44(base,3,3) * M44(mult,3,1);
    M44(result,3,2) = M44(base,3,0) * M44(mult,0,2) + M44(base,3,1) * M44(mult,1,2) + M44(base,3,2) * M44(mult,2,2) + M44(base,3,3) * M44(mult,3,2);
    M44(result,3,3) = M44(base,3,0) * M44(mult,0,3) + M44(base,3,1) * M44(mult,1,3) + M44(base,3,2) * M44(mult,2,3) + M44(base,3,3) * M44(mult,3,3);
}



4. Using SSE2 instructions

Let's try SSE2.

One instruction operates on two doubles at a time.


MultMatrix_SSE2

void    MultMatrix_SSE2( double result[16], const double base[16], const double mult[16] )
{
    __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    __m128d xmm08, xmm09, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

    // Keep the four columns of base in registers, each split into its
    // low half (rows 0-1) and high half (rows 2-3), so that
    // result = base * mult, matching the scalar versions.
    xmm08 = _mm_loadu_pd( &base[0] );
    xmm09 = _mm_loadu_pd( &base[2] );
    xmm10 = _mm_loadu_pd( &base[4] );
    xmm11 = _mm_loadu_pd( &base[6] );
    xmm12 = _mm_loadu_pd( &base[8] );
    xmm13 = _mm_loadu_pd( &base[10] );
    xmm14 = _mm_loadu_pd( &base[12] );
    xmm15 = _mm_loadu_pd( &base[14] );

    // column0 : result column 0 = sum_i mult(i,0) * (base column i)
    xmm0 = _mm_load1_pd( &mult[0] );
    xmm2 = _mm_load1_pd( &mult[1] );
    xmm4 = _mm_load1_pd( &mult[2] );
    xmm6 = _mm_load1_pd( &mult[3] );

    xmm1 = _mm_mul_pd( xmm0, xmm09 );
    xmm3 = _mm_mul_pd( xmm2, xmm11 );
    xmm5 = _mm_mul_pd( xmm4, xmm13 );
    xmm7 = _mm_mul_pd( xmm6, xmm15 );
    xmm0 = _mm_mul_pd( xmm0, xmm08 );
    xmm2 = _mm_mul_pd( xmm2, xmm10 );
    xmm4 = _mm_mul_pd( xmm4, xmm12 );
    xmm6 = _mm_mul_pd( xmm6, xmm14 );

    xmm0 = _mm_add_pd( xmm0, xmm4 );
    xmm1 = _mm_add_pd( xmm1, xmm5 );
    xmm2 = _mm_add_pd( xmm2, xmm6 );
    xmm3 = _mm_add_pd( xmm3, xmm7 );
    xmm0 = _mm_add_pd( xmm0, xmm2 );
    xmm1 = _mm_add_pd( xmm1, xmm3 );

    _mm_storeu_pd( &result[0], xmm0 );
    _mm_storeu_pd( &result[2], xmm1 );

    // column1
    xmm0 = _mm_load1_pd( &mult[4] );
    xmm2 = _mm_load1_pd( &mult[5] );
    xmm4 = _mm_load1_pd( &mult[6] );
    xmm6 = _mm_load1_pd( &mult[7] );

    xmm1 = _mm_mul_pd( xmm0, xmm09 );
    xmm3 = _mm_mul_pd( xmm2, xmm11 );
    xmm5 = _mm_mul_pd( xmm4, xmm13 );
    xmm7 = _mm_mul_pd( xmm6, xmm15 );
    xmm0 = _mm_mul_pd( xmm0, xmm08 );
    xmm2 = _mm_mul_pd( xmm2, xmm10 );
    xmm4 = _mm_mul_pd( xmm4, xmm12 );
    xmm6 = _mm_mul_pd( xmm6, xmm14 );

    xmm0 = _mm_add_pd( xmm0, xmm4 );
    xmm1 = _mm_add_pd( xmm1, xmm5 );
    xmm2 = _mm_add_pd( xmm2, xmm6 );
    xmm3 = _mm_add_pd( xmm3, xmm7 );
    xmm0 = _mm_add_pd( xmm0, xmm2 );
    xmm1 = _mm_add_pd( xmm1, xmm3 );

    _mm_storeu_pd( &result[4], xmm0 );
    _mm_storeu_pd( &result[6], xmm1 );

    // column2
    xmm0 = _mm_load1_pd( &mult[8] );
    xmm2 = _mm_load1_pd( &mult[9] );
    xmm4 = _mm_load1_pd( &mult[10] );
    xmm6 = _mm_load1_pd( &mult[11] );

    xmm1 = _mm_mul_pd( xmm0, xmm09 );
    xmm3 = _mm_mul_pd( xmm2, xmm11 );
    xmm5 = _mm_mul_pd( xmm4, xmm13 );
    xmm7 = _mm_mul_pd( xmm6, xmm15 );
    xmm0 = _mm_mul_pd( xmm0, xmm08 );
    xmm2 = _mm_mul_pd( xmm2, xmm10 );
    xmm4 = _mm_mul_pd( xmm4, xmm12 );
    xmm6 = _mm_mul_pd( xmm6, xmm14 );

    xmm0 = _mm_add_pd( xmm0, xmm4 );
    xmm1 = _mm_add_pd( xmm1, xmm5 );
    xmm2 = _mm_add_pd( xmm2, xmm6 );
    xmm3 = _mm_add_pd( xmm3, xmm7 );
    xmm0 = _mm_add_pd( xmm0, xmm2 );
    xmm1 = _mm_add_pd( xmm1, xmm3 );

    _mm_storeu_pd( &result[8], xmm0 );
    _mm_storeu_pd( &result[10], xmm1 );

    // column3
    xmm0 = _mm_load1_pd( &mult[12] );
    xmm2 = _mm_load1_pd( &mult[13] );
    xmm4 = _mm_load1_pd( &mult[14] );
    xmm6 = _mm_load1_pd( &mult[15] );

    xmm1 = _mm_mul_pd( xmm0, xmm09 );
    xmm3 = _mm_mul_pd( xmm2, xmm11 );
    xmm5 = _mm_mul_pd( xmm4, xmm13 );
    xmm7 = _mm_mul_pd( xmm6, xmm15 );
    xmm0 = _mm_mul_pd( xmm0, xmm08 );
    xmm2 = _mm_mul_pd( xmm2, xmm10 );
    xmm4 = _mm_mul_pd( xmm4, xmm12 );
    xmm6 = _mm_mul_pd( xmm6, xmm14 );

    xmm0 = _mm_add_pd( xmm0, xmm4 );
    xmm1 = _mm_add_pd( xmm1, xmm5 );
    xmm2 = _mm_add_pd( xmm2, xmm6 );
    xmm3 = _mm_add_pd( xmm3, xmm7 );
    xmm0 = _mm_add_pd( xmm0, xmm2 );
    xmm1 = _mm_add_pd( xmm1, xmm3 );

    _mm_storeu_pd( &result[12], xmm0 );
    _mm_storeu_pd( &result[14], xmm1 );
}



5. Using AVX instructions

Here it is four operations per instruction.


MultMatrix_AVX

void    MultMatrix_AVX( double result[16], const double base[16], const double mult[16] )
{
    __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

    // Keep the four columns of base in registers, so that
    // result = base * mult, matching the scalar versions.
    ymm4 = _mm256_loadu_pd( &base[0] );
    ymm5 = _mm256_loadu_pd( &base[4] );
    ymm6 = _mm256_loadu_pd( &base[8] );
    ymm7 = _mm256_loadu_pd( &base[12] );

    // column0 : result column 0 = sum_i mult(i,0) * (base column i)
    ymm0 = _mm256_broadcast_sd( &mult[0] );
    ymm1 = _mm256_broadcast_sd( &mult[1] );
    ymm2 = _mm256_broadcast_sd( &mult[2] );
    ymm3 = _mm256_broadcast_sd( &mult[3] );
    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm1 = _mm256_mul_pd( ymm1, ymm5 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm3 = _mm256_mul_pd( ymm3, ymm7 );

    ymm0 = _mm256_add_pd( ymm0, ymm1 );
    ymm2 = _mm256_add_pd( ymm2, ymm3 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );
    _mm256_storeu_pd( &result[0], ymm0 );

    // column1
    ymm0 = _mm256_broadcast_sd( &mult[4] );
    ymm1 = _mm256_broadcast_sd( &mult[5] );
    ymm2 = _mm256_broadcast_sd( &mult[6] );
    ymm3 = _mm256_broadcast_sd( &mult[7] );
    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm1 = _mm256_mul_pd( ymm1, ymm5 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm3 = _mm256_mul_pd( ymm3, ymm7 );

    ymm0 = _mm256_add_pd( ymm0, ymm1 );
    ymm2 = _mm256_add_pd( ymm2, ymm3 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );
    _mm256_storeu_pd( &result[4], ymm0 );

    // column2
    ymm0 = _mm256_broadcast_sd( &mult[8] );
    ymm1 = _mm256_broadcast_sd( &mult[9] );
    ymm2 = _mm256_broadcast_sd( &mult[10] );
    ymm3 = _mm256_broadcast_sd( &mult[11] );
    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm1 = _mm256_mul_pd( ymm1, ymm5 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm3 = _mm256_mul_pd( ymm3, ymm7 );

    ymm0 = _mm256_add_pd( ymm0, ymm1 );
    ymm2 = _mm256_add_pd( ymm2, ymm3 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );
    _mm256_storeu_pd( &result[8], ymm0 );

    // column3
    ymm0 = _mm256_broadcast_sd( &mult[12] );
    ymm1 = _mm256_broadcast_sd( &mult[13] );
    ymm2 = _mm256_broadcast_sd( &mult[14] );
    ymm3 = _mm256_broadcast_sd( &mult[15] );
    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm1 = _mm256_mul_pd( ymm1, ymm5 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm3 = _mm256_mul_pd( ymm3, ymm7 );

    ymm0 = _mm256_add_pd( ymm0, ymm1 );
    ymm2 = _mm256_add_pd( ymm2, ymm3 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );
    _mm256_storeu_pd( &result[12], ymm0 );
}



6. Using FMA instructions

This is also four lanes per instruction, but a single fmadd counts as eight floating-point operations (four multiplies plus four adds).


MultMatrix_AVX_FMA

void    MultMatrix_AVX_FMA( double result[16], const double base[16], const double mult[16] )
{
    __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

    // Keep the four columns of base in registers; for each result column,
    // build two independent partial sums and combine them at the end.
    ymm4 = _mm256_loadu_pd( &base[0] );
    ymm5 = _mm256_loadu_pd( &base[4] );
    ymm6 = _mm256_loadu_pd( &base[8] );
    ymm7 = _mm256_loadu_pd( &base[12] );

    // column0
    ymm0 = _mm256_broadcast_sd( &mult[0] );
    ymm1 = _mm256_broadcast_sd( &mult[1] );
    ymm2 = _mm256_broadcast_sd( &mult[2] );
    ymm3 = _mm256_broadcast_sd( &mult[3] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );

    _mm256_storeu_pd( &result[0], ymm0 );

    // column1
    ymm0 = _mm256_broadcast_sd( &mult[4] );
    ymm1 = _mm256_broadcast_sd( &mult[5] );
    ymm2 = _mm256_broadcast_sd( &mult[6] );
    ymm3 = _mm256_broadcast_sd( &mult[7] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );

    _mm256_storeu_pd( &result[4], ymm0 );

    // column2
    ymm0 = _mm256_broadcast_sd( &mult[8] );
    ymm1 = _mm256_broadcast_sd( &mult[9] );
    ymm2 = _mm256_broadcast_sd( &mult[10] );
    ymm3 = _mm256_broadcast_sd( &mult[11] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );

    _mm256_storeu_pd( &result[8], ymm0 );

    // column3
    ymm0 = _mm256_broadcast_sd( &mult[12] );
    ymm1 = _mm256_broadcast_sd( &mult[13] );
    ymm2 = _mm256_broadcast_sd( &mult[14] );
    ymm3 = _mm256_broadcast_sd( &mult[15] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm2 = _mm256_mul_pd( ymm2, ymm6 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
    ymm0 = _mm256_add_pd( ymm0, ymm2 );

    _mm256_storeu_pd( &result[12], ymm0 );
}



7. Slightly changing the FMA computation

Let's change the way the partial sums are accumulated.


MultMatrix_AVX_FMA_type2

void    MultMatrix_AVX_FMA_type2( double result[16], const double base[16], const double mult[16] )
{
    __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

    // Keep the four columns of base in registers; each result column is now
    // built with a single chain of fmadds instead of two partial sums.
    ymm4 = _mm256_loadu_pd( &base[0] );
    ymm5 = _mm256_loadu_pd( &base[4] );
    ymm6 = _mm256_loadu_pd( &base[8] );
    ymm7 = _mm256_loadu_pd( &base[12] );

    // column0
    ymm0 = _mm256_broadcast_sd( &mult[0] );
    ymm1 = _mm256_broadcast_sd( &mult[1] );
    ymm2 = _mm256_broadcast_sd( &mult[2] );
    ymm3 = _mm256_broadcast_sd( &mult[3] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

    _mm256_storeu_pd( &result[0], ymm0 );

    // column1
    ymm0 = _mm256_broadcast_sd( &mult[4] );
    ymm1 = _mm256_broadcast_sd( &mult[5] );
    ymm2 = _mm256_broadcast_sd( &mult[6] );
    ymm3 = _mm256_broadcast_sd( &mult[7] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

    _mm256_storeu_pd( &result[4], ymm0 );

    // column2
    ymm0 = _mm256_broadcast_sd( &mult[8] );
    ymm1 = _mm256_broadcast_sd( &mult[9] );
    ymm2 = _mm256_broadcast_sd( &mult[10] );
    ymm3 = _mm256_broadcast_sd( &mult[11] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

    _mm256_storeu_pd( &result[8], ymm0 );

    // column3
    ymm0 = _mm256_broadcast_sd( &mult[12] );
    ymm1 = _mm256_broadcast_sd( &mult[13] );
    ymm2 = _mm256_broadcast_sd( &mult[14] );
    ymm3 = _mm256_broadcast_sd( &mult[15] );

    ymm0 = _mm256_mul_pd( ymm0, ymm4 );
    ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
    ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

    _mm256_storeu_pd( &result[12], ymm0 );
}
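Before timing anything, it is worth checking that all of the variants produce the same matrix as the plain version. A minimal check along these lines (assuming the functions above are in the same source file; the test values and tolerance are arbitrary):

#include <math.h>
#include <stdio.h>

// Returns 0 when every element of a and b matches within eps.
static int CompareM44( const double a[16], const double b[16], double eps )
{
    for( int i = 0; i < 16; i++ )
    {
        if( fabs( a[i] - b[i] ) > eps ) return 1;
    }
    return 0;
}

int main( void )
{
    double base[16], mult[16], ref[16], simd[16];

    for( int i = 0; i < 16; i++ )
    {
        base[i] = (double)( i + 1 );
        mult[i] = (double)( 16 - i );
    }

    MultMatrix_For( ref, base, mult );
    MultMatrix_AVX_FMA_type2( simd, base, mult );

    printf( CompareM44( ref, simd, 1e-9 ) ? "NG\n" : "OK\n" );
    return 0;
}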



Results

Measurement environment:

CPU : Intel Core i7-6700K (stock clocks)

MEM : DDR4-3000 dual channel (overclocked)

OS : Windows 10 Professional 64bit

IDE : Visual Studio 2015

Compiler options: optimize for speed, x64 (amd64)

Throughput was measured single-threaded (the CPU then runs at 4.2 GHz) and multi-threaded (4.0 GHz).

The multi-threaded runs use 8 threads in parallel, since the CPU has 4 physical / 8 logical cores. A minimal sketch of the timing loop is shown below.
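The per-second numbers were presumably gathered with a simple timing loop; a minimal single-thread sketch using QueryPerformanceCounter (my reconstruction, not the original harness) would look something like this:

#include <Windows.h>
#include <stdio.h>

int main( void )
{
    double base[16], mult[16], result[16];
    for( int i = 0; i < 16; i++ ) { base[i] = i * 0.5; mult[i] = 16.0 - i; }

    const int     loops = 100000000;            // 1e8 matrix multiplications
    LARGE_INTEGER freq, t0, t1;
    QueryPerformanceFrequency( &freq );
    QueryPerformanceCounter( &t0 );

    for( int i = 0; i < loops; i++ )
    {
        MultMatrix_AVX_FMA_type2( result, base, mult );
    }

    QueryPerformanceCounter( &t1 );

    // NOTE: in a real harness, make sure the call is not optimized away
    // (e.g. consume result afterwards).
    double sec = (double)( t1.QuadPart - t0.QuadPart ) / (double)freq.QuadPart;
    printf( "%.1fM calls/sec  %.3f Gflops\n",
            loops / sec / 1e6, loops / sec * 112.0 / 1e9 );
    return 0;
}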

Gflops is the call count per second multiplied by 112, since a single 4x4 matrix multiplication is 112 floating-point operations (16 elements × (4 multiplies + 3 adds)).
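For example, the fastest version works out to 998.4M [calls/sec] × 112 [flops/call] ≈ 111.8 Gflops, which matches the last row of the table.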

Func                     | [calls/sec] @SingleThread | [calls/sec] @8threads | MT scaling | Gflops @8threads
------------------------ | ------------------------- | --------------------- | ---------- | ----------------
MultMatrix_For           | 55.8M                     | 214.6M                | 384%       | 24.035
MultMatrix_ExR           | 60.5M                     | 233.6M                | 386%       | 26.164
MultMatrix_ExC           | 60.1M                     | 233.6M                | 389%       | 26.166
MultMatrix_SSE2          | 109.6M                    | 420.1M                | 383%       | 47.049
MultMatrix_AVX           | 217.9M                    | 837.5M                | 384%       | 93.803
MultMatrix_AVX_FMA       | 244.9M                    | 938.0M                | 383%       | 105.057
MultMatrix_AVX_FMA_type2 | 258.9M                    | 998.4M                | 386%       | 111.818

The expected "MT scaling" figure is

4 cores * 4.0 GHz / 4.2 GHz = 381%

There is nearly a 5x performance gap between the versions with and without SIMD instructions.

In other words, even when CPU utilization shows 100%, plain code may be tapping only about 20% of the CPU's potential.

That is a real waste.