Speeding up 4x4 matrix-matrix multiplication
Each multiplication is such a small unit of work that optimizing it may not matter much in practice, but I gave it a try anyway.
4x4 matrices are used all over 3D programming, in OpenGL, Direct3D, and the like.
Prerequisites
- Matrices are stored in OpenGL order (column-major).
- Everything is compiled with Visual Studio 2015. (I have not checked how gcc or other compilers behave.)
#include <Windows.h>
#include <intrin.h>
// element (row, col) of a 4x4 column-major matrix stored as double[16]
#define M44(base,row,col) base[col*4+row]
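As a quick illustration of that layout (this example is my addition, not part of the original code): because M44(m,row,col) is m[col*4+row], an OpenGL-style translation matrix keeps its translation vector in elements 12, 13 and 14.

```cpp
// Column-major layout check: a translation by (5, 6, 7).
double t[16] = {
    1, 0, 0, 0,   // column 0
    0, 1, 0, 0,   // column 1
    0, 0, 1, 0,   // column 2
    5, 6, 7, 1,   // column 3 holds the translation
};
// M44(t,0,3) == t[3*4+0] == 5, M44(t,1,3) == 6, M44(t,2,3) == 7
```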
1. The straightforward version
This is the code you would most likely write without thinking about performance.
Even here, whether the running sum is kept in a local variable or written straight into result makes a large difference: with a local, the compiler can hold the accumulator in a register, whereas a direct store to result forces it to assume the output may alias the inputs. A sketch of the slower, no-local variant follows the listing below.
void MultMatrix_For( double result[16], const double base[16], const double mult[16] )
{
    for( int r = 0; r < 4; r++ )
    {
        for( int c = 0; c < 4; c++ )
        {
            double temp = 0;
            for( int i = 0; i < 4; i++ )
            {
                temp += M44(base,r,i) * M44(mult,i,c);
            }
            M44(result,r,c) = temp;
        }
    }
}
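For comparison, here is a sketch of the slower shape mentioned above, accumulating directly into result instead of a local variable (this function is my illustration and was not part of the original benchmark; the name is made up):

```cpp
// Without the local accumulator every += goes through the result pointer.
// The compiler must assume result may alias base or mult, so it cannot keep
// the running sum in a register and reloads/stores it each iteration.
void MultMatrix_For_NoTemp( double result[16], const double base[16], const double mult[16] )
{
    for( int r = 0; r < 4; r++ )
    {
        for( int c = 0; c < 4; c++ )
        {
            M44(result,r,c) = 0;
            for( int i = 0; i < 4; i++ )
            {
                M44(result,r,c) += M44(base,r,i) * M44(mult,i,c);
            }
        }
    }
}
```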
2. Unrolling the loops
All three loops are unrolled by hand; result is written in sequential (column-major) order.
void MultMatrix_ExR( double result[16], const double base[16], const double mult[16] )
{
M44(result,0,0) = M44(base,0,0) * M44(mult,0,0) + M44(base,0,1) * M44(mult,1,0) + M44(base,0,2) * M44(mult,2,0) + M44(base,0,3) * M44(mult,3,0);
M44(result,1,0) = M44(base,1,0) * M44(mult,0,0) + M44(base,1,1) * M44(mult,1,0) + M44(base,1,2) * M44(mult,2,0) + M44(base,1,3) * M44(mult,3,0);
M44(result,2,0) = M44(base,2,0) * M44(mult,0,0) + M44(base,2,1) * M44(mult,1,0) + M44(base,2,2) * M44(mult,2,0) + M44(base,2,3) * M44(mult,3,0);
M44(result,3,0) = M44(base,3,0) * M44(mult,0,0) + M44(base,3,1) * M44(mult,1,0) + M44(base,3,2) * M44(mult,2,0) + M44(base,3,3) * M44(mult,3,0);
M44(result,0,1) = M44(base,0,0) * M44(mult,0,1) + M44(base,0,1) * M44(mult,1,1) + M44(base,0,2) * M44(mult,2,1) + M44(base,0,3) * M44(mult,3,1);
M44(result,1,1) = M44(base,1,0) * M44(mult,0,1) + M44(base,1,1) * M44(mult,1,1) + M44(base,1,2) * M44(mult,2,1) + M44(base,1,3) * M44(mult,3,1);
M44(result,2,1) = M44(base,2,0) * M44(mult,0,1) + M44(base,2,1) * M44(mult,1,1) + M44(base,2,2) * M44(mult,2,1) + M44(base,2,3) * M44(mult,3,1);
M44(result,3,1) = M44(base,3,0) * M44(mult,0,1) + M44(base,3,1) * M44(mult,1,1) + M44(base,3,2) * M44(mult,2,1) + M44(base,3,3) * M44(mult,3,1);
M44(result,0,2) = M44(base,0,0) * M44(mult,0,2) + M44(base,0,1) * M44(mult,1,2) + M44(base,0,2) * M44(mult,2,2) + M44(base,0,3) * M44(mult,3,2);
M44(result,1,2) = M44(base,1,0) * M44(mult,0,2) + M44(base,1,1) * M44(mult,1,2) + M44(base,1,2) * M44(mult,2,2) + M44(base,1,3) * M44(mult,3,2);
M44(result,2,2) = M44(base,2,0) * M44(mult,0,2) + M44(base,2,1) * M44(mult,1,2) + M44(base,2,2) * M44(mult,2,2) + M44(base,2,3) * M44(mult,3,2);
M44(result,3,2) = M44(base,3,0) * M44(mult,0,2) + M44(base,3,1) * M44(mult,1,2) + M44(base,3,2) * M44(mult,2,2) + M44(base,3,3) * M44(mult,3,2);
M44(result,0,3) = M44(base,0,0) * M44(mult,0,3) + M44(base,0,1) * M44(mult,1,3) + M44(base,0,2) * M44(mult,2,3) + M44(base,0,3) * M44(mult,3,3);
M44(result,1,3) = M44(base,1,0) * M44(mult,0,3) + M44(base,1,1) * M44(mult,1,3) + M44(base,1,2) * M44(mult,2,3) + M44(base,1,3) * M44(mult,3,3);
M44(result,2,3) = M44(base,2,0) * M44(mult,0,3) + M44(base,2,1) * M44(mult,1,3) + M44(base,2,2) * M44(mult,2,3) + M44(base,2,3) * M44(mult,3,3);
M44(result,3,3) = M44(base,3,0) * M44(mult,0,3) + M44(base,3,1) * M44(mult,1,3) + M44(base,3,2) * M44(mult,2,3) + M44(base,3,3) * M44(mult,3,3);
}
3. Unrolling the loops, take 2
Same unrolled code, different statement order.
In the previous version the writes to result are sequential; here they are reordered into a strided (non-sequential) pattern to see what that costs.
void MultMatrix_ExC( double result[16], const double base[16], const double mult[16] )
{
M44(result,0,0) = M44(base,0,0) * M44(mult,0,0) + M44(base,0,1) * M44(mult,1,0) + M44(base,0,2) * M44(mult,2,0) + M44(base,0,3) * M44(mult,3,0);
M44(result,0,1) = M44(base,0,0) * M44(mult,0,1) + M44(base,0,1) * M44(mult,1,1) + M44(base,0,2) * M44(mult,2,1) + M44(base,0,3) * M44(mult,3,1);
M44(result,0,2) = M44(base,0,0) * M44(mult,0,2) + M44(base,0,1) * M44(mult,1,2) + M44(base,0,2) * M44(mult,2,2) + M44(base,0,3) * M44(mult,3,2);
M44(result,0,3) = M44(base,0,0) * M44(mult,0,3) + M44(base,0,1) * M44(mult,1,3) + M44(base,0,2) * M44(mult,2,3) + M44(base,0,3) * M44(mult,3,3);
M44(result,1,0) = M44(base,1,0) * M44(mult,0,0) + M44(base,1,1) * M44(mult,1,0) + M44(base,1,2) * M44(mult,2,0) + M44(base,1,3) * M44(mult,3,0);
M44(result,1,1) = M44(base,1,0) * M44(mult,0,1) + M44(base,1,1) * M44(mult,1,1) + M44(base,1,2) * M44(mult,2,1) + M44(base,1,3) * M44(mult,3,1);
M44(result,1,2) = M44(base,1,0) * M44(mult,0,2) + M44(base,1,1) * M44(mult,1,2) + M44(base,1,2) * M44(mult,2,2) + M44(base,1,3) * M44(mult,3,2);
M44(result,1,3) = M44(base,1,0) * M44(mult,0,3) + M44(base,1,1) * M44(mult,1,3) + M44(base,1,2) * M44(mult,2,3) + M44(base,1,3) * M44(mult,3,3);
M44(result,2,0) = M44(base,2,0) * M44(mult,0,0) + M44(base,2,1) * M44(mult,1,0) + M44(base,2,2) * M44(mult,2,0) + M44(base,2,3) * M44(mult,3,0);
M44(result,2,1) = M44(base,2,0) * M44(mult,0,1) + M44(base,2,1) * M44(mult,1,1) + M44(base,2,2) * M44(mult,2,1) + M44(base,2,3) * M44(mult,3,1);
M44(result,2,2) = M44(base,2,0) * M44(mult,0,2) + M44(base,2,1) * M44(mult,1,2) + M44(base,2,2) * M44(mult,2,2) + M44(base,2,3) * M44(mult,3,2);
M44(result,2,3) = M44(base,2,0) * M44(mult,0,3) + M44(base,2,1) * M44(mult,1,3) + M44(base,2,2) * M44(mult,2,3) + M44(base,2,3) * M44(mult,3,3);
M44(result,3,0) = M44(base,3,0) * M44(mult,0,0) + M44(base,3,1) * M44(mult,1,0) + M44(base,3,2) * M44(mult,2,0) + M44(base,3,3) * M44(mult,3,0);
M44(result,3,1) = M44(base,3,0) * M44(mult,0,1) + M44(base,3,1) * M44(mult,1,1) + M44(base,3,2) * M44(mult,2,1) + M44(base,3,3) * M44(mult,3,1);
M44(result,3,2) = M44(base,3,0) * M44(mult,0,2) + M44(base,3,1) * M44(mult,1,2) + M44(base,3,2) * M44(mult,2,2) + M44(base,3,3) * M44(mult,3,2);
M44(result,3,3) = M44(base,3,0) * M44(mult,0,3) + M44(base,3,1) * M44(mult,1,3) + M44(base,3,2) * M44(mult,2,3) + M44(base,3,3) * M44(mult,3,3);
}
4. Using SSE2 instructions
SSE2 works on two doubles per register, so one instruction performs two operations.
void MultMatrix_SSE2( double result[16], const double base[16], const double mult[16] )
{
__m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
__m128d xmm08, xmm09, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
xmm08 = _mm_loadu_pd( &base[0] );
xmm09 = _mm_loadu_pd( &base[2] );
xmm10 = _mm_loadu_pd( &base[4] );
xmm11 = _mm_loadu_pd( &base[6] );
xmm12 = _mm_loadu_pd( &base[8] );
xmm13 = _mm_loadu_pd( &base[10] );
xmm14 = _mm_loadu_pd( &base[12] );
xmm15 = _mm_loadu_pd( &base[14] );
// column0
xmm0 = _mm_load1_pd( &mult[0] );
xmm2 = _mm_load1_pd( &mult[1] );
xmm4 = _mm_load1_pd( &mult[2] );
xmm6 = _mm_load1_pd( &mult[3] );
xmm1 = _mm_mul_pd( xmm0, xmm09 );
xmm3 = _mm_mul_pd( xmm2, xmm11 );
xmm5 = _mm_mul_pd( xmm4, xmm13 );
xmm7 = _mm_mul_pd( xmm6, xmm15 );
xmm0 = _mm_mul_pd( xmm0, xmm08 );
xmm2 = _mm_mul_pd( xmm2, xmm10 );
xmm4 = _mm_mul_pd( xmm4, xmm12 );
xmm6 = _mm_mul_pd( xmm6, xmm14 );
xmm0 = _mm_add_pd( xmm0, xmm4 );
xmm1 = _mm_add_pd( xmm1, xmm5 );
xmm2 = _mm_add_pd( xmm2, xmm6 );
xmm3 = _mm_add_pd( xmm3, xmm7 );
xmm0 = _mm_add_pd( xmm0, xmm2 );
xmm1 = _mm_add_pd( xmm1, xmm3 );
_mm_storeu_pd( &result[0], xmm0 );
_mm_storeu_pd( &result[2], xmm1 );
// column1
xmm0 = _mm_load1_pd( &mult[4] );
xmm2 = _mm_load1_pd( &mult[5] );
xmm4 = _mm_load1_pd( &mult[6] );
xmm6 = _mm_load1_pd( &mult[7] );
xmm1 = _mm_mul_pd( xmm0, xmm09 );
xmm3 = _mm_mul_pd( xmm2, xmm11 );
xmm5 = _mm_mul_pd( xmm4, xmm13 );
xmm7 = _mm_mul_pd( xmm6, xmm15 );
xmm0 = _mm_mul_pd( xmm0, xmm08 );
xmm2 = _mm_mul_pd( xmm2, xmm10 );
xmm4 = _mm_mul_pd( xmm4, xmm12 );
xmm6 = _mm_mul_pd( xmm6, xmm14 );
xmm0 = _mm_add_pd( xmm0, xmm4 );
xmm1 = _mm_add_pd( xmm1, xmm5 );
xmm2 = _mm_add_pd( xmm2, xmm6 );
xmm3 = _mm_add_pd( xmm3, xmm7 );
xmm0 = _mm_add_pd( xmm0, xmm2 );
xmm1 = _mm_add_pd( xmm1, xmm3 );
_mm_storeu_pd( &result[4], xmm0 );
_mm_storeu_pd( &result[6], xmm1 );
// column2
xmm0 = _mm_load1_pd( &mult[8] );
xmm2 = _mm_load1_pd( &mult[9] );
xmm4 = _mm_load1_pd( &mult[10] );
xmm6 = _mm_load1_pd( &mult[11] );
xmm1 = _mm_mul_pd( xmm0, xmm09 );
xmm3 = _mm_mul_pd( xmm2, xmm11 );
xmm5 = _mm_mul_pd( xmm4, xmm13 );
xmm7 = _mm_mul_pd( xmm6, xmm15 );
xmm0 = _mm_mul_pd( xmm0, xmm08 );
xmm2 = _mm_mul_pd( xmm2, xmm10 );
xmm4 = _mm_mul_pd( xmm4, xmm12 );
xmm6 = _mm_mul_pd( xmm6, xmm14 );
xmm0 = _mm_add_pd( xmm0, xmm4 );
xmm1 = _mm_add_pd( xmm1, xmm5 );
xmm2 = _mm_add_pd( xmm2, xmm6 );
xmm3 = _mm_add_pd( xmm3, xmm7 );
xmm0 = _mm_add_pd( xmm0, xmm2 );
xmm1 = _mm_add_pd( xmm1, xmm3 );
_mm_storeu_pd( &result[8], xmm0 );
_mm_storeu_pd( &result[10], xmm1 );
// column3
xmm0 = _mm_load1_pd( &mult[12] );
xmm2 = _mm_load1_pd( &mult[13] );
xmm4 = _mm_load1_pd( &mult[14] );
xmm6 = _mm_load1_pd( &mult[15] );
xmm1 = _mm_mul_pd( xmm0, xmm09 );
xmm3 = _mm_mul_pd( xmm2, xmm11 );
xmm5 = _mm_mul_pd( xmm4, xmm13 );
xmm7 = _mm_mul_pd( xmm6, xmm15 );
xmm0 = _mm_mul_pd( xmm0, xmm08 );
xmm2 = _mm_mul_pd( xmm2, xmm10 );
xmm4 = _mm_mul_pd( xmm4, xmm12 );
xmm6 = _mm_mul_pd( xmm6, xmm14 );
xmm0 = _mm_add_pd( xmm0, xmm4 );
xmm1 = _mm_add_pd( xmm1, xmm5 );
xmm2 = _mm_add_pd( xmm2, xmm6 );
xmm3 = _mm_add_pd( xmm3, xmm7 );
xmm0 = _mm_add_pd( xmm0, xmm2 );
xmm1 = _mm_add_pd( xmm1, xmm3 );
_mm_storeu_pd( &result[12], xmm0 );
_mm_storeu_pd( &result[14], xmm1 );
}
5. Using AVX instructions
AVX registers hold four doubles, so one instruction performs four operations.
void MultMatrix_AVX( double result[16], const double base[16], const double mult[16] )
{
__m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
ymm4 = _mm256_loadu_pd( &base[0] );
ymm5 = _mm256_loadu_pd( &base[4] );
ymm6 = _mm256_loadu_pd( &base[8] );
ymm7 = _mm256_loadu_pd( &base[12] );
// column0
ymm0 = _mm256_broadcast_sd( &mult[0] );
ymm1 = _mm256_broadcast_sd( &mult[1] );
ymm2 = _mm256_broadcast_sd( &mult[2] );
ymm3 = _mm256_broadcast_sd( &mult[3] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm1 = _mm256_mul_pd( ymm1, ymm5 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm3 = _mm256_mul_pd( ymm3, ymm7 );
ymm0 = _mm256_add_pd( ymm0, ymm1 );
ymm2 = _mm256_add_pd( ymm2, ymm3 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[0], ymm0 );
// column1
ymm0 = _mm256_broadcast_sd( &mult[4] );
ymm1 = _mm256_broadcast_sd( &mult[5] );
ymm2 = _mm256_broadcast_sd( &mult[6] );
ymm3 = _mm256_broadcast_sd( &mult[7] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm1 = _mm256_mul_pd( ymm1, ymm5 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm3 = _mm256_mul_pd( ymm3, ymm7 );
ymm0 = _mm256_add_pd( ymm0, ymm1 );
ymm2 = _mm256_add_pd( ymm2, ymm3 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[4], ymm0 );
// column2
ymm0 = _mm256_broadcast_sd( &mult[8] );
ymm1 = _mm256_broadcast_sd( &mult[9] );
ymm2 = _mm256_broadcast_sd( &mult[10] );
ymm3 = _mm256_broadcast_sd( &mult[11] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm1 = _mm256_mul_pd( ymm1, ymm5 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm3 = _mm256_mul_pd( ymm3, ymm7 );
ymm0 = _mm256_add_pd( ymm0, ymm1 );
ymm2 = _mm256_add_pd( ymm2, ymm3 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[8], ymm0 );
// column3
ymm0 = _mm256_broadcast_sd( &mult[12] );
ymm1 = _mm256_broadcast_sd( &mult[13] );
ymm2 = _mm256_broadcast_sd( &mult[14] );
ymm3 = _mm256_broadcast_sd( &mult[15] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm1 = _mm256_mul_pd( ymm1, ymm5 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm3 = _mm256_mul_pd( ymm3, ymm7 );
ymm0 = _mm256_add_pd( ymm0, ymm1 );
ymm2 = _mm256_add_pd( ymm2, ymm3 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[12], ymm0 );
}
6. Using FMA instructions
Still four lanes per instruction, but a single fmadd counts as eight floating-point operations (four multiplies plus four adds). FMA needs a fairly recent CPU; a sketch of a runtime feature check follows the listing.
void MultMatrix_AVX_FMA( double result[16], const double base[16], const double mult[16] )
{
__m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
ymm4 = _mm256_loadu_pd( &base[0] );
ymm5 = _mm256_loadu_pd( &base[4] );
ymm6 = _mm256_loadu_pd( &base[8] );
ymm7 = _mm256_loadu_pd( &base[12] );
// column0
ymm0 = _mm256_broadcast_sd( &mult[0] );
ymm1 = _mm256_broadcast_sd( &mult[1] );
ymm2 = _mm256_broadcast_sd( &mult[2] );
ymm3 = _mm256_broadcast_sd( &mult[3] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[0], ymm0 );
// column1
ymm0 = _mm256_broadcast_sd( &mult[4] );
ymm1 = _mm256_broadcast_sd( &mult[5] );
ymm2 = _mm256_broadcast_sd( &mult[6] );
ymm3 = _mm256_broadcast_sd( &mult[7] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[4], ymm0 );
// column2
ymm0 = _mm256_broadcast_sd( &mult[8] );
ymm1 = _mm256_broadcast_sd( &mult[9] );
ymm2 = _mm256_broadcast_sd( &mult[10] );
ymm3 = _mm256_broadcast_sd( &mult[11] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[8], ymm0 );
// column3
ymm0 = _mm256_broadcast_sd( &mult[12] );
ymm1 = _mm256_broadcast_sd( &mult[13] );
ymm2 = _mm256_broadcast_sd( &mult[14] );
ymm3 = _mm256_broadcast_sd( &mult[15] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm2 = _mm256_mul_pd( ymm2, ymm6 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm2 = _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
ymm0 = _mm256_add_pd( ymm0, ymm2 );
_mm256_storeu_pd( &result[12], ymm0 );
}
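The FMA variants will fault with an illegal-instruction exception on CPUs older than Haswell, so production code would dispatch at runtime. A rough MSVC-only sketch of such a check (my addition; the helper name is made up):

```cpp
#include <intrin.h>

// Does the CPU report AVX and FMA, and has the OS enabled YMM state saving?
static bool CanUseAvxFma()
{
    int info[4];
    __cpuid( info, 1 );
    const bool osxsave = ( info[2] & ( 1 << 27 ) ) != 0;  // OS uses XSAVE
    const bool avx     = ( info[2] & ( 1 << 28 ) ) != 0;
    const bool fma     = ( info[2] & ( 1 << 12 ) ) != 0;
    if( !osxsave || !avx || !fma )
        return false;
    // XCR0 bits 1 and 2: XMM and YMM register state enabled by the OS.
    const unsigned long long xcr0 = _xgetbv( 0 );
    return ( xcr0 & 0x6 ) == 0x6;
}
```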
7. A small variation on the FMA version
Change how the partial sums are accumulated: all four terms are folded into ymm0 with a chain of fmadds, removing the final add at the cost of a longer dependency chain.
void MultMatrix_AVX_FMA_type2( double result[16], const double base[16], const double mult[16] )
{
__m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
ymm4 = _mm256_loadu_pd( &base[0] );
ymm5 = _mm256_loadu_pd( &base[4] );
ymm6 = _mm256_loadu_pd( &base[8] );
ymm7 = _mm256_loadu_pd( &base[12] );
// column0
ymm0 = _mm256_broadcast_sd( &mult[0] );
ymm1 = _mm256_broadcast_sd( &mult[1] );
ymm2 = _mm256_broadcast_sd( &mult[2] );
ymm3 = _mm256_broadcast_sd( &mult[3] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );
_mm256_storeu_pd( &result[0], ymm0 );
// column1
ymm0 = _mm256_broadcast_sd( &mult[4] );
ymm1 = _mm256_broadcast_sd( &mult[5] );
ymm2 = _mm256_broadcast_sd( &mult[6] );
ymm3 = _mm256_broadcast_sd( &mult[7] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );
_mm256_storeu_pd( &result[4], ymm0 );
// column2
ymm0 = _mm256_broadcast_sd( &mult[8] );
ymm1 = _mm256_broadcast_sd( &mult[9] );
ymm2 = _mm256_broadcast_sd( &mult[10] );
ymm3 = _mm256_broadcast_sd( &mult[11] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );
_mm256_storeu_pd( &result[8], ymm0 );
// column3
ymm0 = _mm256_broadcast_sd( &mult[12] );
ymm1 = _mm256_broadcast_sd( &mult[13] );
ymm2 = _mm256_broadcast_sd( &mult[14] );
ymm3 = _mm256_broadcast_sd( &mult[15] );
ymm0 = _mm256_mul_pd( ymm0, ymm4 );
ymm0 = _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
ymm0 = _mm256_fmadd_pd( ymm3, ymm7, ymm0 );
_mm256_storeu_pd( &result[12], ymm0 );
}
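Before timing anything it is worth confirming that every variant produces the same matrix as the plain loop version. A minimal sanity check might look like this (my addition; the random inputs and the tolerance are assumptions, and a tolerance is needed because fmadd rounds once instead of twice):

```cpp
#include <cmath>
#include <cstdlib>

// Compare one variant against the scalar reference MultMatrix_For.
static bool SameResult( void (*f)( double*, const double*, const double* ) )
{
    double base[16], mult[16], expect[16], actual[16];
    for( int i = 0; i < 16; i++ )
    {
        base[i] = rand() / (double)RAND_MAX;
        mult[i] = rand() / (double)RAND_MAX;
    }
    MultMatrix_For( expect, base, mult );
    f( actual, base, mult );
    for( int i = 0; i < 16; i++ )
    {
        if( fabs( expect[i] - actual[i] ) > 1e-12 )
            return false;  // FMA results may differ in the last bits, hence the tolerance
    }
    return true;
}

// e.g. SameResult( MultMatrix_SSE2 ) && SameResult( MultMatrix_AVX_FMA_type2 )
```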
Results
Measurement environment:
- CPU: Intel Core i7-6700K (stock clocks)
- MEM: DDR4-3000 dual channel (overclocked)
- OS: Windows 10 Professional 64bit
- IDE: Visual Studio 2015
- Compiler options: Maximize Speed (/O2), x64 (amd64)
Throughput was measured single-threaded (the CPU turbos to 4.2 GHz) and multi-threaded (it runs at 4.0 GHz).
The multi-threaded runs use 8 threads, matching the 4 physical / 8 logical cores.
Gflops is the number of multiplications per second times 112, since one 4x4 matrix multiply takes 112 floating-point operations (64 multiplies + 48 adds).
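The timing driver itself is not shown in the original post; the numbers below come from this kind of loop. Here is a minimal single-thread sketch (my addition: the iteration count, the input values, and the sink trick that keeps the call from being optimized away are all assumptions), using QueryPerformanceCounter:

```cpp
#include <Windows.h>
#include <cstdio>

// Measure one variant: multiplications per second and derived Gflops.
static void Measure( const char* name, void (*f)( double*, const double*, const double* ) )
{
    double base[16], mult[16], result[16];
    for( int i = 0; i < 16; i++ ) { base[i] = 1.0 + i * 0.01; mult[i] = 1.0 - i * 0.01; }

    const int N = 100000000;          // enough iterations for a stable reading
    LARGE_INTEGER freq, t0, t1;
    QueryPerformanceFrequency( &freq );
    QueryPerformanceCounter( &t0 );
    double sink = 0.0;
    for( int i = 0; i < N; i++ )
    {
        f( result, base, mult );
        sink += result[0];            // keep the result live
    }
    QueryPerformanceCounter( &t1 );

    const double sec = (double)( t1.QuadPart - t0.QuadPart ) / (double)freq.QuadPart;
    printf( "%s : %.1fM mults/sec, %.3f Gflops (sink=%g)\n",
            name, N / sec / 1e6, N / sec * 112.0 / 1e9, sink );
}
```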
Func | Mults/sec @ 1 thread | Mults/sec @ 8 threads | Multithread scaling | Gflops @ 8 threads |
---|---|---|---|---|
MultMatrix_For | 55.8M | 214.6M | 384% | 24.035 |
MultMatrix_ExR | 60.5M | 233.6M | 386% | 26.164 |
MultMatrix_ExC | 60.1M | 233.6M | 389% | 26.166 |
MultMatrix_SSE2 | 109.6M | 420.1M | 383% | 47.049 |
MultMatrix_AVX | 217.9M | 837.5M | 384% | 93.803 |
MultMatrix_AVX_FMA | 244.9M | 938.0M | 383% | 105.057 |
MultMatrix_AVX_FMA_type2 | 258.9M | 998.4M | 386% | 111.818 |
The expected multithread scaling is
4 cores * 4.0 GHz / 4.2 GHz = 381%
which the measured 383-389% matches, so every variant scales about as well as the hardware allows.
There is nearly a 5x performance gap between the scalar versions and the SIMD versions.
In other words, even when CPU usage reads 100%, plain scalar code may be using only about 20% of the machine's potential.
That is a real waste.