
Posted at 2017-07-06

Speeding up 4x4 matrix multiplication

The unit of work is tiny, so this may not matter much in practice, but I tried speeding it up anyway.
4x4 matrices are used heavily in 3D processing, for example in OpenGL and Direct3D.

Assumptions

  • The 4x4 matrices are stored in OpenGL order (column-major).
  • Everything is compiled with Visual Studio 2015. (I have not checked how gcc and friends behave; see the note just below.)
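For reference, gcc and clang expose the same intrinsics through <immintrin.h>, but AVX and FMA code generation must be enabled explicitly (the multiplication routines themselves do not need <Windows.h>). Something along these lines should work, though I have not benchmarked it, and matmul.cpp is just a placeholder file name:

g++ -O2 -mavx -mfma matmul.cpp -o matmul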
Common code
#include <Windows.h>
#include <intrin.h>

#define	M44(base,row,col)	base[col*4+row]
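
As a quick sanity check of the column-major convention: element (row r, column c) lives at index c*4+r, so for example the translation part of an OpenGL matrix occupies indices 12 through 14. A minimal check sketch:

#include <assert.h>

// Sketch: relies on the M44 macro defined above.
static void	LayoutCheck( void )
{
	double	m[16];

	for( int i = 0; i < 16; i++ )
		m[i]	= (double)i;

	assert( M44(m,1,2) == 9.0 );		// (row 1, column 2) -> index 2*4+1 = 9
	assert( &M44(m,0,3) == &m[12] );	// column 3 starts at index 12 (translation in OpenGL)
}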

1. Plain version

This is roughly the code you would write without thinking about it.
Even so, performance differs a lot depending on whether or not the running sum is first accumulated in a local variable.

MultMatrix_For
void	MultMatrix_For( double result[16], const double base[16], const double mult[16] )
{
	for( int r = 0; r < 4; r++ )
	{
		for( int c = 0; c < 4; c++ )
		{
			double	temp	= 0;

			for( int i = 0; i < 4; i++ )
			{
				temp	+= M44(base,r,i) * M44(mult,i,c);
			}

			M44(result,r,c)	= temp;
		}
	}
}
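
For comparison, the variant without the local accumulator mentioned above would look something like the following sketch. Writing the running sum straight into result forces a store on every iteration, and because result could alias base or mult, the compiler also has to be conservative about reloading the inputs.

MultMatrix_For_NoTemp (sketch)
void	MultMatrix_For_NoTemp( double result[16], const double base[16], const double mult[16] )
{
	for( int r = 0; r < 4; r++ )
	{
		for( int c = 0; c < 4; c++ )
		{
			M44(result,r,c)	= 0;

			for( int i = 0; i < 4; i++ )
			{
				// The running sum is kept in memory instead of a local variable.
				M44(result,r,c)	+= M44(base,r,i) * M44(mult,i,c);
			}
		}
	}
}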

2. Unrolling the for loops

MultMatrix_ExR
void	MultMatrix_ExR( double result[16], const double base[16], const double mult[16] )
{
	M44(result,0,0)	= M44(base,0,0) * M44(mult,0,0) + M44(base,0,1) * M44(mult,1,0) + M44(base,0,2) * M44(mult,2,0) + M44(base,0,3) * M44(mult,3,0);
	M44(result,1,0)	= M44(base,1,0) * M44(mult,0,0) + M44(base,1,1) * M44(mult,1,0) + M44(base,1,2) * M44(mult,2,0) + M44(base,1,3) * M44(mult,3,0);
	M44(result,2,0)	= M44(base,2,0) * M44(mult,0,0) + M44(base,2,1) * M44(mult,1,0) + M44(base,2,2) * M44(mult,2,0) + M44(base,2,3) * M44(mult,3,0);
	M44(result,3,0)	= M44(base,3,0) * M44(mult,0,0) + M44(base,3,1) * M44(mult,1,0) + M44(base,3,2) * M44(mult,2,0) + M44(base,3,3) * M44(mult,3,0);

	M44(result,0,1)	= M44(base,0,0) * M44(mult,0,1) + M44(base,0,1) * M44(mult,1,1) + M44(base,0,2) * M44(mult,2,1) + M44(base,0,3) * M44(mult,3,1);
	M44(result,1,1)	= M44(base,1,0) * M44(mult,0,1) + M44(base,1,1) * M44(mult,1,1) + M44(base,1,2) * M44(mult,2,1) + M44(base,1,3) * M44(mult,3,1);
	M44(result,2,1)	= M44(base,2,0) * M44(mult,0,1) + M44(base,2,1) * M44(mult,1,1) + M44(base,2,2) * M44(mult,2,1) + M44(base,2,3) * M44(mult,3,1);
	M44(result,3,1)	= M44(base,3,0) * M44(mult,0,1) + M44(base,3,1) * M44(mult,1,1) + M44(base,3,2) * M44(mult,2,1) + M44(base,3,3) * M44(mult,3,1);

	M44(result,0,2)	= M44(base,0,0) * M44(mult,0,2) + M44(base,0,1) * M44(mult,1,2) + M44(base,0,2) * M44(mult,2,2) + M44(base,0,3) * M44(mult,3,2);
	M44(result,1,2)	= M44(base,1,0) * M44(mult,0,2) + M44(base,1,1) * M44(mult,1,2) + M44(base,1,2) * M44(mult,2,2) + M44(base,1,3) * M44(mult,3,2);
	M44(result,2,2)	= M44(base,2,0) * M44(mult,0,2) + M44(base,2,1) * M44(mult,1,2) + M44(base,2,2) * M44(mult,2,2) + M44(base,2,3) * M44(mult,3,2);
	M44(result,3,2)	= M44(base,3,0) * M44(mult,0,2) + M44(base,3,1) * M44(mult,1,2) + M44(base,3,2) * M44(mult,2,2) + M44(base,3,3) * M44(mult,3,2);

	M44(result,0,3)	= M44(base,0,0) * M44(mult,0,3) + M44(base,0,1) * M44(mult,1,3) + M44(base,0,2) * M44(mult,2,3) + M44(base,0,3) * M44(mult,3,3);
	M44(result,1,3)	= M44(base,1,0) * M44(mult,0,3) + M44(base,1,1) * M44(mult,1,3) + M44(base,1,2) * M44(mult,2,3) + M44(base,1,3) * M44(mult,3,3);
	M44(result,2,3)	= M44(base,2,0) * M44(mult,0,3) + M44(base,2,1) * M44(mult,1,3) + M44(base,2,2) * M44(mult,2,3) + M44(base,2,3) * M44(mult,3,3);
	M44(result,3,3)	= M44(base,3,0) * M44(mult,0,3) + M44(base,3,1) * M44(mult,1,3) + M44(base,3,2) * M44(mult,2,3) + M44(base,3,3) * M44(mult,3,3);
}

3. Unrolling the for loops, type 2

Let's change the order.
In the version above the stores into result are sequential in memory; what happens if they are made non-sequential (strided) instead?

MultMatrix_ExC
void	MultMatrix_ExC( double result[16], const double base[16], const double mult[16] )
{
	M44(result,0,0)	= M44(base,0,0) * M44(mult,0,0) + M44(base,0,1) * M44(mult,1,0) + M44(base,0,2) * M44(mult,2,0) + M44(base,0,3) * M44(mult,3,0);
	M44(result,0,1)	= M44(base,0,0) * M44(mult,0,1) + M44(base,0,1) * M44(mult,1,1) + M44(base,0,2) * M44(mult,2,1) + M44(base,0,3) * M44(mult,3,1);
	M44(result,0,2)	= M44(base,0,0) * M44(mult,0,2) + M44(base,0,1) * M44(mult,1,2) + M44(base,0,2) * M44(mult,2,2) + M44(base,0,3) * M44(mult,3,2);
	M44(result,0,3)	= M44(base,0,0) * M44(mult,0,3) + M44(base,0,1) * M44(mult,1,3) + M44(base,0,2) * M44(mult,2,3) + M44(base,0,3) * M44(mult,3,3);

	M44(result,1,0)	= M44(base,1,0) * M44(mult,0,0) + M44(base,1,1) * M44(mult,1,0) + M44(base,1,2) * M44(mult,2,0) + M44(base,1,3) * M44(mult,3,0);
	M44(result,1,1)	= M44(base,1,0) * M44(mult,0,1) + M44(base,1,1) * M44(mult,1,1) + M44(base,1,2) * M44(mult,2,1) + M44(base,1,3) * M44(mult,3,1);
	M44(result,1,2)	= M44(base,1,0) * M44(mult,0,2) + M44(base,1,1) * M44(mult,1,2) + M44(base,1,2) * M44(mult,2,2) + M44(base,1,3) * M44(mult,3,2);
	M44(result,1,3)	= M44(base,1,0) * M44(mult,0,3) + M44(base,1,1) * M44(mult,1,3) + M44(base,1,2) * M44(mult,2,3) + M44(base,1,3) * M44(mult,3,3);

	M44(result,2,0)	= M44(base,2,0) * M44(mult,0,0) + M44(base,2,1) * M44(mult,1,0) + M44(base,2,2) * M44(mult,2,0) + M44(base,2,3) * M44(mult,3,0);
	M44(result,2,1)	= M44(base,2,0) * M44(mult,0,1) + M44(base,2,1) * M44(mult,1,1) + M44(base,2,2) * M44(mult,2,1) + M44(base,2,3) * M44(mult,3,1);
	M44(result,2,2)	= M44(base,2,0) * M44(mult,0,2) + M44(base,2,1) * M44(mult,1,2) + M44(base,2,2) * M44(mult,2,2) + M44(base,2,3) * M44(mult,3,2);
	M44(result,2,3)	= M44(base,2,0) * M44(mult,0,3) + M44(base,2,1) * M44(mult,1,3) + M44(base,2,2) * M44(mult,2,3) + M44(base,2,3) * M44(mult,3,3);

	M44(result,3,0)	= M44(base,3,0) * M44(mult,0,0) + M44(base,3,1) * M44(mult,1,0) + M44(base,3,2) * M44(mult,2,0) + M44(base,3,3) * M44(mult,3,0);
	M44(result,3,1)	= M44(base,3,0) * M44(mult,0,1) + M44(base,3,1) * M44(mult,1,1) + M44(base,3,2) * M44(mult,2,1) + M44(base,3,3) * M44(mult,3,1);
	M44(result,3,2)	= M44(base,3,0) * M44(mult,0,2) + M44(base,3,1) * M44(mult,1,2) + M44(base,3,2) * M44(mult,2,2) + M44(base,3,3) * M44(mult,3,2);
	M44(result,3,3)	= M44(base,3,0) * M44(mult,0,3) + M44(base,3,1) * M44(mult,1,3) + M44(base,3,2) * M44(mult,2,3) + M44(base,3,3) * M44(mult,3,3);
}

4. Using SSE2 instructions

Let's try SSE2.
One instruction now performs two double-precision operations.
Note that the SIMD versions below build each result column as a linear combination of the columns of mult, weighted by the corresponding column of base; as written, this computes mult × base rather than base × mult as in the scalar versions. Swapping the two matrix arguments (or swapping which matrix is broadcast and which is loaded column-wise) restores the scalar operand order, with no effect on instruction count or timings.

MultMatrix_SSE2
void	MultMatrix_SSE2( double result[16], const double base[16], const double mult[16] )
{
	__m128d	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
	__m128d	xmm08, xmm09, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;


	xmm08	= _mm_loadu_pd( &mult[0] );
	xmm09	= _mm_loadu_pd( &mult[2] );
	xmm10	= _mm_loadu_pd( &mult[4] );
	xmm11	= _mm_loadu_pd( &mult[6] );
	xmm12	= _mm_loadu_pd( &mult[8] );
	xmm13	= _mm_loadu_pd( &mult[10] );
	xmm14	= _mm_loadu_pd( &mult[12] );
	xmm15	= _mm_loadu_pd( &mult[14] );

	// column0
	xmm0	= _mm_load1_pd( &base[0] );
	xmm2	= _mm_load1_pd( &base[1] );
	xmm4	= _mm_load1_pd( &base[2] );
	xmm6	= _mm_load1_pd( &base[3] );

	xmm1	= _mm_mul_pd( xmm0, xmm09 );
	xmm3	= _mm_mul_pd( xmm2, xmm11 );
	xmm5	= _mm_mul_pd( xmm4, xmm13 );
	xmm7	= _mm_mul_pd( xmm6, xmm15 );
	xmm0	= _mm_mul_pd( xmm0, xmm08 );
	xmm2	= _mm_mul_pd( xmm2, xmm10 );
	xmm4	= _mm_mul_pd( xmm4, xmm12 );
	xmm6	= _mm_mul_pd( xmm6, xmm14 );

	xmm0	= _mm_add_pd( xmm0, xmm4 );
	xmm1	= _mm_add_pd( xmm1, xmm5 );
	xmm2	= _mm_add_pd( xmm2, xmm6 );
	xmm3	= _mm_add_pd( xmm3, xmm7 );
	xmm0	= _mm_add_pd( xmm0, xmm2 );
	xmm1	= _mm_add_pd( xmm1, xmm3 );

	_mm_storeu_pd( &result[0], xmm0 );
	_mm_storeu_pd( &result[2], xmm1 );


	// column1
	xmm0	= _mm_load1_pd( &base[4] );
	xmm2	= _mm_load1_pd( &base[5] );
	xmm4	= _mm_load1_pd( &base[6] );
	xmm6	= _mm_load1_pd( &base[7] );

	xmm1	= _mm_mul_pd( xmm0, xmm09 );
	xmm3	= _mm_mul_pd( xmm2, xmm11 );
	xmm5	= _mm_mul_pd( xmm4, xmm13 );
	xmm7	= _mm_mul_pd( xmm6, xmm15 );
	xmm0	= _mm_mul_pd( xmm0, xmm08 );
	xmm2	= _mm_mul_pd( xmm2, xmm10 );
	xmm4	= _mm_mul_pd( xmm4, xmm12 );
	xmm6	= _mm_mul_pd( xmm6, xmm14 );

	xmm0	= _mm_add_pd( xmm0, xmm4 );
	xmm1	= _mm_add_pd( xmm1, xmm5 );
	xmm2	= _mm_add_pd( xmm2, xmm6 );
	xmm3	= _mm_add_pd( xmm3, xmm7 );
	xmm0	= _mm_add_pd( xmm0, xmm2 );
	xmm1	= _mm_add_pd( xmm1, xmm3 );

	_mm_storeu_pd( &result[4], xmm0 );
	_mm_storeu_pd( &result[6], xmm1 );

	// column2
	xmm0	= _mm_load1_pd( &base[8] );
	xmm2	= _mm_load1_pd( &base[9] );
	xmm4	= _mm_load1_pd( &base[10] );
	xmm6	= _mm_load1_pd( &base[11] );

	xmm1	= _mm_mul_pd( xmm0, xmm09 );
	xmm3	= _mm_mul_pd( xmm2, xmm11 );
	xmm5	= _mm_mul_pd( xmm4, xmm13 );
	xmm7	= _mm_mul_pd( xmm6, xmm15 );
	xmm0	= _mm_mul_pd( xmm0, xmm08 );
	xmm2	= _mm_mul_pd( xmm2, xmm10 );
	xmm4	= _mm_mul_pd( xmm4, xmm12 );
	xmm6	= _mm_mul_pd( xmm6, xmm14 );

	xmm0	= _mm_add_pd( xmm0, xmm4 );
	xmm1	= _mm_add_pd( xmm1, xmm5 );
	xmm2	= _mm_add_pd( xmm2, xmm6 );
	xmm3	= _mm_add_pd( xmm3, xmm7 );
	xmm0	= _mm_add_pd( xmm0, xmm2 );
	xmm1	= _mm_add_pd( xmm1, xmm3 );

	_mm_storeu_pd( &result[8], xmm0 );
	_mm_storeu_pd( &result[10], xmm1 );

	// column3
	xmm0	= _mm_load1_pd( &base[12] );
	xmm2	= _mm_load1_pd( &base[13] );
	xmm4	= _mm_load1_pd( &base[14] );
	xmm6	= _mm_load1_pd( &base[15] );

	xmm1	= _mm_mul_pd( xmm0, xmm09 );
	xmm3	= _mm_mul_pd( xmm2, xmm11 );
	xmm5	= _mm_mul_pd( xmm4, xmm13 );
	xmm7	= _mm_mul_pd( xmm6, xmm15 );
	xmm0	= _mm_mul_pd( xmm0, xmm08 );
	xmm2	= _mm_mul_pd( xmm2, xmm10 );
	xmm4	= _mm_mul_pd( xmm4, xmm12 );
	xmm6	= _mm_mul_pd( xmm6, xmm14 );

	xmm0	= _mm_add_pd( xmm0, xmm4 );
	xmm1	= _mm_add_pd( xmm1, xmm5 );
	xmm2	= _mm_add_pd( xmm2, xmm6 );
	xmm3	= _mm_add_pd( xmm3, xmm7 );
	xmm0	= _mm_add_pd( xmm0, xmm2 );
	xmm1	= _mm_add_pd( xmm1, xmm3 );

	_mm_storeu_pd( &result[12], xmm0 );
	_mm_storeu_pd( &result[14], xmm1 );
}
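
Intrinsics code like this is easy to get subtly wrong, so a quick comparison against the scalar reference is worthwhile. The sketch below checks MultMatrix_SSE2 against MultMatrix_For in both argument orders; given the operand-order note above, the swapped order is the one expected to match.

#include <math.h>
#include <stdio.h>

// Sketch of a correctness check against the scalar reference.
static int	Matches( const double x[16], const double y[16] )
{
	for( int i = 0; i < 16; i++ )
	{
		if( fabs( x[i] - y[i] ) > 1e-9 )
			return 0;
	}
	return 1;
}

static void	CheckSSE2( void )
{
	double	a[16], b[16], expect[16], got1[16], got2[16];

	for( int i = 0; i < 16; i++ )
	{
		a[i]	= 1.0 + i * 0.5;
		b[i]	= 2.0 - i * 0.25;
	}

	MultMatrix_For( expect, a, b );		// reference: expect = a * b
	MultMatrix_SSE2( got1, a, b );		// same argument order as the reference
	MultMatrix_SSE2( got2, b, a );		// swapped argument order

	printf( "same order    : %s\n", Matches( expect, got1 ) ? "match" : "mismatch" );
	printf( "swapped order : %s\n", Matches( expect, got2 ) ? "match" : "mismatch" );
}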

5. Using AVX instructions

This one does four operations per instruction.

MultMatrix_AVX
void	MultMatrix_AVX( double result[16], const double base[16], const double mult[16] )
{
	__m256d	ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

	ymm4	= _mm256_loadu_pd( &mult[0] );
	ymm5	= _mm256_loadu_pd( &mult[4] );
	ymm6	= _mm256_loadu_pd( &mult[8] );
	ymm7	= _mm256_loadu_pd( &mult[12] );

	// column0
	ymm0	= _mm256_broadcast_sd( &base[0] );
	ymm1	= _mm256_broadcast_sd( &base[1] );
	ymm2	= _mm256_broadcast_sd( &base[2] );
	ymm3	= _mm256_broadcast_sd( &base[3] );
	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm1	= _mm256_mul_pd( ymm1, ymm5 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm3	= _mm256_mul_pd( ymm3, ymm7 );

	ymm0	= _mm256_add_pd( ymm0, ymm1 );
	ymm2	= _mm256_add_pd( ymm2, ymm3 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );
	_mm256_storeu_pd( &result[0], ymm0 );

	// column1
	ymm0	= _mm256_broadcast_sd( &base[4] );
	ymm1	= _mm256_broadcast_sd( &base[5] );
	ymm2	= _mm256_broadcast_sd( &base[6] );
	ymm3	= _mm256_broadcast_sd( &base[7] );
	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm1	= _mm256_mul_pd( ymm1, ymm5 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm3	= _mm256_mul_pd( ymm3, ymm7 );

	ymm0	= _mm256_add_pd( ymm0, ymm1 );
	ymm2	= _mm256_add_pd( ymm2, ymm3 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );
	_mm256_storeu_pd( &result[4], ymm0 );

	// column2
	ymm0	= _mm256_broadcast_sd( &base[8] );
	ymm1	= _mm256_broadcast_sd( &base[9] );
	ymm2	= _mm256_broadcast_sd( &base[10] );
	ymm3	= _mm256_broadcast_sd( &base[11] );
	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm1	= _mm256_mul_pd( ymm1, ymm5 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm3	= _mm256_mul_pd( ymm3, ymm7 );

	ymm0	= _mm256_add_pd( ymm0, ymm1 );
	ymm2	= _mm256_add_pd( ymm2, ymm3 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );
	_mm256_storeu_pd( &result[8], ymm0 );

	// column3
	ymm0	= _mm256_broadcast_sd( &base[12] );
	ymm1	= _mm256_broadcast_sd( &base[13] );
	ymm2	= _mm256_broadcast_sd( &base[14] );
	ymm3	= _mm256_broadcast_sd( &base[15] );
	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm1	= _mm256_mul_pd( ymm1, ymm5 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm3	= _mm256_mul_pd( ymm3, ymm7 );

	ymm0	= _mm256_add_pd( ymm0, ymm1 );
	ymm2	= _mm256_add_pd( ymm2, ymm3 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );
	_mm256_storeu_pd( &result[12], ymm0 );
}

6. Using FMA instructions

This is also four doubles per instruction as a baseline, but a single fmadd counts as eight operations (four multiplies plus four adds).

MultMatrix_AVX_FMA
void	MultMatrix_AVX_FMA( double result[16], const double base[16], const double mult[16] )
{
	__m256d	ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

	ymm4	= _mm256_loadu_pd( &mult[0] );
	ymm5	= _mm256_loadu_pd( &mult[4] );
	ymm6	= _mm256_loadu_pd( &mult[8] );
	ymm7	= _mm256_loadu_pd( &mult[12] );

	// column0
	ymm0	= _mm256_broadcast_sd( &base[0] );
	ymm1	= _mm256_broadcast_sd( &base[1] );
	ymm2	= _mm256_broadcast_sd( &base[2] );
	ymm3	= _mm256_broadcast_sd( &base[3] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm2	= _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );

	_mm256_storeu_pd( &result[0], ymm0 );

	// column1
	ymm0	= _mm256_broadcast_sd( &base[4] );
	ymm1	= _mm256_broadcast_sd( &base[5] );
	ymm2	= _mm256_broadcast_sd( &base[6] );
	ymm3	= _mm256_broadcast_sd( &base[7] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm2	= _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );

	_mm256_storeu_pd( &result[4], ymm0 );

	// column2
	ymm0	= _mm256_broadcast_sd( &base[8] );
	ymm1	= _mm256_broadcast_sd( &base[9] );
	ymm2	= _mm256_broadcast_sd( &base[10] );
	ymm3	= _mm256_broadcast_sd( &base[11] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm2	= _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );

	_mm256_storeu_pd( &result[8], ymm0 );

	// column3
	ymm0	= _mm256_broadcast_sd( &base[12] );
	ymm1	= _mm256_broadcast_sd( &base[13] );
	ymm2	= _mm256_broadcast_sd( &base[14] );
	ymm3	= _mm256_broadcast_sd( &base[15] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm2	= _mm256_mul_pd( ymm2, ymm6 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm2	= _mm256_fmadd_pd( ymm3, ymm7, ymm2 );
	ymm0	= _mm256_add_pd( ymm0, ymm2 );

	_mm256_storeu_pd( &result[12], ymm0 );
}
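
The AVX and FMA routines assume that both the CPU and the OS support those instruction sets (the Core i7-6700K used for the measurements below does). For choosing an implementation at runtime, a minimal detection sketch using the MSVC intrinsics __cpuid and _xgetbv could look like this:

// Sketch: runtime detection of AVX / FMA support (MSVC; <intrin.h> is already included).
static bool	HasAvxFma( void )
{
	int	info[4];

	__cpuid( info, 1 );

	const bool	osxsave	= ( info[2] & ( 1 << 27 ) ) != 0;	// OS uses XSAVE/XRSTOR
	const bool	avx		= ( info[2] & ( 1 << 28 ) ) != 0;	// AVX supported by the CPU
	const bool	fma		= ( info[2] & ( 1 << 12 ) ) != 0;	// FMA3 supported by the CPU

	if( !osxsave || !avx || !fma )
		return false;

	// The OS must also save/restore XMM and YMM state (XCR0 bits 1 and 2).
	const unsigned __int64	xcr0	= _xgetbv( 0 );

	return ( xcr0 & 0x6 ) == 0x6;
}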

7. Changing the FMA computation slightly

Let's change how the partial sums are accumulated: each result column is now built up in a single register by a chain of fmadds.

MultMatrix_AVX_FMA_type2
void	MultMatrix_AVX_FMA_type2( double result[16], const double base[16], const double mult[16] )
{
	__m256d	ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;

	ymm4	= _mm256_loadu_pd( &mult[0] );
	ymm5	= _mm256_loadu_pd( &mult[4] );
	ymm6	= _mm256_loadu_pd( &mult[8] );
	ymm7	= _mm256_loadu_pd( &mult[12] );

	// column0
	ymm0	= _mm256_broadcast_sd( &base[0] );
	ymm1	= _mm256_broadcast_sd( &base[1] );
	ymm2	= _mm256_broadcast_sd( &base[2] );
	ymm3	= _mm256_broadcast_sd( &base[3] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

	_mm256_storeu_pd( &result[0], ymm0 );

	// column1
	ymm0	= _mm256_broadcast_sd( &base[4] );
	ymm1	= _mm256_broadcast_sd( &base[5] );
	ymm2	= _mm256_broadcast_sd( &base[6] );
	ymm3	= _mm256_broadcast_sd( &base[7] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

	_mm256_storeu_pd( &result[4], ymm0 );

	// column2
	ymm0	= _mm256_broadcast_sd( &base[8] );
	ymm1	= _mm256_broadcast_sd( &base[9] );
	ymm2	= _mm256_broadcast_sd( &base[10] );
	ymm3	= _mm256_broadcast_sd( &base[11] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

	_mm256_storeu_pd( &result[8], ymm0 );

	// column3
	ymm0	= _mm256_broadcast_sd( &base[12] );
	ymm1	= _mm256_broadcast_sd( &base[13] );
	ymm2	= _mm256_broadcast_sd( &base[14] );
	ymm3	= _mm256_broadcast_sd( &base[15] );

	ymm0	= _mm256_mul_pd( ymm0, ymm4 );
	ymm0	= _mm256_fmadd_pd( ymm1, ymm5, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm2, ymm6, ymm0 );
	ymm0	= _mm256_fmadd_pd( ymm3, ymm7, ymm0 );

	_mm256_storeu_pd( &result[12], ymm0 );
}

Results

Measurement environment:
CPU : Intel Core i7-6700K (stock clocks)
MEM : DDR4-3000 dual channel (overclocked)
OS : Windows 10 Professional 64-bit
IDE : Visual Studio 2015
Compiler options: maximize speed, x64 (amd64)

Execution speed was measured both single-threaded (the CPU then runs at 4.2 GHz) and multi-threaded (4.0 GHz).
The multi-threaded runs use 8 threads in parallel, matching the 4 physical / 8 logical cores.

Gflops is the call rate multiplied by 112, since one 4x4 matrix multiplication is 112 floating-point operations (16 elements × (4 multiplies + 3 additions)).

Func                     | calls/sec @ 1 thread | calls/sec @ 8 threads | Multi-thread scaling | Gflops @ 8 threads
MultMatrix_For           | 55.8M                | 214.6M                | 384%                 | 24.035
MultMatrix_ExR           | 60.5M                | 233.6M                | 386%                 | 26.164
MultMatrix_ExC           | 60.1M                | 233.6M                | 389%                 | 26.166
MultMatrix_SSE2          | 109.6M               | 420.1M                | 383%                 | 47.049
MultMatrix_AVX           | 217.9M               | 837.5M                | 384%                 | 93.803
MultMatrix_AVX_FMA       | 244.9M               | 938.0M                | 383%                 | 105.057
MultMatrix_AVX_FMA_type2 | 258.9M               | 998.4M                | 386%                 | 111.818

The expected multi-thread scaling is
4 cores × 4.0 GHz / 4.2 GHz ≈ 381%.

There is close to a 5× performance gap between the non-SIMD versions and the SIMD ones.
In other words, even when CPU usage reads 100%, the code may be using only about 20% of the CPU's potential.
That is a real waste.
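
For reference, the kind of single-threaded measurement loop behind the numbers above can be sketched as follows; the loop count, the function under test and the input values are placeholders, and the Gflops figure simply applies the 112-operations-per-call factor described earlier.

#include <chrono>
#include <cstdio>

// Sketch of a single-threaded measurement loop (placeholder inputs and loop count).
static void	Benchmark( const char* name, void (*func)( double*, const double*, const double* ) )
{
	double	result[16], base[16], mult[16];

	for( int i = 0; i < 16; i++ )
	{
		base[i]	= 1.0 + i * 0.25;
		mult[i]	= 2.0 - i * 0.125;
	}

	const long long	loops	= 100000000LL;	// tune so one run takes a few seconds

	const auto	start	= std::chrono::steady_clock::now();

	for( long long i = 0; i < loops; i++ )
	{
		func( result, base, mult );		// indirect call discourages the optimizer from removing the work
	}

	const auto	end	= std::chrono::steady_clock::now();

	const double	sec			= std::chrono::duration<double>( end - start ).count();
	const double	callsPerSec	= loops / sec;
	const double	gflops		= callsPerSec * 112.0 / 1e9;

	printf( "%-24s : %6.1fM calls/sec, %7.3f Gflops\n", name, callsPerSec / 1e6, gflops );
}

// Example usage:
//   Benchmark( "MultMatrix_AVX_FMA_type2", MultMatrix_AVX_FMA_type2 );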
