Speeding Up 4x4 Matrix Multiplication ~Part 2~


Overview

This is a sequel to "Speeding Up 4x4 Matrix Multiplication" (Part 1).

The following has been newly added this time:


  • Added a NEON (Arm) version (float only, since double is not available there)

  • float 4x4 matrix multiplication (plain, SSE2, and NEON versions)

  • Added inverse matrix calculation using the closed-form formula (plain and AVX2_FMA versions)

  • Matrix copy measurements (memcpy, assignment, and various SIMD versions)

  • Multi-platform support


Source Code

The full source is on GitHub:

https://github.com/blue777/NanoPi-NEO

The two files relevant to this article are the following.

Only PerfTest_Matrix.cpp needs to be compiled!


  • PerfTest_Matrix.cpp

  • common/multithread_tools.h


Compile Switches

The various instruction-set variants can be enabled or disabled by editing the defines at the top of the file.

Switch them as appropriate for the compiler and execution environment you are using.


PerfTest_Matrix.cpp

//#define ENABLE_SSE2

//#define ENABLE_AVX
//#define ENABLE_AVX_FMA
//#define ENABLE_AVX2_FMA
#define ENABLE_NEON
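
These switches only select which intrinsic code paths get compiled. As a rough illustration (an assumed arrangement, not a quote from PerfTest_Matrix.cpp), the intrinsic headers can be gated like this:

// Illustrative sketch only -- not the actual contents of PerfTest_Matrix.cpp.
// Each ENABLE_* switch pulls in the matching intrinsics header.
#if defined( ENABLE_SSE2 ) || defined( ENABLE_AVX ) || defined( ENABLE_AVX_FMA ) || defined( ENABLE_AVX2_FMA )
#include <immintrin.h>   // x86 SIMD intrinsics (SSE2 / AVX / FMA)
#endif

#ifdef ENABLE_NEON
#include <arm_neon.h>    // Arm NEON intrinsics
#endif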


4x4 float Matrix Multiplication: SSE2 Version


MultMatrixF_SSE2


void MultMatrixF_SSE2( float result[16], const float base[16], const float mult[16] )
{
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    xmm4 = _mm_loadu_ps( &mult[0] );
    xmm5 = _mm_loadu_ps( &mult[4] );
    xmm6 = _mm_loadu_ps( &mult[8] );
    xmm7 = _mm_loadu_ps( &mult[12] );

    // column0
    xmm0 = _mm_load1_ps( &base[0] );
    xmm1 = _mm_load1_ps( &base[1] );
    xmm2 = _mm_load1_ps( &base[2] );
    xmm3 = _mm_load1_ps( &base[3] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[0], xmm0 );

    // column1
    xmm0 = _mm_load1_ps( &base[4] );
    xmm1 = _mm_load1_ps( &base[5] );
    xmm2 = _mm_load1_ps( &base[6] );
    xmm3 = _mm_load1_ps( &base[7] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[4], xmm0 );

    // column2
    xmm0 = _mm_load1_ps( &base[8] );
    xmm1 = _mm_load1_ps( &base[9] );
    xmm2 = _mm_load1_ps( &base[10] );
    xmm3 = _mm_load1_ps( &base[11] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[8], xmm0 );

    // column3
    xmm0 = _mm_load1_ps( &base[12] );
    xmm1 = _mm_load1_ps( &base[13] );
    xmm2 = _mm_load1_ps( &base[14] );
    xmm3 = _mm_load1_ps( &base[15] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[12], xmm0 );
}
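
For reference, a plain scalar version that produces the same result can be sketched as follows. This is only an illustration of the indexing convention the SSE2/NEON code above implements; the MultMatrix_ForC/ForR/ExpC/ExpR functions measured later are the ones in the repository and may be written differently.

MultMatrixF_Plain (reference sketch, not from the repository)

// Scalar reference sketch: result[i*4 + j] = sum over k of base[i*4 + k] * mult[k*4 + j].
void MultMatrixF_Plain( float result[16], const float base[16], const float mult[16] )
{
    for ( int i = 0; i < 4; i++ )
    {
        for ( int j = 0; j < 4; j++ )
        {
            float sum = 0.0f;
            for ( int k = 0; k < 4; k++ )
            {
                sum += base[i * 4 + k] * mult[k * 4 + j];
            }
            result[i * 4 + j] = sum;
        }
    }
}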



4x4 float Matrix Multiplication: NEON Version

A matrix multiply written with roughly the same instruction structure as the SSE2 float version.


MultMatrixF_NEON

void MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0, r1, r2, r3;

    c0 = vld1q_f32( &mult[0] );
    c1 = vld1q_f32( &mult[4] );
    c2 = vld1q_f32( &mult[8] );
    c3 = vld1q_f32( &mult[12] );

    // column 0
    r0 = vmulq_n_f32( c0, base[0] );
    r1 = vmulq_n_f32( c1, base[1] );
    r2 = vmulq_n_f32( c2, base[2] );
    r3 = vmulq_n_f32( c3, base[3] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[0], r0 );

    // column 1
    r0 = vmulq_n_f32( c0, base[4] );
    r1 = vmulq_n_f32( c1, base[5] );
    r2 = vmulq_n_f32( c2, base[6] );
    r3 = vmulq_n_f32( c3, base[7] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[4], r0 );

    // column 2
    r0 = vmulq_n_f32( c0, base[8] );
    r1 = vmulq_n_f32( c1, base[9] );
    r2 = vmulq_n_f32( c2, base[10] );
    r3 = vmulq_n_f32( c3, base[11] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[8], r0 );

    // column 3
    r0 = vmulq_n_f32( c0, base[12] );
    r1 = vmulq_n_f32( c1, base[13] );
    r2 = vmulq_n_f32( c2, base[14] );
    r3 = vmulq_n_f32( c3, base[15] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[12], r0 );
}



4x4 float Matrix Multiplication: NEON Version, Type 2

A version that uses slightly different instructions (multiply-accumulate).

It ends up with fewer instructions, but how will it perform?


MultMatrixF_NEON_type2

void MultMatrixF_NEON_type2( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0, r1, r2, r3;

    c0 = vld1q_f32( &mult[0] );
    c1 = vld1q_f32( &mult[4] );
    c2 = vld1q_f32( &mult[8] );
    c3 = vld1q_f32( &mult[12] );

    // column 0
    r0 = vmulq_n_f32( c0, base[0] );
    r0 = vmlaq_n_f32( r0, c1, base[1] );
    r0 = vmlaq_n_f32( r0, c2, base[2] );
    r0 = vmlaq_n_f32( r0, c3, base[3] );

    vst1q_f32( &result[0], r0 );

    // column 1
    r0 = vmulq_n_f32( c0, base[4] );
    r0 = vmlaq_n_f32( r0, c1, base[5] );
    r0 = vmlaq_n_f32( r0, c2, base[6] );
    r0 = vmlaq_n_f32( r0, c3, base[7] );

    vst1q_f32( &result[4], r0 );

    // column 2
    r0 = vmulq_n_f32( c0, base[8] );
    r0 = vmlaq_n_f32( r0, c1, base[9] );
    r0 = vmlaq_n_f32( r0, c2, base[10] );
    r0 = vmlaq_n_f32( r0, c3, base[11] );

    vst1q_f32( &result[8], r0 );

    // column 3
    r0 = vmulq_n_f32( c0, base[12] );
    r0 = vmlaq_n_f32( r0, c1, base[13] );
    r0 = vmlaq_n_f32( r0, c2, base[14] );
    r0 = vmlaq_n_f32( r0, c3, base[15] );

    vst1q_f32( &result[12], r0 );
}
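
All of these variants compute the same product, so they should agree with the scalar reference to within float rounding. A small illustrative sanity check (it assumes the MultMatrixF_Plain sketch from earlier and is not part of the original benchmark):

// Illustrative sanity check, not from the repository: compare the NEON
// Type 2 result against the scalar reference sketched earlier.
#include <cassert>
#include <cmath>

void CheckNeonType2()
{
    float a[16], b[16], r_ref[16], r_neon[16];
    for ( int i = 0; i < 16; i++ )
    {
        a[i] = float( i + 1 );
        b[i] = float( 16 - i );
    }

    MultMatrixF_Plain( r_ref, a, b );         // scalar reference (sketch above)
    MultMatrixF_NEON_type2( r_neon, a, b );

    for ( int i = 0; i < 16; i++ )
        assert( std::fabs( r_ref[i] - r_neon[i] ) <= 1e-3f * std::fabs( r_ref[i] ) );
}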



Measurement Results

Measured on the machines I have at hand.

There are about six more devices I could measure, but for now only the main environments.
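
The actual measurement harness lives in PerfTest_Matrix.cpp and common/multithread_tools.h; the following is only a minimal sketch of the idea (names and loop counts are illustrative) to show how the "M mul/s" and multi-thread figures are obtained. The Gflops column appears consistent with 112 floating-point operations (64 multiplies + 48 additions) per 4x4 multiply.

// Minimal throughput-measurement sketch -- NOT the actual harness from
// common/multithread_tools.h.
#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

template <typename Func>
double MeasureMulPerSec( Func f, size_t loops )
{
    float a[16] = { 1.0f }, b[16] = { 1.0f }, r[16];
    auto t0 = std::chrono::steady_clock::now();
    for ( size_t i = 0; i < loops; i++ )
    {
        f( r, a, b );
        b[0] = r[0];   // feed the result back so the loop is not optimized away
    }
    auto t1 = std::chrono::steady_clock::now();
    return loops / std::chrono::duration<double>( t1 - t0 ).count();
}

int main()
{
    const size_t loops = 10000000;

    // single thread
    double st = MeasureMulPerSec( MultMatrixF_NEON, loops );

    // multi thread: run the same loop on every hardware thread and sum the rates
    unsigned n = std::thread::hardware_concurrency();
    std::vector<std::thread> workers;
    std::vector<double>      rates( n );
    for ( unsigned i = 0; i < n; i++ )
        workers.emplace_back( [&rates, i, loops] { rates[i] = MeasureMulPerSec( MultMatrixF_NEON, loops ); } );
    for ( auto& w : workers )
        w.join();
    double mt = 0.0;
    for ( double v : rates )
        mt += v;

    // one 4x4 multiply = 64 mul + 48 add = 112 floating-point operations
    std::printf( "ST %.1f M mul/s, MT %.1f M mul/s, %.3f Gflops, MT/ST %.0f %%\n",
                 st / 1e6, mt / 1e6, mt * 112.0 / 1e9, 100.0 * mt / st );
}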


NanoPi-NEO

A relative of the Raspberry Pi.

CPU : Allwinner H3 (Cortex-A7 1.2GHz x 4)

MEM : 512 MB

OS : Ubuntu 16.04 LTS 32bit (Official rom image, kernel=3.4.39)

Compile Command : g++ -O3 -pthread -std=c++11 -mfpu=neon PerfTest_Matrix.cpp

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 2.3 M mul/s   | 9.0 M mul/s   | 1.006 Gflops    | 396 %
MultMatrix_ForR<float>     | 2.3 M mul/s   | 9.2 M mul/s   | 1.029 Gflops    | 400 %
MultMatrix_ExpC<float>     | 2.9 M mul/s   | 11.4 M mul/s  | 1.274 Gflops    | 393 %
MultMatrix_ExpR<float>     | 2.9 M mul/s   | 11.6 M mul/s  | 1.296 Gflops    | 401 %
MultMatrixF_NEON           | 5.8 M mul/s   | 22.5 M mul/s  | 2.518 Gflops    | 385 %
MultMatrixF_NEON_type2     | 6.0 M mul/s   | 23.0 M mul/s  | 2.579 Gflops    | 386 %
MultMatrix_ForC<double>    | 1.7 M mul/s   | 6.7 M mul/s   | 0.749 Gflops    | 403 %
MultMatrix_ForR<double>    | 1.7 M mul/s   | 6.7 M mul/s   | 0.747 Gflops    | 400 %
MultMatrix_ExpC<double>    | 2.0 M mul/s   | 8.1 M mul/s   | 0.907 Gflops    | 397 %
MultMatrix_ExpR<double>    | 2.0 M mul/s   | 8.1 M mul/s   | 0.912 Gflops    | 400 %
CalcInverse<float>         | 1.9 M inv/s   | 7.5 M inv/s   | 1.842 Gflops    | 399 %
CalcInverse<double>        | 1.2 M inv/s   | 4.6 M inv/s   | 1.131 Gflops    | 397 %
CopyMatrix_memcpy<double>  | 13.6 M cpy/s  | 55.8 M cpy/s  | 7.140 GB/sec    | 411 %
CopyMatrix_Expand<double>  | 27.7 M cpy/s  | 111.1 M cpy/s | 14.224 GB/sec   | 401 %
CopyMatrixD_NEON           | 21.0 M cpy/s  | 83.9 M cpy/s  | 10.738 GB/sec   | 400 %


NanoPi NEO2

A relative of the Raspberry Pi.

CPU : Allwinner H5 (Cortex-A53 1.0 GHz * 4)

MEM : 512 MB

OS : Ubuntu 16.04 LTS 64bit (Official rom image, kernel=4.11.2)

Compile Command : g++ -std=c++11 -pthread -O3 PerfTest_Matrix.cpp

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 5.8 M mul/s   | 24.4 M mul/s  | 2.729 Gflops    | 421 %
MultMatrix_ForR<float>     | 2.5 M mul/s   | 10.1 M mul/s  | 1.134 Gflops    | 397 %
MultMatrix_ExpC<float>     | 2.7 M mul/s   | 11.2 M mul/s  | 1.253 Gflops    | 413 %
MultMatrix_ExpR<float>     | 2.8 M mul/s   | 11.2 M mul/s  | 1.253 Gflops    | 404 %
MultMatrixF_NEON           | 10.2 M mul/s  | 41.2 M mul/s  | 4.619 Gflops    | 406 %
MultMatrixF_NEON_type2     | 9.1 M mul/s   | 36.7 M mul/s  | 4.110 Gflops    | 403 %
MultMatrix_ForC<double>    | 2.6 M mul/s   | 10.2 M mul/s  | 1.144 Gflops    | 397 %
MultMatrix_ForR<double>    | 2.3 M mul/s   | 9.3 M mul/s   | 1.042 Gflops    | 404 %
MultMatrix_ExpC<double>    | 2.6 M mul/s   | 10.2 M mul/s  | 1.140 Gflops    | 398 %
MultMatrix_ExpR<double>    | 2.5 M mul/s   | 10.2 M mul/s  | 1.139 Gflops    | 403 %
CalcInverse<float>         | 2.1 M inv/s   | 8.6 M inv/s   | 2.115 Gflops    | 403 %
CalcInverse<double>        | 1.8 M inv/s   | 7.5 M inv/s   | 1.854 Gflops    | 409 %
CopyMatrix_memcpy<double>  | 20.1 M cpy/s  | 79.8 M cpy/s  | 10.209 GB/sec   | 397 %
CopyMatrix_Expand<double>  | 29.5 M cpy/s  | 117.2 M cpy/s | 14.998 GB/sec   | 397 %
CopyMatrixD_NEON           | 23.3 M cpy/s  | 92.7 M cpy/s  | 11.869 GB/sec   | 397 %


NanoPi NEO4

A relative of the Raspberry Pi.

CPU : RockChip RK3399 (Cortex-A72 1.8GHz x2 + Cortex-A53 1.4GHz x4)

MEM : 1024 MB

OS : Ubuntu 18.04 64bit (Official rom image FriendlyDesktop)

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 27.6 M mul/s  | 92.9 M mul/s  | 10.407 Gflops   | 336 %
MultMatrix_ForR<float>     | 18.4 M mul/s  | 50.6 M mul/s  | 5.671 Gflops    | 275 %
MultMatrix_ExpC<float>     | 16.0 M mul/s  | 48.6 M mul/s  | 5.442 Gflops    | 304 %
MultMatrix_ExpR<float>     | 15.8 M mul/s  | 48.4 M mul/s  | 5.416 Gflops    | 306 %
MultMatrixF_NEON           | 53.2 M mul/s  | 169.8 M mul/s | 19.022 Gflops   | 319 %
MultMatrixF_NEON_type2     | 39.7 M mul/s  | 132.3 M mul/s | 14.815 Gflops   | 333 %
MultMatrix_ForC<double>    | 12.7 M mul/s  | 40.6 M mul/s  | 4.553 Gflops    | 320 %
MultMatrix_ForR<double>    | 16.9 M mul/s  | 52.0 M mul/s  | 5.819 Gflops    | 307 %
MultMatrix_ExpC<double>    | 14.9 M mul/s  | 45.3 M mul/s  | 5.068 Gflops    | 303 %
MultMatrix_ExpR<double>    | 15.1 M mul/s  | 45.5 M mul/s  | 5.091 Gflops    | 302 %
CalcInverse<float>         | 10.2 M inv/s  | 39.5 M inv/s  | 9.766 Gflops    | 388 %
CalcInverse<double>        | 4.7 M inv/s   | 23.0 M inv/s  | 5.683 Gflops    | 492 %
CopyMatrix_memcpy<double>  | 81.1 M cpy/s  | 286.2 M cpy/s | 36.628 GB/sec   | 353 %
CopyMatrix_Expand<double>  | 104.4 M cpy/s | 320.5 M cpy/s | 41.025 GB/sec   | 307 %
CopyMatrixD_NEON           | 112.0 M cpy/s | 353.5 M cpy/s | 45.249 GB/sec   | 316 %


Jetson Nano

NVIDIA's single-board computer for deep learning.

For some reason the pin layout seems to follow the Raspberry Pi, so it could be considered a Raspberry Pi relative as well.

CPU : ARM Cortex-A57 (quad-core) @ 1.43GHz

MEM : 4096 MB

OS : The Jetson-specific OS image.

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 20.6 M mul/s  | 82.0 M mul/s  | 9.179 Gflops    | 397 %
MultMatrix_ForR<float>     | 9.8 M mul/s   | 38.8 M mul/s  | 4.347 Gflops    | 397 %
MultMatrix_ExpC<float>     | 10.0 M mul/s  | 39.5 M mul/s  | 4.429 Gflops    | 397 %
MultMatrix_ExpR<float>     | 10.0 M mul/s  | 39.5 M mul/s  | 4.427 Gflops    | 397 %
MultMatrixF_NEON           | 39.9 M mul/s  | 158.8 M mul/s | 17.780 Gflops   | 398 %
MultMatrixF_NEON_type2     | 27.9 M mul/s  | 111.1 M mul/s | 12.441 Gflops   | 398 %
MultMatrix_ForC<double>    | 8.0 M mul/s   | 31.9 M mul/s  | 3.573 Gflops    | 398 %
MultMatrix_ForR<double>    | 9.7 M mul/s   | 38.7 M mul/s  | 4.333 Gflops    | 397 %
MultMatrix_ExpC<double>    | 9.8 M mul/s   | 39.0 M mul/s  | 4.370 Gflops    | 397 %
MultMatrix_ExpR<double>    | 9.7 M mul/s   | 38.7 M mul/s  | 4.333 Gflops    | 397 %
CalcInverse<float>         | 6.0 M inv/s   | 24.0 M inv/s  | 5.937 Gflops    | 400 %
CalcInverse<double>        | 2.3 M inv/s   | 9.0 M inv/s   | 2.232 Gflops    | 397 %
CopyMatrix_memcpy<double>  | 55.5 M cpy/s  | 229.7 M cpy/s | 29.398 GB/sec   | 414 %
CopyMatrix_Expand<double>  | 72.7 M cpy/s  | 306.2 M cpy/s | 39.194 GB/sec   | 421 %
CopyMatrixD_NEON           | 74.8 M cpy/s  | 311.5 M cpy/s | 39.872 GB/sec   | 416 %


Intel Atom z3795

CPU : Intel Atom z3795 (2.4 GHz? x 4)

MEM : 4 GB

OS : Windows10 Professional (32bit)

Compile: Visual Studio 2015, optimize for speed

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 7.4 M mul/s   | 28.1 M mul/s  | 3.143 Gflops    | 378 %
MultMatrix_ForR<float>     | 7.5 M mul/s   | 28.3 M mul/s  | 3.169 Gflops    | 378 %
MultMatrix_ExpC<float>     | 10.0 M mul/s  | 37.4 M mul/s  | 4.190 Gflops    | 376 %
MultMatrix_ExpR<float>     | 9.9 M mul/s   | 37.4 M mul/s  | 4.191 Gflops    | 376 %
MultMatrixF_SSE2           | 26.8 M mul/s  | 100.8 M mul/s | 11.292 Gflops   | 377 %
MultMatrix_ForC<double>    | 7.0 M mul/s   | 26.4 M mul/s  | 2.951 Gflops    | 375 %
MultMatrix_ForR<double>    | 7.1 M mul/s   | 26.7 M mul/s  | 2.988 Gflops    | 375 %
MultMatrix_ExpC<double>    | 8.8 M mul/s   | 32.8 M mul/s  | 3.677 Gflops    | 374 %
MultMatrix_ExpR<double>    | 8.8 M mul/s   | 33.0 M mul/s  | 3.700 Gflops    | 376 %
MultMatrixD_SSE2           | 11.0 M mul/s  | 40.9 M mul/s  | 4.576 Gflops    | 373 %
CalcInverse<float>         | 3.5 M inv/s   | 13.1 M inv/s  | 3.230 Gflops    | 378 %
CalcInverse<double>        | 2.5 M inv/s   | 9.5 M inv/s   | 2.350 Gflops    | 377 %
CopyMatrix_memcpy<double>  | 24.9 M cpy/s  | 86.8 M cpy/s  | 11.110 GB/sec   | 348 %
CopyMatrix_Expand<double>  | 48.8 M cpy/s  | 184.3 M cpy/s | 23.585 GB/sec   | 378 %
CopyMatrixD_SSE2           | 87.8 M cpy/s  | 316.7 M cpy/s | 40.540 GB/sec   | 361 %


Intel Pentium 4415Y @ 1.6GHz

PC : Microsoft SurfaceGo

CPU : Intel Pentium 4415Y @ 1.6GHz (1.6 GHz? x 2)

MEM : 8 GB

OS : Windows10 Home (64bit)

Compile: Visual Studio 2017, optimize for speed

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 19.7 M mul/s  | 38.1 M mul/s  | 4.266 Gflops    | 194 %
MultMatrix_ForR<float>     | 19.6 M mul/s  | 37.8 M mul/s  | 4.233 Gflops    | 193 %
MultMatrix_ExpC<float>     | 19.5 M mul/s  | 38.8 M mul/s  | 4.351 Gflops    | 199 %
MultMatrix_ExpR<float>     | 19.7 M mul/s  | 39.5 M mul/s  | 4.420 Gflops    | 200 %
MultMatrixF_SSE2           | 78.7 M mul/s  | 157.2 M mul/s | 17.601 Gflops   | 200 %
MultMatrix_ForC<double>    | 19.5 M mul/s  | 37.0 M mul/s  | 4.147 Gflops    | 190 %
MultMatrix_ForR<double>    | 19.3 M mul/s  | 38.0 M mul/s  | 4.261 Gflops    | 197 %
MultMatrix_ExpC<double>    | 19.6 M mul/s  | 37.5 M mul/s  | 4.197 Gflops    | 191 %
MultMatrix_ExpR<double>    | 19.6 M mul/s  | 39.1 M mul/s  | 4.379 Gflops    | 200 %
MultMatrixD_SSE2           | 41.9 M mul/s  | 83.5 M mul/s  | 9.355 Gflops    | 199 %
CalcInverse<float>         | 11.9 M inv/s  | 27.2 M inv/s  | 6.715 Gflops    | 228 %
CalcInverse<double>        | 10.5 M inv/s  | 25.9 M inv/s  | 6.394 Gflops    | 246 %
CopyMatrix_memcpy<double>  | 157.9 M cpy/s | 274.7 M cpy/s | 35.167 GB/sec   | 174 %
CopyMatrix_Expand<double>  | 93.1 M cpy/s  | 186.5 M cpy/s | 23.868 GB/sec   | 200 %
CopyMatrixD_SSE2           | 158.3 M cpy/s | 315.2 M cpy/s | 40.342 GB/sec   | 199 %


Intel Core i7-6700K

CPU : Intel Core i7-6700K (stock clocks, Single=4.2GHz, Multi=4.0GHz, 8 logical / 4 physical cores)

MEM : 16 GB (DDR4-3000 dual channel, overclocked)

OS : Windows10 Professional (64bit)

Compile: Visual Studio 2015, optimize for speed

Function name              | Single Thread  | Multi Thread   | Performance(MT) | MT/ST
---------------------------|----------------|----------------|-----------------|------
MultMatrix_ForC<float>     | 55.5 M mul/s   | 215.7 M mul/s  | 24.160 Gflops   | 389 %
MultMatrix_ForR<float>     | 55.6 M mul/s   | 214.4 M mul/s  | 24.017 Gflops   | 385 %
MultMatrix_ExpC<float>     | 60.6 M mul/s   | 232.6 M mul/s  | 26.049 Gflops   | 384 %
MultMatrix_ExpR<float>     | 59.5 M mul/s   | 232.4 M mul/s  | 26.028 Gflops   | 391 %
MultMatrixF_SSE2           | 207.2 M mul/s  | 794.2 M mul/s  | 88.955 Gflops   | 383 %
MultMatrix_ForC<double>    | 55.9 M mul/s   | 213.0 M mul/s  | 23.861 Gflops   | 381 %
MultMatrix_ForR<double>    | 55.8 M mul/s   | 215.7 M mul/s  | 24.163 Gflops   | 387 %
MultMatrix_ExpC<double>    | 60.4 M mul/s   | 229.3 M mul/s  | 25.687 Gflops   | 380 %
MultMatrix_ExpR<double>    | 60.4 M mul/s   | 230.8 M mul/s  | 25.854 Gflops   | 382 %
MultMatrixD_SSE2           | 110.6 M mul/s  | 422.0 M mul/s  | 47.266 Gflops   | 381 %
MultMatrixD_AVX            | 220.9 M mul/s  | 859.0 M mul/s  | 96.203 Gflops   | 389 %
MultMatrixD_AVX_FMA        | 247.0 M mul/s  | 959.3 M mul/s  | 107.445 Gflops  | 388 %
MultMatrixD_AVX_FMA_type2  | 266.1 M mul/s  | 1025.5 M mul/s | 114.857 Gflops  | 385 %
CalcInverse<float>         | 30.7 M inv/s   | 133.3 M inv/s  | 32.928 Gflops   | 434 %
CalcInverse<double>        | 27.1 M inv/s   | 130.0 M inv/s  | 32.109 Gflops   | 480 %
CalcInverseD_AVX2_FMA      | 61.8 M inv/s   | 262.6 M inv/s  | 64.873 Gflops   | 425 %
CopyMatrix_memcpy<double>  | 461.7 M cpy/s  | 1670.1 M cpy/s | 213.766 GB/sec  | 362 %
CopyMatrix_Expand<double>  | 240.3 M cpy/s  | 923.4 M cpy/s  | 118.198 GB/sec  | 384 %
CopyMatrixD_SSE2           | 462.2 M cpy/s  | 1775.9 M cpy/s | 227.311 GB/sec  | 384 %
CopyMatrixD_AVX            | 652.0 M cpy/s  | 2290.2 M cpy/s | 293.147 GB/sec  | 351 %


Closing Remarks

A summary of the overall findings from these experiments.


  • With g++ -O3 on ARM, the compiler sometimes auto-vectorizes the code and delivers a dramatic speedup (e.g. MultMatrix_ForC on the NanoPi NEO2).

  • However, there are also cases where vectorization does not work out and performance stays low (e.g. MultMatrix_ForR on the NanoPi NEO2).

  • Hand-written NEON is fast after all (heh).

  • For small copies, direct assignment is often faster than memcpy (see the sketch below).

  • The Arm Cortex-A53 and the Atom z3795 appear to deliver roughly the same performance per clock.

  • The performance gap between the i7-6700K and the Cortex-A53 is roughly 20x.

  • The inverse matrix calculation (closed-form version) does not get that much faster even with SIMD...
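
Regarding the memcpy-vs-assignment point, the following is a minimal sketch of what "direct assignment" means here; the actual CopyMatrix_Expand in the repository may be written differently.

// Illustrative sketches only -- not the repository's CopyMatrix_* functions.
#include <cstring>

void CopyMatrix_memcpy_sketch( double dst[16], const double src[16] )
{
    std::memcpy( dst, src, sizeof( double ) * 16 );
}

void CopyMatrix_Expand_sketch( double dst[16], const double src[16] )
{
    // Fully unrolled element-by-element assignment; for a fixed 16-element
    // copy the compiler can emit a handful of wide loads/stores without the
    // call overhead and size handling of memcpy.
    dst[0]  = src[0];   dst[1]  = src[1];   dst[2]  = src[2];   dst[3]  = src[3];
    dst[4]  = src[4];   dst[5]  = src[5];   dst[6]  = src[6];   dst[7]  = src[7];
    dst[8]  = src[8];   dst[9]  = src[9];   dst[10] = src[10];  dst[11] = src[11];
    dst[12] = src[12];  dst[13] = src[13];  dst[14] = src[14];  dst[15] = src[15];
}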