
Speeding Up 4x4 Matrix Multiplication ~Part 2~

Posted at 2017-07-10

Overview

The sequel to "Speeding Up 4x4 Matrix Multiplication" (Part 1).

This installment adds the following:

  • A NEON (Arm) version. (NEON here can't handle double, so float only.)
  • float 4x4 matrix multiplication (plain, SSE2, and NEON versions)
  • Inverse matrix calculation using the closed-form formula (plain and AVX2_FMA versions)
  • Matrix copy measurements (memcpy, assignment, and various SIMD versions)
  • Multi-platform support

Source Code

The whole tree is on git:
https://github.com/blue777/NanoPi-NEO

The two files relevant this time are listed below; compiling PerfTest_Matrix.cpp alone is all you need!

  • PerfTest_Matrix.cpp
  • common/multithread_tools.h (threading helper; an illustrative sketch follows the list)
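
The contents of common/multithread_tools.h aren't reproduced in this article. Purely as an illustration of what such a helper does (the name and signature below are hypothetical, NOT the actual header's API), a multithreaded throughput measurement can be as small as this:

MeasureOpsPerSec_MT (illustrative sketch)
#include <atomic>
#include <chrono>
#include <thread>
#include <vector>

// Hypothetical sketch, not the actual common/multithread_tools.h API:
// run the workload on every hardware thread for a fixed duration and
// report the combined operations per second.
template <class Func>
double	MeasureOpsPerSec_MT( Func func, double seconds = 1.0 )
{
	unsigned					threads	= std::thread::hardware_concurrency();
	std::atomic<long long>		total( 0 );
	std::vector<std::thread>	pool;

	for ( unsigned i = 0; i < (threads ? threads : 1); i++ )
	{
		pool.emplace_back( [&]()
		{
			auto		end		= std::chrono::steady_clock::now() + std::chrono::duration<double>( seconds );
			long long	count	= 0;

			while ( std::chrono::steady_clock::now() < end )
			{
				func();		// one 4x4 multiply, inverse, copy, ...
				count++;
			}
			total	+= count;
		} );
	}

	for ( auto& t : pool )
		t.join();

	return	total.load() / seconds;
}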

Compile Switches

The various instruction-set paths can be enabled or disabled by editing the top of the file.
Switch them to match your compiler and target environment.

PerfTest_Matrix.cpp
//#define ENABLE_SSE2
//#define ENABLE_AVX
//#define ENABLE_AVX_FMA
//#define ENABLE_AVX2_FMA
#define ENABLE_NEON
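
A minimal sketch of how switches like these typically gate the rest of the file (illustrative only; the actual gating in PerfTest_Matrix.cpp may differ): each define pulls in the matching intrinsics header and compiles the corresponding functions.

// Illustrative gating sketch, not copied from PerfTest_Matrix.cpp.
#if defined(ENABLE_SSE2) || defined(ENABLE_AVX) || defined(ENABLE_AVX_FMA) || defined(ENABLE_AVX2_FMA)
#include <immintrin.h>	// SSE2/AVX/FMA intrinsics
#endif
#ifdef ENABLE_NEON
#include <arm_neon.h>	// NEON intrinsics
#endif

#ifdef ENABLE_SSE2
void	MultMatrixF_SSE2( float result[16], const float base[16], const float mult[16] );
#endif
#ifdef ENABLE_NEON
void	MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] );
#endif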

4x4 float matrix multiplication: SSE2 version

MultMatrixF_SSE2

void	MultMatrixF_SSE2( float result[16], const float base[16], const float mult[16] )
{
	__m128	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	xmm4	= _mm_loadu_ps( &mult[0] );
	xmm5	= _mm_loadu_ps( &mult[4] );
	xmm6	= _mm_loadu_ps( &mult[8] );
	xmm7	= _mm_loadu_ps( &mult[12] );

	// column0
	xmm0	= _mm_load1_ps( &base[0] );
	xmm1	= _mm_load1_ps( &base[1] );
	xmm2	= _mm_load1_ps( &base[2] );
	xmm3	= _mm_load1_ps( &base[3] );

	xmm0	= _mm_mul_ps( xmm0, xmm4 );
	xmm1	= _mm_mul_ps( xmm1, xmm5 );
	xmm2	= _mm_mul_ps( xmm2, xmm6 );
	xmm3	= _mm_mul_ps( xmm3, xmm7 );

	xmm0	= _mm_add_ps( xmm0, xmm1 );
	xmm2	= _mm_add_ps( xmm2, xmm3 );
	xmm0	= _mm_add_ps( xmm0, xmm2 );

	_mm_storeu_ps( &result[0], xmm0 );

	// column1
	xmm0	= _mm_load1_ps( &base[4] );
	xmm1	= _mm_load1_ps( &base[5] );
	xmm2	= _mm_load1_ps( &base[6] );
	xmm3	= _mm_load1_ps( &base[7] );

	xmm0	= _mm_mul_ps( xmm0, xmm4 );
	xmm1	= _mm_mul_ps( xmm1, xmm5 );
	xmm2	= _mm_mul_ps( xmm2, xmm6 );
	xmm3	= _mm_mul_ps( xmm3, xmm7 );

	xmm0	= _mm_add_ps( xmm0, xmm1 );
	xmm2	= _mm_add_ps( xmm2, xmm3 );
	xmm0	= _mm_add_ps( xmm0, xmm2 );

	_mm_storeu_ps( &result[4], xmm0 );

	// column2
	xmm0	= _mm_load1_ps( &base[8] );
	xmm1	= _mm_load1_ps( &base[9] );
	xmm2	= _mm_load1_ps( &base[10] );
	xmm3	= _mm_load1_ps( &base[11] );

	xmm0	= _mm_mul_ps( xmm0, xmm4 );
	xmm1	= _mm_mul_ps( xmm1, xmm5 );
	xmm2	= _mm_mul_ps( xmm2, xmm6 );
	xmm3	= _mm_mul_ps( xmm3, xmm7 );

	xmm0	= _mm_add_ps( xmm0, xmm1 );
	xmm2	= _mm_add_ps( xmm2, xmm3 );
	xmm0	= _mm_add_ps( xmm0, xmm2 );

	_mm_storeu_ps( &result[8], xmm0 );

	// column3
	xmm0	= _mm_load1_ps( &base[12] );
	xmm1	= _mm_load1_ps( &base[13] );
	xmm2	= _mm_load1_ps( &base[14] );
	xmm3	= _mm_load1_ps( &base[15] );

	xmm0	= _mm_mul_ps( xmm0, xmm4 );
	xmm1	= _mm_mul_ps( xmm1, xmm5 );
	xmm2	= _mm_mul_ps( xmm2, xmm6 );
	xmm3	= _mm_mul_ps( xmm3, xmm7 );

	xmm0	= _mm_add_ps( xmm0, xmm1 );
	xmm2	= _mm_add_ps( xmm2, xmm3 );
	xmm0	= _mm_add_ps( xmm0, xmm2 );

	_mm_storeu_ps( &result[12], xmm0 );
}
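
For reference, here is the convention the SIMD code implements: matrices are stored column-major (element (row r, column c) at index c*4+r), and each column of result is a linear combination of the columns of mult, weighted by the corresponding column of base. A plain-C equivalent (the function name is mine, not from the article's source):

MultMatrixF_Scalar (reference sketch)
// Scalar equivalent of MultMatrixF_SSE2, for clarity and for checking results.
void	MultMatrixF_Scalar( float result[16], const float base[16], const float mult[16] )
{
	for ( int c = 0; c < 4; c++ )
	{
		for ( int r = 0; r < 4; r++ )
		{
			result[c*4+r]	= mult[ 0+r] * base[c*4+0]
							+ mult[ 4+r] * base[c*4+1]
							+ mult[ 8+r] * base[c*4+2]
							+ mult[12+r] * base[c*4+3];
		}
	}
}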

4x4 float matrix multiplication: NEON version

A matrix multiply written with roughly the same instruction structure as the SSE2 float version.

MultMatrixF_NEON
void	MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] )
{
	float32x4_t	c0, c1, c2, c3;
	float32x4_t	r0, r1, r2, r3;

	c0	= vld1q_f32( &mult[0] );
	c1	= vld1q_f32( &mult[4] );
	c2	= vld1q_f32( &mult[8] );
	c3	= vld1q_f32( &mult[12] );

	// column 0
	r0	= vmulq_n_f32( c0, base[0] ); 
	r1	= vmulq_n_f32( c1, base[1] ); 
	r2	= vmulq_n_f32( c2, base[2] ); 
	r3	= vmulq_n_f32( c3, base[3] ); 
	
	r0	= vaddq_f32( r0, r1 );
	r2	= vaddq_f32( r2, r3 );
	r0	= vaddq_f32( r0, r2 );

	vst1q_f32( &result[0], r0 );

	// column 1
	r0	= vmulq_n_f32( c0, base[4] ); 
	r1	= vmulq_n_f32( c1, base[5] ); 
	r2	= vmulq_n_f32( c2, base[6] ); 
	r3	= vmulq_n_f32( c3, base[7] ); 
	
	r0	= vaddq_f32( r0, r1 );
	r2	= vaddq_f32( r2, r3 );
	r0	= vaddq_f32( r0, r2 );

	vst1q_f32( &result[4], r0 );

	// column 2
	r0	= vmulq_n_f32( c0, base[8] ); 
	r1	= vmulq_n_f32( c1, base[9] ); 
	r2	= vmulq_n_f32( c2, base[10] ); 
	r3	= vmulq_n_f32( c3, base[11] ); 
	
	r0	= vaddq_f32( r0, r1 );
	r2	= vaddq_f32( r2, r3 );
	r0	= vaddq_f32( r0, r2 );

	vst1q_f32( &result[8], r0 );

	// column 3
	r0	= vmulq_n_f32( c0, base[12] ); 
	r1	= vmulq_n_f32( c1, base[13] ); 
	r2	= vmulq_n_f32( c2, base[14] ); 
	r3	= vmulq_n_f32( c3, base[15] ); 
	
	r0	= vaddq_f32( r0, r1 );
	r2	= vaddq_f32( r2, r3 );
	r0	= vaddq_f32( r0, r2 );

	vst1q_f32( &result[12], r0 );
}
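
A quick sanity check that a SIMD variant matches the scalar reference above can look like this (assumes the MultMatrixF_Scalar sketch; the test values are arbitrary):

CheckMultMatrixF_NEON (illustrative sketch)
#include <cmath>
#include <cstdio>

// Compare the NEON result against the scalar reference element by element.
bool	CheckMultMatrixF_NEON()
{
	float	base[16], mult[16], r_neon[16], r_ref[16];

	for ( int i = 0; i < 16; i++ )
	{
		base[i]	= (float)(i + 1);
		mult[i]	= (float)(16 - i);
	}

	MultMatrixF_NEON( r_neon, base, mult );
	MultMatrixF_Scalar( r_ref, base, mult );

	for ( int i = 0; i < 16; i++ )
	{
		if ( std::fabs( r_neon[i] - r_ref[i] ) > 1e-3f )
		{
			printf( "mismatch at %d: %f vs %f\n", i, r_neon[i], r_ref[i] );
			return	false;
		}
	}
	return	true;
}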

4x4 float matrix multiplication: NEON version, Type 2

A version using slightly different instructions.
Fewer lines of code; let's see how it fares.

MultMatrixF_NEON_type2
void	MultMatrixF_NEON_type2( float result[16], const float base[16], const float mult[16] )
{
	float32x4_t	c0, c1, c2, c3;
	float32x4_t	r0, r1, r2, r3;

	c0	= vld1q_f32( &mult[0] );
	c1	= vld1q_f32( &mult[4] );
	c2	= vld1q_f32( &mult[8] );
	c3	= vld1q_f32( &mult[12] );

	// column 0
	r0	= vmulq_n_f32(     c0, base[0] ); 
	r0	= vmlaq_n_f32( r0, c1, base[1] );
	r0	= vmlaq_n_f32( r0, c2, base[2] );
	r0	= vmlaq_n_f32( r0, c3, base[3] );

	vst1q_f32( &result[0], r0 );

	// column 1
	r0	= vmulq_n_f32(     c0, base[4] ); 
	r0	= vmlaq_n_f32( r0, c1, base[5] );
	r0	= vmlaq_n_f32( r0, c2, base[6] );
	r0	= vmlaq_n_f32( r0, c3, base[7] );

	vst1q_f32( &result[4], r0 );

	// column 2
	r0	= vmulq_n_f32(     c0, base[8] ); 
	r0	= vmlaq_n_f32( r0, c1, base[9] );
	r0	= vmlaq_n_f32( r0, c2, base[10] );
	r0	= vmlaq_n_f32( r0, c3, base[11] );

	vst1q_f32( &result[8], r0 );

	// column 3
	r0	= vmulq_n_f32(     c0, base[12] ); 
	r0	= vmlaq_n_f32( r0, c1, base[13] );
	r0	= vmlaq_n_f32( r0, c2, base[14] );
	r0	= vmlaq_n_f32( r0, c3, base[15] );

	vst1q_f32( &result[12], r0 );
}
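
On AArch64, a compiler will usually fuse vmlaq_n_f32 into fmla instructions anyway, but the fused form can also be written out explicitly with the by-lane FMA intrinsics. An AArch64-only sketch (not part of the article's source; mathematically equivalent to the Type 2 version above):

MultMatrixF_NEON_fma (AArch64-only sketch)
#include <arm_neon.h>

void	MultMatrixF_NEON_fma( float result[16], const float base[16], const float mult[16] )
{
	float32x4_t	c0	= vld1q_f32( &mult[0] );
	float32x4_t	c1	= vld1q_f32( &mult[4] );
	float32x4_t	c2	= vld1q_f32( &mult[8] );
	float32x4_t	c3	= vld1q_f32( &mult[12] );

	for ( int col = 0; col < 4; col++ )
	{
		float32x4_t	b	= vld1q_f32( &base[col*4] );	// one column of base
		float32x4_t	r	= vmulq_laneq_f32( c0, b, 0 );	// c0 * b[0]
		r	= vfmaq_laneq_f32( r, c1, b, 1 );			// r += c1 * b[1]
		r	= vfmaq_laneq_f32( r, c2, b, 2 );			// r += c2 * b[2]
		r	= vfmaq_laneq_f32( r, c3, b, 3 );			// r += c3 * b[3]
		vst1q_f32( &result[col*4], r );
	}
}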

Measurement Results

Measured on the machines I have on hand.
There are about six more I could test, but for now, just the main environments.
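
How the Performance(MT) column appears to be derived (my reading of the numbers; the source doesn't state it): one 4x4 multiply costs 64 multiplications + 48 additions = 112 flops, so e.g. 9.0 M mul/s × 112 ≈ 1.008 Gflops, and one 4x4 double matrix is 16 × 8 B = 128 B, so 55.8 M cpy/s × 128 B ≈ 7.14 GB/sec; both match the NanoPi-NEO rows below up to rounding of the displayed rates.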

NanoPi-NEO

A Raspberry Pi relative.

CPU : Allwinner H3 (Cortex-A7 1.2GHz x 4)
MEM : 512 MB
OS : Ubuntu 16.04 LTS 32bit (Official rom image, kernel=3.4.39)

Compile Command : g++ -O3 -pthread -std=c++11 -mfpu=neon PerfTest_Matrix.cpp

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 2.3 M mul/s | 9.0 M mul/s | 1.006 Gflops | 396 % |
| MultMatrix_ForR<float> | 2.3 M mul/s | 9.2 M mul/s | 1.029 Gflops | 400 % |
| MultMatrix_ExpC<float> | 2.9 M mul/s | 11.4 M mul/s | 1.274 Gflops | 393 % |
| MultMatrix_ExpR<float> | 2.9 M mul/s | 11.6 M mul/s | 1.296 Gflops | 401 % |
| MultMatrixF_NEON | 5.8 M mul/s | 22.5 M mul/s | 2.518 Gflops | 385 % |
| MultMatrixF_NEON_type2 | 6.0 M mul/s | 23.0 M mul/s | 2.579 Gflops | 386 % |
| MultMatrix_ForC<double> | 1.7 M mul/s | 6.7 M mul/s | 0.749 Gflops | 403 % |
| MultMatrix_ForR<double> | 1.7 M mul/s | 6.7 M mul/s | 0.747 Gflops | 400 % |
| MultMatrix_ExpC<double> | 2.0 M mul/s | 8.1 M mul/s | 0.907 Gflops | 397 % |
| MultMatrix_ExpR<double> | 2.0 M mul/s | 8.1 M mul/s | 0.912 Gflops | 400 % |
| CalcInverse<float> | 1.9 M inv/s | 7.5 M inv/s | 1.842 Gflops | 399 % |
| CalcInverse<double> | 1.2 M inv/s | 4.6 M inv/s | 1.131 Gflops | 397 % |
| CopyMatrix_memcpy<double> | 13.6 M cpy/s | 55.8 M cpy/s | 7.140 GB/sec | 411 % |
| CopyMatrix_Expand<double> | 27.7 M cpy/s | 111.1 M cpy/s | 14.224 GB/sec | 401 % |
| CopyMatrixD_NEON | 21.0 M cpy/s | 83.9 M cpy/s | 10.738 GB/sec | 400 % |

NanoPi NEO2

A Raspberry Pi relative.

CPU : Allwinner H5 (Cortex-A53 1.0 GHz * 4)
MEM : 512 MB
OS : Ubuntu 16.04 LTS 64bit (Official rom image, kernel=4.11.2)

Compile Command : g++ -std=c++11 -pthread -O3 PerfTest_Matrix.cpp

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 5.8 M mul/s | 24.4 M mul/s | 2.729 Gflops | 421 % |
| MultMatrix_ForR<float> | 2.5 M mul/s | 10.1 M mul/s | 1.134 Gflops | 397 % |
| MultMatrix_ExpC<float> | 2.7 M mul/s | 11.2 M mul/s | 1.253 Gflops | 413 % |
| MultMatrix_ExpR<float> | 2.8 M mul/s | 11.2 M mul/s | 1.253 Gflops | 404 % |
| MultMatrixF_NEON | 10.2 M mul/s | 41.2 M mul/s | 4.619 Gflops | 406 % |
| MultMatrixF_NEON_type2 | 9.1 M mul/s | 36.7 M mul/s | 4.110 Gflops | 403 % |
| MultMatrix_ForC<double> | 2.6 M mul/s | 10.2 M mul/s | 1.144 Gflops | 397 % |
| MultMatrix_ForR<double> | 2.3 M mul/s | 9.3 M mul/s | 1.042 Gflops | 404 % |
| MultMatrix_ExpC<double> | 2.6 M mul/s | 10.2 M mul/s | 1.140 Gflops | 398 % |
| MultMatrix_ExpR<double> | 2.5 M mul/s | 10.2 M mul/s | 1.139 Gflops | 403 % |
| CalcInverse<float> | 2.1 M inv/s | 8.6 M inv/s | 2.115 Gflops | 403 % |
| CalcInverse<double> | 1.8 M inv/s | 7.5 M inv/s | 1.854 Gflops | 409 % |
| CopyMatrix_memcpy<double> | 20.1 M cpy/s | 79.8 M cpy/s | 10.209 GB/sec | 397 % |
| CopyMatrix_Expand<double> | 29.5 M cpy/s | 117.2 M cpy/s | 14.998 GB/sec | 397 % |
| CopyMatrixD_NEON | 23.3 M cpy/s | 92.7 M cpy/s | 11.869 GB/sec | 397 % |

NanoPi NEO4

A Raspberry Pi relative.

CPU : RockChip RK3399 (Cortex-A72 1.8GHz x2 + Cortex-A53 1.4GHz x4)
MEM : 1024 MB
OS : Ubuntu 18.04 64bit (Official rom image FriendlyDesktop)

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 27.6 M mul/s | 92.9 M mul/s | 10.407 Gflops | 336 % |
| MultMatrix_ForR<float> | 18.4 M mul/s | 50.6 M mul/s | 5.671 Gflops | 275 % |
| MultMatrix_ExpC<float> | 16.0 M mul/s | 48.6 M mul/s | 5.442 Gflops | 304 % |
| MultMatrix_ExpR<float> | 15.8 M mul/s | 48.4 M mul/s | 5.416 Gflops | 306 % |
| MultMatrixF_NEON | 53.2 M mul/s | 169.8 M mul/s | 19.022 Gflops | 319 % |
| MultMatrixF_NEON_type2 | 39.7 M mul/s | 132.3 M mul/s | 14.815 Gflops | 333 % |
| MultMatrix_ForC<double> | 12.7 M mul/s | 40.6 M mul/s | 4.553 Gflops | 320 % |
| MultMatrix_ForR<double> | 16.9 M mul/s | 52.0 M mul/s | 5.819 Gflops | 307 % |
| MultMatrix_ExpC<double> | 14.9 M mul/s | 45.3 M mul/s | 5.068 Gflops | 303 % |
| MultMatrix_ExpR<double> | 15.1 M mul/s | 45.5 M mul/s | 5.091 Gflops | 302 % |
| CalcInverse<float> | 10.2 M inv/s | 39.5 M inv/s | 9.766 Gflops | 388 % |
| CalcInverse<double> | 4.7 M inv/s | 23.0 M inv/s | 5.683 Gflops | 492 % |
| CopyMatrix_memcpy<double> | 81.1 M cpy/s | 286.2 M cpy/s | 36.628 GB/sec | 353 % |
| CopyMatrix_Expand<double> | 104.4 M cpy/s | 320.5 M cpy/s | 41.025 GB/sec | 307 % |
| CopyMatrixD_NEON | 112.0 M cpy/s | 353.5 M cpy/s | 45.249 GB/sec | 316 % |

Rock Pi S

CPU : ARM Cortex-A35 (quad-core) @ ?.?? GHz
MEM : 512 MB
OS : Debian

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 4.4 M mul/s | 17.8 M mul/s | 1.991 Gflops | 400 % |
| MultMatrix_ForR<float> | 2.2 M mul/s | 8.8 M mul/s | 0.990 Gflops | 400 % |
| MultMatrix_ExpC<float> | 2.5 M mul/s | 9.9 M mul/s | 1.106 Gflops | 400 % |
| MultMatrix_ExpR<float> | 2.5 M mul/s | 9.8 M mul/s | 1.102 Gflops | 399 % |
| MultMatrixF_NEON | 8.8 M mul/s | 35.1 M mul/s | 3.927 Gflops | 400 % |
| MultMatrixF_NEON_type2 | 7.9 M mul/s | 31.5 M mul/s | 3.526 Gflops | 400 % |
| MultMatrix_ForC<double> | 2.4 M mul/s | 9.8 M mul/s | 1.093 Gflops | 400 % |
| MultMatrix_ForR<double> | 2.2 M mul/s | 8.8 M mul/s | 0.989 Gflops | 400 % |
| MultMatrix_ExpC<double> | 2.5 M mul/s | 9.8 M mul/s | 1.103 Gflops | 400 % |
| MultMatrix_ExpR<double> | 2.5 M mul/s | 9.8 M mul/s | 1.103 Gflops | 400 % |
| CalcInverse<float> | 1.7 M inv/s | 6.9 M inv/s | 1.694 Gflops | 400 % |
| CalcInverse<double> | 1.4 M inv/s | 5.5 M inv/s | 1.355 Gflops | 400 % |
| CopyMatrix_memcpy<double> | 17.8 M cpy/s | 71.3 M cpy/s | 9.121 GB/sec | 400 % |
| CopyMatrix_Expand<double> | 19.2 M cpy/s | 76.8 M cpy/s | 9.828 GB/sec | 400 % |
| CopyMatrixD_NEON | 27.7 M cpy/s | 110.8 M cpy/s | 14.187 GB/sec | 400 % |

Jetson Nano

NVIDIA's single-board computer for deep learning.
For some reason its pin layout follows the Raspberry Pi's, so it might count as a Pi relative too.

CPU : ARM Cortex-A57 (quad-core) @ 1.43GHz
MEM : 4096 MB
OS : The official Jetson image.

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 20.6 M mul/s | 82.0 M mul/s | 9.179 Gflops | 397 % |
| MultMatrix_ForR<float> | 9.8 M mul/s | 38.8 M mul/s | 4.347 Gflops | 397 % |
| MultMatrix_ExpC<float> | 10.0 M mul/s | 39.5 M mul/s | 4.429 Gflops | 397 % |
| MultMatrix_ExpR<float> | 10.0 M mul/s | 39.5 M mul/s | 4.427 Gflops | 397 % |
| MultMatrixF_NEON | 39.9 M mul/s | 158.8 M mul/s | 17.780 Gflops | 398 % |
| MultMatrixF_NEON_type2 | 27.9 M mul/s | 111.1 M mul/s | 12.441 Gflops | 398 % |
| MultMatrix_ForC<double> | 8.0 M mul/s | 31.9 M mul/s | 3.573 Gflops | 398 % |
| MultMatrix_ForR<double> | 9.7 M mul/s | 38.7 M mul/s | 4.333 Gflops | 397 % |
| MultMatrix_ExpC<double> | 9.8 M mul/s | 39.0 M mul/s | 4.370 Gflops | 397 % |
| MultMatrix_ExpR<double> | 9.7 M mul/s | 38.7 M mul/s | 4.333 Gflops | 397 % |
| CalcInverse<float> | 6.0 M inv/s | 24.0 M inv/s | 5.937 Gflops | 400 % |
| CalcInverse<double> | 2.3 M inv/s | 9.0 M inv/s | 2.232 Gflops | 397 % |
| CopyMatrix_memcpy<double> | 55.5 M cpy/s | 229.7 M cpy/s | 29.398 GB/sec | 414 % |
| CopyMatrix_Expand<double> | 72.7 M cpy/s | 306.2 M cpy/s | 39.194 GB/sec | 421 % |
| CopyMatrixD_NEON | 74.8 M cpy/s | 311.5 M cpy/s | 39.872 GB/sec | 416 % |

Intel Atom z3795

CPU : Intel Atom z3795 (2.4 GHz? x 4)
MEM : 4 GB
OS : Windows10 Professional (32bit)

Compile : VisualStudio 2015, optimize for speed

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 7.4 M mul/s | 28.1 M mul/s | 3.143 Gflops | 378 % |
| MultMatrix_ForR<float> | 7.5 M mul/s | 28.3 M mul/s | 3.169 Gflops | 378 % |
| MultMatrix_ExpC<float> | 10.0 M mul/s | 37.4 M mul/s | 4.190 Gflops | 376 % |
| MultMatrix_ExpR<float> | 9.9 M mul/s | 37.4 M mul/s | 4.191 Gflops | 376 % |
| MultMatrixF_SSE2 | 26.8 M mul/s | 100.8 M mul/s | 11.292 Gflops | 377 % |
| MultMatrix_ForC<double> | 7.0 M mul/s | 26.4 M mul/s | 2.951 Gflops | 375 % |
| MultMatrix_ForR<double> | 7.1 M mul/s | 26.7 M mul/s | 2.988 Gflops | 375 % |
| MultMatrix_ExpC<double> | 8.8 M mul/s | 32.8 M mul/s | 3.677 Gflops | 374 % |
| MultMatrix_ExpR<double> | 8.8 M mul/s | 33.0 M mul/s | 3.700 Gflops | 376 % |
| MultMatrixD_SSE2 | 11.0 M mul/s | 40.9 M mul/s | 4.576 Gflops | 373 % |
| CalcInverse<float> | 3.5 M inv/s | 13.1 M inv/s | 3.230 Gflops | 378 % |
| CalcInverse<double> | 2.5 M inv/s | 9.5 M inv/s | 2.350 Gflops | 377 % |
| CopyMatrix_memcpy<double> | 24.9 M cpy/s | 86.8 M cpy/s | 11.110 GB/sec | 348 % |
| CopyMatrix_Expand<double> | 48.8 M cpy/s | 184.3 M cpy/s | 23.585 GB/sec | 378 % |
| CopyMatrixD_SSE2 | 87.8 M cpy/s | 316.7 M cpy/s | 40.540 GB/sec | 361 % |

Intel Pentium 4415Y @ 1.6GHz

PC : Microsoft SurfaceGo
CPU : Intel Pentium 4415Y @ 1.6GHz (1.6 GHz? x 2)
MEM : 8 GB
OS : Windows10 Home (64bit)

Compile : VisualStudio 2017, optimize for speed

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 19.7 M mul/s | 38.1 M mul/s | 4.266 Gflops | 194 % |
| MultMatrix_ForR<float> | 19.6 M mul/s | 37.8 M mul/s | 4.233 Gflops | 193 % |
| MultMatrix_ExpC<float> | 19.5 M mul/s | 38.8 M mul/s | 4.351 Gflops | 199 % |
| MultMatrix_ExpR<float> | 19.7 M mul/s | 39.5 M mul/s | 4.420 Gflops | 200 % |
| MultMatrixF_SSE2 | 78.7 M mul/s | 157.2 M mul/s | 17.601 Gflops | 200 % |
| MultMatrix_ForC<double> | 19.5 M mul/s | 37.0 M mul/s | 4.147 Gflops | 190 % |
| MultMatrix_ForR<double> | 19.3 M mul/s | 38.0 M mul/s | 4.261 Gflops | 197 % |
| MultMatrix_ExpC<double> | 19.6 M mul/s | 37.5 M mul/s | 4.197 Gflops | 191 % |
| MultMatrix_ExpR<double> | 19.6 M mul/s | 39.1 M mul/s | 4.379 Gflops | 200 % |
| MultMatrixD_SSE2 | 41.9 M mul/s | 83.5 M mul/s | 9.355 Gflops | 199 % |
| CalcInverse<float> | 11.9 M inv/s | 27.2 M inv/s | 6.715 Gflops | 228 % |
| CalcInverse<double> | 10.5 M inv/s | 25.9 M inv/s | 6.394 Gflops | 246 % |
| CopyMatrix_memcpy<double> | 157.9 M cpy/s | 274.7 M cpy/s | 35.167 GB/sec | 174 % |
| CopyMatrix_Expand<double> | 93.1 M cpy/s | 186.5 M cpy/s | 23.868 GB/sec | 200 % |
| CopyMatrixD_SSE2 | 158.3 M cpy/s | 315.2 M cpy/s | 40.342 GB/sec | 199 % |

Intel Core i7-6700K

CPU : Intel Core i7-6700K (stock clocks: Single=4.2GHz, Multi=4.0GHz; Logical=8core, Physical=4core)
MEM : 16GB (DDR4-3000 DualChannel, Overclocked)
OS : Windows10 Professional (64bit)

Compile : VisualStudio 2015, optimize for speed

| Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 55.5 M mul/s | 215.7 M mul/s | 24.160 Gflops | 389 % |
| MultMatrix_ForR<float> | 55.6 M mul/s | 214.4 M mul/s | 24.017 Gflops | 385 % |
| MultMatrix_ExpC<float> | 60.6 M mul/s | 232.6 M mul/s | 26.049 Gflops | 384 % |
| MultMatrix_ExpR<float> | 59.5 M mul/s | 232.4 M mul/s | 26.028 Gflops | 391 % |
| MultMatrixF_SSE2 | 207.2 M mul/s | 794.2 M mul/s | 88.955 Gflops | 383 % |
| MultMatrix_ForC<double> | 55.9 M mul/s | 213.0 M mul/s | 23.861 Gflops | 381 % |
| MultMatrix_ForR<double> | 55.8 M mul/s | 215.7 M mul/s | 24.163 Gflops | 387 % |
| MultMatrix_ExpC<double> | 60.4 M mul/s | 229.3 M mul/s | 25.687 Gflops | 380 % |
| MultMatrix_ExpR<double> | 60.4 M mul/s | 230.8 M mul/s | 25.854 Gflops | 382 % |
| MultMatrixD_SSE2 | 110.6 M mul/s | 422.0 M mul/s | 47.266 Gflops | 381 % |
| MultMatrixD_AVX | 220.9 M mul/s | 859.0 M mul/s | 96.203 Gflops | 389 % |
| MultMatrixD_AVX_FMA | 247.0 M mul/s | 959.3 M mul/s | 107.445 Gflops | 388 % |
| MultMatrixD_AVX_FMA_type2 | 266.1 M mul/s | 1025.5 M mul/s | 114.857 Gflops | 385 % |
| CalcInverse<float> | 30.7 M inv/s | 133.3 M inv/s | 32.928 Gflops | 434 % |
| CalcInverse<double> | 27.1 M inv/s | 130.0 M inv/s | 32.109 Gflops | 480 % |
| CalcInverseD_AVX2_FMA | 61.8 M inv/s | 262.6 M inv/s | 64.873 Gflops | 425 % |
| CopyMatrix_memcpy<double> | 461.7 M cpy/s | 1670.1 M cpy/s | 213.766 GB/sec | 362 % |
| CopyMatrix_Expand<double> | 240.3 M cpy/s | 923.4 M cpy/s | 118.198 GB/sec | 384 % |
| CopyMatrixD_SSE2 | 462.2 M cpy/s | 1775.9 M cpy/s | 227.311 GB/sec | 384 % |
| CopyMatrixD_AVX | 652.0 M cpy/s | 2290.2 M cpy/s | 293.147 GB/sec | 351 % |

Closing Remarks

A summary of the findings across all the experiments:

  • On ARM, g++'s -O3 can auto-vectorize code and sometimes yields a striking speedup (MultMatrix_ForC on the NanoPi NEO2).
  • That said, vectorization can also fail to kick in, leaving performance on the table (MultMatrix_ForR on the NanoPi NEO2).
  • Hand-written NEON is, as expected, fast. :)
  • For small copies, direct assignment is often faster than memcpy (see the sketch below).
  • The Arm Cortex-A53 and the Atom z3795 look nearly identical in per-clock performance.
  • The i7-6700K is roughly 20x faster than the Cortex-A53.
  • Inverse-matrix calculation (formula version) doesn't get that much faster even with SIMD...
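
As an illustration of the memcpy-vs-assignment point above, here is a fully unrolled assignment copy in the spirit of CopyMatrix_Expand (a sketch; the actual implementation in the source may differ):

CopyMatrixD_Expand_sketch (illustrative sketch)
// For a fixed 128-byte copy the compiler can reduce this to straight
// register moves, skipping memcpy's size/alignment dispatch overhead.
void	CopyMatrixD_Expand_sketch( double dst[16], const double src[16] )
{
	dst[ 0] = src[ 0];	dst[ 1] = src[ 1];	dst[ 2] = src[ 2];	dst[ 3] = src[ 3];
	dst[ 4] = src[ 4];	dst[ 5] = src[ 5];	dst[ 6] = src[ 6];	dst[ 7] = src[ 7];
	dst[ 8] = src[ 8];	dst[ 9] = src[ 9];	dst[10] = src[10];	dst[11] = src[11];
	dst[12] = src[12];	dst[13] = src[13];	dst[14] = src[14];	dst[15] = src[15];
}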