Speeding Up 4x4 Matrix Multiplication ~Part 2~


Overview

This is a sequel to "Speeding Up 4x4 Matrix Multiplication" (Part 1).

The following has been newly added this time:


  • Added a NEON (Arm) version (float only, since double is not available there)

  • float 4x4 matrix multiplication (plain, SSE2, and NEON versions)

  • Added inverse matrix calculation using the closed-form formula (plain and AVX2_FMA versions)

  • Matrix copy measurements (memcpy, assignment, and various SIMD versions)

  • Multi-platform support


Source Code

The full source is on GitHub:

https://github.com/blue777/NanoPi-NEO

The two files relevant to this article are the following.

Only PerfTest_Matrix.cpp needs to be compiled!


  • PerfTest_Matrix.cpp

  • common/multithread_tools.h


Compile Switches

The various instruction-set variants can be enabled or disabled by editing the defines at the top of the file.

Switch them as appropriate for the compiler and execution environment you are using.


PerfTest_Matrix.cpp

//#define ENABLE_SSE2

//#define ENABLE_AVX
//#define ENABLE_AVX_FMA
//#define ENABLE_AVX2_FMA
#define ENABLE_NEON
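
These switches only select which intrinsic code paths get compiled. As a rough illustration (an assumed arrangement, not a quote from PerfTest_Matrix.cpp), the intrinsic headers can be gated like this:

// Illustrative sketch only -- not the actual contents of PerfTest_Matrix.cpp.
// Each ENABLE_* switch pulls in the matching intrinsics header.
#if defined( ENABLE_SSE2 ) || defined( ENABLE_AVX ) || defined( ENABLE_AVX_FMA ) || defined( ENABLE_AVX2_FMA )
#include <immintrin.h>   // x86 SIMD intrinsics (SSE2 / AVX / FMA)
#endif

#ifdef ENABLE_NEON
#include <arm_neon.h>    // Arm NEON intrinsics
#endif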


4x4 float Matrix Multiplication: SSE2 Version


MultMatrixF_SSE2


void MultMatrixF_SSE2( float result[16], const float base[16], const float mult[16] )
{
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    xmm4 = _mm_loadu_ps( &mult[0] );
    xmm5 = _mm_loadu_ps( &mult[4] );
    xmm6 = _mm_loadu_ps( &mult[8] );
    xmm7 = _mm_loadu_ps( &mult[12] );

    // column0
    xmm0 = _mm_load1_ps( &base[0] );
    xmm1 = _mm_load1_ps( &base[1] );
    xmm2 = _mm_load1_ps( &base[2] );
    xmm3 = _mm_load1_ps( &base[3] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[0], xmm0 );

    // column1
    xmm0 = _mm_load1_ps( &base[4] );
    xmm1 = _mm_load1_ps( &base[5] );
    xmm2 = _mm_load1_ps( &base[6] );
    xmm3 = _mm_load1_ps( &base[7] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[4], xmm0 );

    // column2
    xmm0 = _mm_load1_ps( &base[8] );
    xmm1 = _mm_load1_ps( &base[9] );
    xmm2 = _mm_load1_ps( &base[10] );
    xmm3 = _mm_load1_ps( &base[11] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[8], xmm0 );

    // column3
    xmm0 = _mm_load1_ps( &base[12] );
    xmm1 = _mm_load1_ps( &base[13] );
    xmm2 = _mm_load1_ps( &base[14] );
    xmm3 = _mm_load1_ps( &base[15] );

    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );

    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[12], xmm0 );
}
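
For reference, a plain scalar version that produces the same result can be sketched as follows. This is only an illustration of the indexing convention the SSE2/NEON code above implements; the MultMatrix_ForC/ForR/ExpC/ExpR functions measured later are the ones in the repository and may be written differently.

MultMatrixF_Plain (reference sketch, not from the repository)

// Scalar reference sketch: result[i*4 + j] = sum over k of base[i*4 + k] * mult[k*4 + j].
void MultMatrixF_Plain( float result[16], const float base[16], const float mult[16] )
{
    for ( int i = 0; i < 4; i++ )
    {
        for ( int j = 0; j < 4; j++ )
        {
            float sum = 0.0f;
            for ( int k = 0; k < 4; k++ )
            {
                sum += base[i * 4 + k] * mult[k * 4 + j];
            }
            result[i * 4 + j] = sum;
        }
    }
}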



4x4 float Matrix Multiplication: NEON Version

A matrix multiply written with roughly the same instruction structure as the SSE2 float version.


MultMatrixF_NEON

void MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0, r1, r2, r3;

    c0 = vld1q_f32( &mult[0] );
    c1 = vld1q_f32( &mult[4] );
    c2 = vld1q_f32( &mult[8] );
    c3 = vld1q_f32( &mult[12] );

    // column 0
    r0 = vmulq_n_f32( c0, base[0] );
    r1 = vmulq_n_f32( c1, base[1] );
    r2 = vmulq_n_f32( c2, base[2] );
    r3 = vmulq_n_f32( c3, base[3] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[0], r0 );

    // column 1
    r0 = vmulq_n_f32( c0, base[4] );
    r1 = vmulq_n_f32( c1, base[5] );
    r2 = vmulq_n_f32( c2, base[6] );
    r3 = vmulq_n_f32( c3, base[7] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[4], r0 );

    // column 2
    r0 = vmulq_n_f32( c0, base[8] );
    r1 = vmulq_n_f32( c1, base[9] );
    r2 = vmulq_n_f32( c2, base[10] );
    r3 = vmulq_n_f32( c3, base[11] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[8], r0 );

    // column 3
    r0 = vmulq_n_f32( c0, base[12] );
    r1 = vmulq_n_f32( c1, base[13] );
    r2 = vmulq_n_f32( c2, base[14] );
    r3 = vmulq_n_f32( c3, base[15] );

    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );

    vst1q_f32( &result[12], r0 );
}



4x4 float Matrix Multiplication: NEON Version, Type 2

A version that uses slightly different instructions (multiply-accumulate).

It ends up with fewer instructions, but how will it perform?


MultMatrixF_NEON_type2

void MultMatrixF_NEON_type2( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0, r1, r2, r3;

    c0 = vld1q_f32( &mult[0] );
    c1 = vld1q_f32( &mult[4] );
    c2 = vld1q_f32( &mult[8] );
    c3 = vld1q_f32( &mult[12] );

    // column 0
    r0 = vmulq_n_f32( c0, base[0] );
    r0 = vmlaq_n_f32( r0, c1, base[1] );
    r0 = vmlaq_n_f32( r0, c2, base[2] );
    r0 = vmlaq_n_f32( r0, c3, base[3] );

    vst1q_f32( &result[0], r0 );

    // column 1
    r0 = vmulq_n_f32( c0, base[4] );
    r0 = vmlaq_n_f32( r0, c1, base[5] );
    r0 = vmlaq_n_f32( r0, c2, base[6] );
    r0 = vmlaq_n_f32( r0, c3, base[7] );

    vst1q_f32( &result[4], r0 );

    // column 2
    r0 = vmulq_n_f32( c0, base[8] );
    r0 = vmlaq_n_f32( r0, c1, base[9] );
    r0 = vmlaq_n_f32( r0, c2, base[10] );
    r0 = vmlaq_n_f32( r0, c3, base[11] );

    vst1q_f32( &result[8], r0 );

    // column 3
    r0 = vmulq_n_f32( c0, base[12] );
    r0 = vmlaq_n_f32( r0, c1, base[13] );
    r0 = vmlaq_n_f32( r0, c2, base[14] );
    r0 = vmlaq_n_f32( r0, c3, base[15] );

    vst1q_f32( &result[12], r0 );
}
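
All of these variants compute the same product, so they should agree with the scalar reference to within float rounding. A small illustrative sanity check (it assumes the MultMatrixF_Plain sketch from earlier and is not part of the original benchmark):

// Illustrative sanity check, not from the repository: compare the NEON
// Type 2 result against the scalar reference sketched earlier.
#include <cassert>
#include <cmath>

void CheckNeonType2()
{
    float a[16], b[16], r_ref[16], r_neon[16];
    for ( int i = 0; i < 16; i++ )
    {
        a[i] = float( i + 1 );
        b[i] = float( 16 - i );
    }

    MultMatrixF_Plain( r_ref, a, b );         // scalar reference (sketch above)
    MultMatrixF_NEON_type2( r_neon, a, b );

    for ( int i = 0; i < 16; i++ )
        assert( std::fabs( r_ref[i] - r_neon[i] ) <= 1e-3f * std::fabs( r_ref[i] ) );
}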



Measurement Results

Measured on the machines I have at hand.

There are about six more devices I could measure, but for now only the main environments.
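
The actual measurement harness lives in PerfTest_Matrix.cpp and common/multithread_tools.h; the following is only a minimal sketch of the idea (names and loop counts are illustrative) to show how the "M mul/s" and multi-thread figures are obtained. The Gflops column appears consistent with 112 floating-point operations (64 multiplies + 48 additions) per 4x4 multiply.

// Minimal throughput-measurement sketch -- NOT the actual harness from
// common/multithread_tools.h.
#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

template <typename Func>
double MeasureMulPerSec( Func f, size_t loops )
{
    float a[16] = { 1.0f }, b[16] = { 1.0f }, r[16];
    auto t0 = std::chrono::steady_clock::now();
    for ( size_t i = 0; i < loops; i++ )
    {
        f( r, a, b );
        b[0] = r[0];   // feed the result back so the loop is not optimized away
    }
    auto t1 = std::chrono::steady_clock::now();
    return loops / std::chrono::duration<double>( t1 - t0 ).count();
}

int main()
{
    const size_t loops = 10000000;

    // single thread
    double st = MeasureMulPerSec( MultMatrixF_NEON, loops );

    // multi thread: run the same loop on every hardware thread and sum the rates
    unsigned n = std::thread::hardware_concurrency();
    std::vector<std::thread> workers;
    std::vector<double>      rates( n );
    for ( unsigned i = 0; i < n; i++ )
        workers.emplace_back( [&rates, i, loops] { rates[i] = MeasureMulPerSec( MultMatrixF_NEON, loops ); } );
    for ( auto& w : workers )
        w.join();
    double mt = 0.0;
    for ( double v : rates )
        mt += v;

    // one 4x4 multiply = 64 mul + 48 add = 112 floating-point operations
    std::printf( "ST %.1f M mul/s, MT %.1f M mul/s, %.3f Gflops, MT/ST %.0f %%\n",
                 st / 1e6, mt / 1e6, mt * 112.0 / 1e9, 100.0 * mt / st );
}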


NanoPi-NEO

A relative of the Raspberry Pi.

CPU : Allwinner H3 (Cortex-A7 1.2GHz x 4)

MEM : 512 MB

OS : Ubuntu 16.04 LTS 32bit (Official rom image, kernel=3.4.39)

Compile Command : g++ -O3 -pthread -std=c++11 -mfpu=neon PerfTest_Matrix.cpp

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 2.3 M mul/s   | 9.0 M mul/s   | 1.006 Gflops    | 396 %
MultMatrix_ForR<float>     | 2.3 M mul/s   | 9.2 M mul/s   | 1.029 Gflops    | 400 %
MultMatrix_ExpC<float>     | 2.9 M mul/s   | 11.4 M mul/s  | 1.274 Gflops    | 393 %
MultMatrix_ExpR<float>     | 2.9 M mul/s   | 11.6 M mul/s  | 1.296 Gflops    | 401 %
MultMatrixF_NEON           | 5.8 M mul/s   | 22.5 M mul/s  | 2.518 Gflops    | 385 %
MultMatrixF_NEON_type2     | 6.0 M mul/s   | 23.0 M mul/s  | 2.579 Gflops    | 386 %
MultMatrix_ForC<double>    | 1.7 M mul/s   | 6.7 M mul/s   | 0.749 Gflops    | 403 %
MultMatrix_ForR<double>    | 1.7 M mul/s   | 6.7 M mul/s   | 0.747 Gflops    | 400 %
MultMatrix_ExpC<double>    | 2.0 M mul/s   | 8.1 M mul/s   | 0.907 Gflops    | 397 %
MultMatrix_ExpR<double>    | 2.0 M mul/s   | 8.1 M mul/s   | 0.912 Gflops    | 400 %
CalcInverse<float>         | 1.9 M inv/s   | 7.5 M inv/s   | 1.842 Gflops    | 399 %
CalcInverse<double>        | 1.2 M inv/s   | 4.6 M inv/s   | 1.131 Gflops    | 397 %
CopyMatrix_memcpy<double>  | 13.6 M cpy/s  | 55.8 M cpy/s  | 7.140 GB/sec    | 411 %
CopyMatrix_Expand<double>  | 27.7 M cpy/s  | 111.1 M cpy/s | 14.224 GB/sec   | 401 %
CopyMatrixD_NEON           | 21.0 M cpy/s  | 83.9 M cpy/s  | 10.738 GB/sec   | 400 %


NanoPi NEO2

A relative of the Raspberry Pi.

CPU : Allwinner H5 (Cortex-A53 1.0 GHz * 4)

MEM : 512 MB

OS : Ubuntu 16.04 LTS 64bit (Official rom image, kernel=4.11.2)

Compile Command : g++ -std=c++11 -pthread -O3 PerfTest_Matrix.cpp

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 5.8 M mul/s   | 24.4 M mul/s  | 2.729 Gflops    | 421 %
MultMatrix_ForR<float>     | 2.5 M mul/s   | 10.1 M mul/s  | 1.134 Gflops    | 397 %
MultMatrix_ExpC<float>     | 2.7 M mul/s   | 11.2 M mul/s  | 1.253 Gflops    | 413 %
MultMatrix_ExpR<float>     | 2.8 M mul/s   | 11.2 M mul/s  | 1.253 Gflops    | 404 %
MultMatrixF_NEON           | 10.2 M mul/s  | 41.2 M mul/s  | 4.619 Gflops    | 406 %
MultMatrixF_NEON_type2     | 9.1 M mul/s   | 36.7 M mul/s  | 4.110 Gflops    | 403 %
MultMatrix_ForC<double>    | 2.6 M mul/s   | 10.2 M mul/s  | 1.144 Gflops    | 397 %
MultMatrix_ForR<double>    | 2.3 M mul/s   | 9.3 M mul/s   | 1.042 Gflops    | 404 %
MultMatrix_ExpC<double>    | 2.6 M mul/s   | 10.2 M mul/s  | 1.140 Gflops    | 398 %
MultMatrix_ExpR<double>    | 2.5 M mul/s   | 10.2 M mul/s  | 1.139 Gflops    | 403 %
CalcInverse<float>         | 2.1 M inv/s   | 8.6 M inv/s   | 2.115 Gflops    | 403 %
CalcInverse<double>        | 1.8 M inv/s   | 7.5 M inv/s   | 1.854 Gflops    | 409 %
CopyMatrix_memcpy<double>  | 20.1 M cpy/s  | 79.8 M cpy/s  | 10.209 GB/sec   | 397 %
CopyMatrix_Expand<double>  | 29.5 M cpy/s  | 117.2 M cpy/s | 14.998 GB/sec   | 397 %
CopyMatrixD_NEON           | 23.3 M cpy/s  | 92.7 M cpy/s  | 11.869 GB/sec   | 397 %


NanoPi NEO4

A relative of the Raspberry Pi.

CPU : RockChip RK3399 (Cortex-A72 1.8GHz x2 + Cortex-A53 1.4GHz x4)

MEM : 1024 MB

OS : Ubuntu 18.04 64bit (Official rom image FriendlyDesktop)

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 27.6 M mul/s  | 92.9 M mul/s  | 10.407 Gflops   | 336 %
MultMatrix_ForR<float>     | 18.4 M mul/s  | 50.6 M mul/s  | 5.671 Gflops    | 275 %
MultMatrix_ExpC<float>     | 16.0 M mul/s  | 48.6 M mul/s  | 5.442 Gflops    | 304 %
MultMatrix_ExpR<float>     | 15.8 M mul/s  | 48.4 M mul/s  | 5.416 Gflops    | 306 %
MultMatrixF_NEON           | 53.2 M mul/s  | 169.8 M mul/s | 19.022 Gflops   | 319 %
MultMatrixF_NEON_type2     | 39.7 M mul/s  | 132.3 M mul/s | 14.815 Gflops   | 333 %
MultMatrix_ForC<double>    | 12.7 M mul/s  | 40.6 M mul/s  | 4.553 Gflops    | 320 %
MultMatrix_ForR<double>    | 16.9 M mul/s  | 52.0 M mul/s  | 5.819 Gflops    | 307 %
MultMatrix_ExpC<double>    | 14.9 M mul/s  | 45.3 M mul/s  | 5.068 Gflops    | 303 %
MultMatrix_ExpR<double>    | 15.1 M mul/s  | 45.5 M mul/s  | 5.091 Gflops    | 302 %
CalcInverse<float>         | 10.2 M inv/s  | 39.5 M inv/s  | 9.766 Gflops    | 388 %
CalcInverse<double>        | 4.7 M inv/s   | 23.0 M inv/s  | 5.683 Gflops    | 492 %
CopyMatrix_memcpy<double>  | 81.1 M cpy/s  | 286.2 M cpy/s | 36.628 GB/sec   | 353 %
CopyMatrix_Expand<double>  | 104.4 M cpy/s | 320.5 M cpy/s | 41.025 GB/sec   | 307 %
CopyMatrixD_NEON           | 112.0 M cpy/s | 353.5 M cpy/s | 45.249 GB/sec   | 316 %


Jetson Nano

NVIDIA's single-board computer for deep learning.

For some reason the pin layout seems to follow the Raspberry Pi, so it could be considered a Raspberry Pi relative as well.

CPU : ARM Cortex-A57 (quad-core) @ 1.43GHz

MEM : 4096 MB

OS : The Jetson-specific OS image.

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 20.6 M mul/s  | 82.0 M mul/s  | 9.179 Gflops    | 397 %
MultMatrix_ForR<float>     | 9.8 M mul/s   | 38.8 M mul/s  | 4.347 Gflops    | 397 %
MultMatrix_ExpC<float>     | 10.0 M mul/s  | 39.5 M mul/s  | 4.429 Gflops    | 397 %
MultMatrix_ExpR<float>     | 10.0 M mul/s  | 39.5 M mul/s  | 4.427 Gflops    | 397 %
MultMatrixF_NEON           | 39.9 M mul/s  | 158.8 M mul/s | 17.780 Gflops   | 398 %
MultMatrixF_NEON_type2     | 27.9 M mul/s  | 111.1 M mul/s | 12.441 Gflops   | 398 %
MultMatrix_ForC<double>    | 8.0 M mul/s   | 31.9 M mul/s  | 3.573 Gflops    | 398 %
MultMatrix_ForR<double>    | 9.7 M mul/s   | 38.7 M mul/s  | 4.333 Gflops    | 397 %
MultMatrix_ExpC<double>    | 9.8 M mul/s   | 39.0 M mul/s  | 4.370 Gflops    | 397 %
MultMatrix_ExpR<double>    | 9.7 M mul/s   | 38.7 M mul/s  | 4.333 Gflops    | 397 %
CalcInverse<float>         | 6.0 M inv/s   | 24.0 M inv/s  | 5.937 Gflops    | 400 %
CalcInverse<double>        | 2.3 M inv/s   | 9.0 M inv/s   | 2.232 Gflops    | 397 %
CopyMatrix_memcpy<double>  | 55.5 M cpy/s  | 229.7 M cpy/s | 29.398 GB/sec   | 414 %
CopyMatrix_Expand<double>  | 72.7 M cpy/s  | 306.2 M cpy/s | 39.194 GB/sec   | 421 %
CopyMatrixD_NEON           | 74.8 M cpy/s  | 311.5 M cpy/s | 39.872 GB/sec   | 416 %


Intel Atom z3795

CPU : Intel Atom z3795 (2.4 GHz? x 4)

MEM : 4 GB

OS : Windows10 Professional (32bit)

Compile: Visual Studio 2015, optimize for speed

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 7.4 M mul/s   | 28.1 M mul/s  | 3.143 Gflops    | 378 %
MultMatrix_ForR<float>     | 7.5 M mul/s   | 28.3 M mul/s  | 3.169 Gflops    | 378 %
MultMatrix_ExpC<float>     | 10.0 M mul/s  | 37.4 M mul/s  | 4.190 Gflops    | 376 %
MultMatrix_ExpR<float>     | 9.9 M mul/s   | 37.4 M mul/s  | 4.191 Gflops    | 376 %
MultMatrixF_SSE2           | 26.8 M mul/s  | 100.8 M mul/s | 11.292 Gflops   | 377 %
MultMatrix_ForC<double>    | 7.0 M mul/s   | 26.4 M mul/s  | 2.951 Gflops    | 375 %
MultMatrix_ForR<double>    | 7.1 M mul/s   | 26.7 M mul/s  | 2.988 Gflops    | 375 %
MultMatrix_ExpC<double>    | 8.8 M mul/s   | 32.8 M mul/s  | 3.677 Gflops    | 374 %
MultMatrix_ExpR<double>    | 8.8 M mul/s   | 33.0 M mul/s  | 3.700 Gflops    | 376 %
MultMatrixD_SSE2           | 11.0 M mul/s  | 40.9 M mul/s  | 4.576 Gflops    | 373 %
CalcInverse<float>         | 3.5 M inv/s   | 13.1 M inv/s  | 3.230 Gflops    | 378 %
CalcInverse<double>        | 2.5 M inv/s   | 9.5 M inv/s   | 2.350 Gflops    | 377 %
CopyMatrix_memcpy<double>  | 24.9 M cpy/s  | 86.8 M cpy/s  | 11.110 GB/sec   | 348 %
CopyMatrix_Expand<double>  | 48.8 M cpy/s  | 184.3 M cpy/s | 23.585 GB/sec   | 378 %
CopyMatrixD_SSE2           | 87.8 M cpy/s  | 316.7 M cpy/s | 40.540 GB/sec   | 361 %


Intel Pentium 4415Y @ 1.6GHz

PC : Microsoft SurfaceGo

CPU : Intel Pentium 4415Y @ 1.6GHz (1.6 GHz? x 2)

MEM : 8 GB

OS : Windows10 Home (64bit)

Compile: Visual Studio 2017, optimize for speed

Function name              | Single Thread | Multi Thread  | Performance(MT) | MT/ST
---------------------------|---------------|---------------|-----------------|------
MultMatrix_ForC<float>     | 19.7 M mul/s  | 38.1 M mul/s  | 4.266 Gflops    | 194 %
MultMatrix_ForR<float>     | 19.6 M mul/s  | 37.8 M mul/s  | 4.233 Gflops    | 193 %
MultMatrix_ExpC<float>     | 19.5 M mul/s  | 38.8 M mul/s  | 4.351 Gflops    | 199 %
MultMatrix_ExpR<float>     | 19.7 M mul/s  | 39.5 M mul/s  | 4.420 Gflops    | 200 %
MultMatrixF_SSE2           | 78.7 M mul/s  | 157.2 M mul/s | 17.601 Gflops   | 200 %
MultMatrix_ForC<double>    | 19.5 M mul/s  | 37.0 M mul/s  | 4.147 Gflops    | 190 %
MultMatrix_ForR<double>    | 19.3 M mul/s  | 38.0 M mul/s  | 4.261 Gflops    | 197 %
MultMatrix_ExpC<double>    | 19.6 M mul/s  | 37.5 M mul/s  | 4.197 Gflops    | 191 %
MultMatrix_ExpR<double>    | 19.6 M mul/s  | 39.1 M mul/s  | 4.379 Gflops    | 200 %
MultMatrixD_SSE2           | 41.9 M mul/s  | 83.5 M mul/s  | 9.355 Gflops    | 199 %
CalcInverse<float>         | 11.9 M inv/s  | 27.2 M inv/s  | 6.715 Gflops    | 228 %
CalcInverse<double>        | 10.5 M inv/s  | 25.9 M inv/s  | 6.394 Gflops    | 246 %
CopyMatrix_memcpy<double>  | 157.9 M cpy/s | 274.7 M cpy/s | 35.167 GB/sec   | 174 %
CopyMatrix_Expand<double>  | 93.1 M cpy/s  | 186.5 M cpy/s | 23.868 GB/sec   | 200 %
CopyMatrixD_SSE2           | 158.3 M cpy/s | 315.2 M cpy/s | 40.342 GB/sec   | 199 %


Intel Core i7-6700K

CPU : Intel Core i7-6700K (stock clocks, Single=4.2GHz, Multi=4.0GHz, 8 logical / 4 physical cores)

MEM : 16 GB (DDR4-3000 dual channel, overclocked)

OS : Windows10 Professional (64bit)

Compile: Visual Studio 2015, optimize for speed

Function name              | Single Thread  | Multi Thread   | Performance(MT) | MT/ST
---------------------------|----------------|----------------|-----------------|------
MultMatrix_ForC<float>     | 55.5 M mul/s   | 215.7 M mul/s  | 24.160 Gflops   | 389 %
MultMatrix_ForR<float>     | 55.6 M mul/s   | 214.4 M mul/s  | 24.017 Gflops   | 385 %
MultMatrix_ExpC<float>     | 60.6 M mul/s   | 232.6 M mul/s  | 26.049 Gflops   | 384 %
MultMatrix_ExpR<float>     | 59.5 M mul/s   | 232.4 M mul/s  | 26.028 Gflops   | 391 %
MultMatrixF_SSE2           | 207.2 M mul/s  | 794.2 M mul/s  | 88.955 Gflops   | 383 %
MultMatrix_ForC<double>    | 55.9 M mul/s   | 213.0 M mul/s  | 23.861 Gflops   | 381 %
MultMatrix_ForR<double>    | 55.8 M mul/s   | 215.7 M mul/s  | 24.163 Gflops   | 387 %
MultMatrix_ExpC<double>    | 60.4 M mul/s   | 229.3 M mul/s  | 25.687 Gflops   | 380 %
MultMatrix_ExpR<double>    | 60.4 M mul/s   | 230.8 M mul/s  | 25.854 Gflops   | 382 %
MultMatrixD_SSE2           | 110.6 M mul/s  | 422.0 M mul/s  | 47.266 Gflops   | 381 %
MultMatrixD_AVX            | 220.9 M mul/s  | 859.0 M mul/s  | 96.203 Gflops   | 389 %
MultMatrixD_AVX_FMA        | 247.0 M mul/s  | 959.3 M mul/s  | 107.445 Gflops  | 388 %
MultMatrixD_AVX_FMA_type2  | 266.1 M mul/s  | 1025.5 M mul/s | 114.857 Gflops  | 385 %
CalcInverse<float>         | 30.7 M inv/s   | 133.3 M inv/s  | 32.928 Gflops   | 434 %
CalcInverse<double>        | 27.1 M inv/s   | 130.0 M inv/s  | 32.109 Gflops   | 480 %
CalcInverseD_AVX2_FMA      | 61.8 M inv/s   | 262.6 M inv/s  | 64.873 Gflops   | 425 %
CopyMatrix_memcpy<double>  | 461.7 M cpy/s  | 1670.1 M cpy/s | 213.766 GB/sec  | 362 %
CopyMatrix_Expand<double>  | 240.3 M cpy/s  | 923.4 M cpy/s  | 118.198 GB/sec  | 384 %
CopyMatrixD_SSE2           | 462.2 M cpy/s  | 1775.9 M cpy/s | 227.311 GB/sec  | 384 %
CopyMatrixD_AVX            | 652.0 M cpy/s  | 2290.2 M cpy/s | 293.147 GB/sec  | 351 %


Closing Remarks

A summary of the overall findings from these experiments.


  • With g++ -O3 on ARM, the compiler sometimes auto-vectorizes the code and delivers a dramatic speedup (e.g. MultMatrix_ForC on the NanoPi NEO2).

  • However, there are also cases where vectorization does not work out and performance stays low (e.g. MultMatrix_ForR on the NanoPi NEO2).

  • Hand-written NEON is fast after all (heh).

  • For small copies, direct assignment is often faster than memcpy (see the sketch below).

  • The Arm Cortex-A53 and the Atom z3795 appear to deliver roughly the same performance per clock.

  • The performance gap between the i7-6700K and the Cortex-A53 is roughly 20x.

  • The inverse matrix calculation (closed-form version) does not get that much faster even with SIMD...
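
Regarding the memcpy-vs-assignment point, the following is a minimal sketch of what "direct assignment" means here; the actual CopyMatrix_Expand in the repository may be written differently.

// Illustrative sketches only -- not the repository's CopyMatrix_* functions.
#include <cstring>

void CopyMatrix_memcpy_sketch( double dst[16], const double src[16] )
{
    std::memcpy( dst, src, sizeof( double ) * 16 );
}

void CopyMatrix_Expand_sketch( double dst[16], const double src[16] )
{
    // Fully unrolled element-by-element assignment; for a fixed 16-element
    // copy the compiler can emit a handful of wide loads/stores without the
    // call overhead and size handling of memcpy.
    dst[0]  = src[0];   dst[1]  = src[1];   dst[2]  = src[2];   dst[3]  = src[3];
    dst[4]  = src[4];   dst[5]  = src[5];   dst[6]  = src[6];   dst[7]  = src[7];
    dst[8]  = src[8];   dst[9]  = src[9];   dst[10] = src[10];  dst[11] = src[11];
    dst[12] = src[12];  dst[13] = src[13];  dst[14] = src[14];  dst[15] = src[15];
}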