概要

• NEON(Arm)版を追加。(ただしNEONではdoubleが使えないのでfloatのみ)

• float の4x4行列掛け算 (通常版、SSE2版、NEON版)

• 逆行列(公式Ver)計算追加 (通常版、AVX2_FMA版)

• 行列のコピー処理計測 (memcpy版、代入版、各種SIMD版)

• マルチプラットフォーム対応

ソースコード

コンパイル対象は PerfTest_Matrix.cpp だけでOK!

• PerfTest_Matrix.cpp

コンパイルスイッチ

```// Compile switches. In this listing only ENABLE_NEON is defined; the four
// x86 switches (SSE2 / AVX / AVX_FMA / AVX2_FMA) are commented out.
// NOTE(review): presumably each switch compiles in the matching SIMD
// variants of the benchmark functions -- confirm against PerfTest_Matrix.cpp.
//#define ENABLE_SSE2
//#define ENABLE_AVX
//#define ENABLE_AVX_FMA
//#define ENABLE_AVX2_FMA
#define ENABLE_NEON
```

4x4行列floatの掛け算 SSE2版

MultMatrixF_SSE2

```
/**
 * Multiply two 4x4 float matrices using SSE/SSE2 intrinsics.
 *
 * For every group c (0..3) and lane i (0..3):
 *
 *     result[4*c + i] = (base[4*c+0] * mult[ 0+i] + base[4*c+1] * mult[ 4+i])
 *                     + (base[4*c+2] * mult[ 8+i] + base[4*c+3] * mult[12+i])
 *
 * This is the same computation, with the same summation order, as
 * MultMatrixF_NEON.
 *
 * @param result  Receives the 16 output floats. Written with unaligned
 *                stores, so no particular alignment is required.
 * @param base    16 input floats; each element is broadcast to all lanes.
 * @param mult    16 input floats; read as four 4-float vectors with
 *                unaligned loads.
 */
void    MultMatrixF_SSE2( float result[16], const float base[16], const float mult[16] )
{
    /* Load all of mult once, before anything is stored to result. */
    const __m128 m0 = _mm_loadu_ps( &mult[0]  );
    const __m128 m1 = _mm_loadu_ps( &mult[4]  );
    const __m128 m2 = _mm_loadu_ps( &mult[8]  );
    const __m128 m3 = _mm_loadu_ps( &mult[12] );

    for( int col = 0; col < 4; ++col )
    {
        const float *b = &base[col * 4];

        /* Broadcast the four scalars of this group and scale the vectors. */
        __m128 t0 = _mm_mul_ps( _mm_set1_ps( b[0] ), m0 );
        __m128 t1 = _mm_mul_ps( _mm_set1_ps( b[1] ), m1 );
        __m128 t2 = _mm_mul_ps( _mm_set1_ps( b[2] ), m2 );
        __m128 t3 = _mm_mul_ps( _mm_set1_ps( b[3] ), m3 );

        /* Pairwise reduction: (t0 + t1) + (t2 + t3). */
        t0 = _mm_add_ps( t0, t1 );
        t2 = _mm_add_ps( t2, t3 );
        t0 = _mm_add_ps( t0, t2 );

        _mm_storeu_ps( &result[col * 4], t0 );
    }
}
```

4x4行列floatの掛け算 NEON命令版

SSE2のfloat版とほぼ同等の命令構成で書いた行列掛け算。

MultMatrixF_NEON

```
/**
 * Multiply two 4x4 float matrices using NEON intrinsics
 * (separate multiply and add steps).
 *
 * For every group c (0..3) and lane i (0..3):
 *
 *     result[4*c + i] = (base[4*c+0] * mult[ 0+i] + base[4*c+1] * mult[ 4+i])
 *                     + (base[4*c+2] * mult[ 8+i] + base[4*c+3] * mult[12+i])
 *
 * @param result  Receives the 16 output floats.
 * @param base    16 input floats; each element is used as a scalar factor.
 * @param mult    16 input floats; loaded as four 4-float vectors up front.
 */
void    MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] )
{
    /* All of mult is loaded before the first store to result. */
    const float32x4_t m0 = vld1q_f32( mult      );
    const float32x4_t m1 = vld1q_f32( mult + 4  );
    const float32x4_t m2 = vld1q_f32( mult + 8  );
    const float32x4_t m3 = vld1q_f32( mult + 12 );

    for( int col = 0; col < 4; ++col )
    {
        const float *b = base + 4 * col;

        /* Scale each vector by the matching scalar of this group. */
        const float32x4_t p0 = vmulq_n_f32( m0, b[0] );
        const float32x4_t p1 = vmulq_n_f32( m1, b[1] );
        const float32x4_t p2 = vmulq_n_f32( m2, b[2] );
        const float32x4_t p3 = vmulq_n_f32( m3, b[3] );

        /* Pairwise reduction: (p0 + p1) + (p2 + p3). */
        const float32x4_t lo = vaddq_f32( p0, p1 );
        const float32x4_t hi = vaddq_f32( p2, p3 );

        vst1q_f32( result + 4 * col, vaddq_f32( lo, hi ) );
    }
}
```

4x4行列floatの掛け算 NEON版 Type2

MultMatrixF_NEON_type2

```
/**
 * Multiply two 4x4 float matrices using NEON intrinsics
 * (multiply-accumulate variant).
 *
 * For every group c (0..3) the four output lanes are built as one
 * accumulation chain:
 *
 *     acc  = mult[0..3]   * base[4*c+0]
 *     acc += mult[4..7]   * base[4*c+1]
 *     acc += mult[8..11]  * base[4*c+2]
 *     acc += mult[12..15] * base[4*c+3]
 *     result[4*c .. 4*c+3] = acc
 *
 * @param result  Receives the 16 output floats.
 * @param base    16 input floats; each element is used as a scalar factor.
 * @param mult    16 input floats; loaded as four 4-float vectors up front.
 */
void    MultMatrixF_NEON_type2( float result[16], const float base[16], const float mult[16] )
{
    /* All of mult is loaded before the first store to result. */
    const float32x4_t m0 = vld1q_f32( mult      );
    const float32x4_t m1 = vld1q_f32( mult + 4  );
    const float32x4_t m2 = vld1q_f32( mult + 8  );
    const float32x4_t m3 = vld1q_f32( mult + 12 );

    for( int col = 0; col < 4; ++col )
    {
        const float *b = base + 4 * col;

        float32x4_t acc = vmulq_n_f32( m0, b[0] );
        acc = vmlaq_n_f32( acc, m1, b[1] );
        acc = vmlaq_n_f32( acc, m2, b[2] );
        acc = vmlaq_n_f32( acc, m3, b[3] );

        vst1q_f32( result + 4 * col, acc );
    }
}
```

測定結果。

あと６機種ほど測れそうなのがあるが、ひとまずメイン環境のみで。

NanoPi-NEO

RaspberryPiの仲間。

CPU : Allwinner H3 (Cortex-A7 1.2GHz x 4)

MEM : 512 MB

OS : Ubuntu 16.04 LTS 32bit (Official rom image, kernel=3.4.39)

Compile Command : `g++ -O3 -pthread -std=c++11 -mfpu=neon PerfTest_Matrix.cpp`

Function name
Performance(ST)
Performance(MT)
Performance(MT) [Gflops / GB/sec]
MT/ST

`MultMatrix_ForC<float>`
2.3 M mul/s
9.0 M mul/s
1.006 Gflops
396 %

`MultMatrix_ForR<float>`
2.3 M mul/s
9.2 M mul/s
1.029 Gflops
400 %

`MultMatrix_ExpC<float>`
2.9 M mul/s
11.4 M mul/s
1.274 Gflops
393 %

`MultMatrix_ExpR<float>`
2.9 M mul/s
11.6 M mul/s
1.296 Gflops
401 %

`MultMatrixF_NEON`
5.8 M mul/s
22.5 M mul/s
2.518 Gflops
385 %

`MultMatrixF_NEON_type2`
6.0 M mul/s
23.0 M mul/s
2.579 Gflops
386 %

`MultMatrix_ForC<double>`
1.7 M mul/s
6.7 M mul/s
0.749 Gflops
403 %

`MultMatrix_ForR<double>`
1.7 M mul/s
6.7 M mul/s
0.747 Gflops
400 %

`MultMatrix_ExpC<double>`
2.0 M mul/s
8.1 M mul/s
0.907 Gflops
397 %

`MultMatrix_ExpR<double>`
2.0 M mul/s
8.1 M mul/s
0.912 Gflops
400 %

`CalcInverse<float>`
1.9 M inv/s
7.5 M inv/s
1.842 Gflops
399 %

`CalcInverse<double>`
1.2 M inv/s
4.6 M inv/s
1.131 Gflops
397 %

`CopyMatrix_memcpy<double>`
13.6 M cpy/s
55.8 M cpy/s
7.140 GB/sec
411 %

`CopyMatrix_Expand<double>`
27.7 M cpy/s
111.1 M cpy/s
14.224 GB/sec
401 %

`CopyMatrixD_NEON`
21.0 M cpy/s
83.9 M cpy/s
10.738 GB/sec
400 %

NanoPi NEO2

RaspberryPiの仲間。

CPU : Allwinner H5 (Cortex-A53 1.0 GHz * 4)

MEM : 512 MB

OS : Ubuntu 16.04 LTS 64bit (Official rom image, kernel=4.11.2)

Compile Command : `g++ -std=c++11 -pthread -O3 PerfTest_Matrix.cpp`

Function name
Performance(ST)
Performance(MT)
Performance(MT) [Gflops / GB/sec]
MT/ST

`MultMatrix_ForC<float>`
5.8 M mul/s
24.4 M mul/s
2.729 Gflops
421 %

`MultMatrix_ForR<float>`
2.5 M mul/s
10.1 M mul/s
1.134 Gflops
397 %

`MultMatrix_ExpC<float>`
2.7 M mul/s
11.2 M mul/s
1.253 Gflops
413 %

`MultMatrix_ExpR<float>`
2.8 M mul/s
11.2 M mul/s
1.253 Gflops
404 %

`MultMatrixF_NEON`
10.2 M mul/s
41.2 M mul/s
4.619 Gflops
406 %

`MultMatrixF_NEON_type2`
9.1 M mul/s
36.7 M mul/s
4.110 Gflops
403 %

`MultMatrix_ForC<double>`
2.6 M mul/s
10.2 M mul/s
1.144 Gflops
397 %

`MultMatrix_ForR<double>`
2.3 M mul/s
9.3 M mul/s
1.042 Gflops
404 %

`MultMatrix_ExpC<double>`
2.6 M mul/s
10.2 M mul/s
1.140 Gflops
398 %

`MultMatrix_ExpR<double>`
2.5 M mul/s
10.2 M mul/s
1.139 Gflops
403 %

`CalcInverse<float>`
2.1 M inv/s
8.6 M inv/s
2.115 Gflops
403 %

`CalcInverse<double>`
1.8 M inv/s
7.5 M inv/s
1.854 Gflops
409 %

`CopyMatrix_memcpy<double>`
20.1 M cpy/s
79.8 M cpy/s
10.209 GB/sec
397 %

`CopyMatrix_Expand<double>`
29.5 M cpy/s
117.2 M cpy/s
14.998 GB/sec
397 %

`CopyMatrixD_NEON`
23.3 M cpy/s
92.7 M cpy/s
11.869 GB/sec
397 %

NanoPi NEO4

RaspberryPiの仲間。

CPU : RockChip RK3399 (Cortex-A72 1.8GHz x2 + Cortex-A53 1.4GHz x4)

MEM : 1024 MB

OS : Ubuntu 18.04 64bit (Official rom image FriendlyDesktop)

Function name
Performance(ST)
Performance(MT)
Performance(MT) [Gflops / GB/sec]
MT/ST

`MultMatrix_ForC<float>`
27.6 M mul/s
92.9 M mul/s
10.407 Gflops
336 %

`MultMatrix_ForR<float>`
18.4 M mul/s
50.6 M mul/s
5.671 Gflops
275 %

`MultMatrix_ExpC<float>`
16.0 M mul/s
48.6 M mul/s
5.442 Gflops
304 %

`MultMatrix_ExpR<float>`
15.8 M mul/s
48.4 M mul/s
5.416 Gflops
306 %

`MultMatrixF_NEON`
53.2 M mul/s
169.8 M mul/s
19.022 Gflops
319 %

`MultMatrixF_NEON_type2`
39.7 M mul/s
132.3 M mul/s
14.815 Gflops
333 %

`MultMatrix_ForC<double>`
12.7 M mul/s
40.6 M mul/s
4.553 Gflops
320 %

`MultMatrix_ForR<double>`
16.9 M mul/s
52.0 M mul/s
5.819 Gflops
307 %

`MultMatrix_ExpC<double>`
14.9 M mul/s
45.3 M mul/s
5.068 Gflops
303 %

`MultMatrix_ExpR<double>`
15.1 M mul/s
45.5 M mul/s
5.091 Gflops
302 %

`CalcInverse<float>`
10.2 M inv/s
39.5 M inv/s
9.766 Gflops
388 %

`CalcInverse<double>`
4.7 M inv/s
23.0 M inv/s
5.683 Gflops
492 %

`CopyMatrix_memcpy<double>`
81.1 M cpy/s
286.2 M cpy/s
36.628 GB/sec
353 %

`CopyMatrix_Expand<double>`
104.4 M cpy/s
320.5 M cpy/s
41.025 GB/sec
307 %

`CopyMatrixD_NEON`
112.0 M cpy/s
353.5 M cpy/s
45.249 GB/sec
316 %

Jetson Nano

NVIDIA の Deep Learning 用 SBC。

なぜか、ピンレイアウトは RaspberryPi に準拠しているっぽいので、ラズパイの仲間でもありそう。

CPU : ARM Cortex-A57 (quad-core) @ 1.43GHz

MEM : 4096 MB

OS : Jetson用。

Function name
Performance(ST)
Performance(MT)
Performance(MT) [Gflops / GB/sec]
MT/ST

`MultMatrix_ForC<float>`
20.6 M mul/s
82.0 M mul/s
9.179 Gflops
397 %

`MultMatrix_ForR<float>`
9.8 M mul/s
38.8 M mul/s
4.347 Gflops
397 %

`MultMatrix_ExpC<float>`
10.0 M mul/s
39.5 M mul/s
4.429 Gflops
397 %

`MultMatrix_ExpR<float>`
10.0 M mul/s
39.5 M mul/s
4.427 Gflops
397 %

`MultMatrixF_NEON`
39.9 M mul/s
158.8 M mul/s
17.780 Gflops
398 %

`MultMatrixF_NEON_type2`
27.9 M mul/s
111.1 M mul/s
12.441 Gflops
398 %

`MultMatrix_ForC<double>`
8.0 M mul/s
31.9 M mul/s
3.573 Gflops
398 %

`MultMatrix_ForR<double>`
9.7 M mul/s
38.7 M mul/s
4.333 Gflops
397 %

`MultMatrix_ExpC<double>`
9.8 M mul/s
39.0 M mul/s
4.370 Gflops
397 %

`MultMatrix_ExpR<double>`
9.7 M mul/s
38.7 M mul/s
4.333 Gflops
397 %

`CalcInverse<float>`
6.0 M inv/s
24.0 M inv/s
5.937 Gflops
400 %

`CalcInverse<double>`
2.3 M inv/s
9.0 M inv/s
2.232 Gflops
397 %

`CopyMatrix_memcpy<double>`
55.5 M cpy/s
229.7 M cpy/s
29.398 GB/sec
414 %

`CopyMatrix_Expand<double>`
72.7 M cpy/s
306.2 M cpy/s
39.194 GB/sec
421 %

`CopyMatrixD_NEON`
74.8 M cpy/s
311.5 M cpy/s
39.872 GB/sec
416 %

Intel Atom z3795

CPU : Intel Atom z3795 (2.4G Hz? x 4)

MEM : 4 GB

OS : Windows10 Professional (32bit)

Compile: VisualStudio 2015, 速度優先

Function name
Performance(ST)
Performance(MT)
Performance(MT) [Gflops / GB/sec]
MT/ST

`MultMatrix_ForC<float>`
7.4 M mul/s
28.1 M mul/s
3.143 Gflops
378 %

`MultMatrix_ForR<float>`
7.5 M mul/s
28.3 M mul/s
3.169 Gflops
378 %

`MultMatrix_ExpC<float>`
10.0 M mul/s
37.4 M mul/s
4.190 Gflops
376 %

`MultMatrix_ExpR<float>`
9.9 M mul/s
37.4 M mul/s
4.191 Gflops
376 %

`MultMatrixF_SSE2`
26.8 M mul/s
100.8 M mul/s
11.292 Gflops
377 %

`MultMatrix_ForC<double>`
7.0 M mul/s
26.4 M mul/s
2.951 Gflops
375 %

`MultMatrix_ForR<double>`
7.1 M mul/s
26.7 M mul/s
2.988 Gflops
375 %

`MultMatrix_ExpC<double>`
8.8 M mul/s
32.8 M mul/s
3.677 Gflops
374 %

`MultMatrix_ExpR<double>`
8.8 M mul/s
33.0 M mul/s
3.700 Gflops
376 %

`MultMatrixD_SSE2`
11.0 M mul/s
40.9 M mul/s
4.576 Gflops
373 %

`CalcInverse<float>`
3.5 M inv/s
13.1 M inv/s
3.230 Gflops
378 %

`CalcInverse<double>`
2.5 M inv/s
9.5 M inv/s
2.350 Gflops
377 %

`CopyMatrix_memcpy<double>`
24.9 M cpy/s
86.8 M cpy/s
11.110 GB/sec
348 %

`CopyMatrix_Expand<double>`
48.8 M cpy/s
184.3 M cpy/s
23.585 GB/sec
378 %

`CopyMatrixD_SSE2`
87.8 M cpy/s
316.7 M cpy/s
40.540 GB/sec
361 %

Intel Pentium 4415Y @ 1.6GHz

PC : Microsoft SurfaceGo

CPU : Intel Pentium 4415Y @ 1.6GHz (1.6G Hz? x 2)

MEM : 8 GB

OS : Windows10 Home (64bit)

Compile: VisualStudio 2017, 速度優先

Function name
Performance(ST)
Performance(MT)
Performance(MT) [Gflops / GB/sec]
MT/ST

`MultMatrix_ForC<float>`
19.7 M mul/s
38.1 M mul/s
4.266 Gflops
194 %

`MultMatrix_ForR<float>`
19.6 M mul/s
37.8 M mul/s
4.233 Gflops
193 %

`MultMatrix_ExpC<float>`
19.5 M mul/s
38.8 M mul/s
4.351 Gflops
199 %

`MultMatrix_ExpR<float>`
19.7 M mul/s
39.5 M mul/s
4.420 Gflops
200 %

`MultMatrixF_SSE2`
78.7 M mul/s
157.2 M mul/s
17.601 Gflops
200 %

`MultMatrix_ForC<double>`
19.5 M mul/s
37.0 M mul/s
4.147 Gflops
190 %

`MultMatrix_ForR<double>`
19.3 M mul/s
38.0 M mul/s
4.261 Gflops
197 %

`MultMatrix_ExpC<double>`
19.6 M mul/s
37.5 M mul/s
4.197 Gflops
191 %

`MultMatrix_ExpR<double>`
19.6 M mul/s
39.1 M mul/s
4.379 Gflops
200 %

`MultMatrixD_SSE2`
41.9 M mul/s
83.5 M mul/s
9.355 Gflops
199 %

`CalcInverse<float>`
11.9 M inv/s
27.2 M inv/s
6.715 Gflops
228 %

`CalcInverse<double>`
10.5 M inv/s
25.9 M inv/s
6.394 Gflops
246 %

`CopyMatrix_memcpy<double>`
157.9 M cpy/s
274.7 M cpy/s
35.167 GB/sec
174 %

`CopyMatrix_Expand<double>`
93.1 M cpy/s
186.5 M cpy/s
23.868 GB/sec
200 %

`CopyMatrixD_SSE2`
158.3 M cpy/s
315.2 M cpy/s
40.342 GB/sec
199 %

Intel Core i7-6700K

CPU : Intel Core i7-6700K(定格動作、Single=4.2GHz　Multi=4.0GHz, Logical=8core, Physical=4core)

MEM : 16GB (DDR4-3000 DualChannel, Overclocked)

OS : Windows10 Professional (64bit)

Compile: VisualStudio 2015, 速度優先

Function name
Performance(ST)
Performance(MT)
Performance(MT) [Gflops / GB/sec]
MT/ST

`MultMatrix_ForC<float>`
55.5 M mul/s
215.7 M mul/s
24.160 Gflops
389 %

`MultMatrix_ForR<float>`
55.6 M mul/s
214.4 M mul/s
24.017 Gflops
385 %

`MultMatrix_ExpC<float>`
60.6 M mul/s
232.6 M mul/s
26.049 Gflops
384 %

`MultMatrix_ExpR<float>`
59.5 M mul/s
232.4 M mul/s
26.028 Gflops
391 %

`MultMatrixF_SSE2`
207.2 M mul/s
794.2 M mul/s
88.955 Gflops
383 %

`MultMatrix_ForC<double>`
55.9 M mul/s
213.0 M mul/s
23.861 Gflops
381 %

`MultMatrix_ForR<double>`
55.8 M mul/s
215.7 M mul/s
24.163 Gflops
387 %

`MultMatrix_ExpC<double>`
60.4 M mul/s
229.3 M mul/s
25.687 Gflops
380 %

`MultMatrix_ExpR<double>`
60.4 M mul/s
230.8 M mul/s
25.854 Gflops
382 %

`MultMatrixD_SSE2`
110.6 M mul/s
422.0 M mul/s
47.266 Gflops
381 %

`MultMatrixD_AVX`
220.9 M mul/s
859.0 M mul/s
96.203 Gflops
389 %

`MultMatrixD_AVX_FMA`
247.0 M mul/s
959.3 M mul/s
107.445 Gflops
388 %

`MultMatrixD_AVX_FMA_type2`
266.1 M mul/s
1025.5 M mul/s
114.857 Gflops
385 %

`CalcInverse<float>`
30.7 M inv/s
133.3 M inv/s
32.928 Gflops
434 %

`CalcInverse<double>`
27.1 M inv/s
130.0 M inv/s
32.109 Gflops
480 %

`CalcInverseD_AVX2_FMA`
61.8 M inv/s
262.6 M inv/s
64.873 Gflops
425 %

`CopyMatrix_memcpy<double>`
461.7 M cpy/s
1670.1 M cpy/s
213.766 GB/sec
362 %

`CopyMatrix_Expand<double>`
240.3 M cpy/s
923.4 M cpy/s
118.198 GB/sec
384 %

`CopyMatrixD_SSE2`
462.2 M cpy/s
1775.9 M cpy/s
227.311 GB/sec
384 %

`CopyMatrixD_AVX`
652.0 M cpy/s
2290.2 M cpy/s
293.147 GB/sec
351 %

終わりに

• ARM系で g++ に -O3 オプションを付けると自動的にベクトル化され、著しく性能が向上する場合がある(例: NanoPi NEO2 の `MultMatrix_ForC<float>`)。

• ただし、上手くベクトル化できず、性能が出ないときもある(例: NanoPi NEO2 の `MultMatrix_ForR<float>`)。

• 自力で書いたNEONは、やはり速い(笑)

• 小容量のデータコピーは、memcpyより直接代入したほうが速いケースが多い。

• Arm Cortex-A53 と Atom z3795 はクロック当たりの性能はほぼ一緒に見える。

• i7-6700K と Cortex-A53 の性能差は20倍程度ある。

• 逆行列計算(公式Ver)は、SIMDを使ってもそこまで速くならない。