Overview
This update adds the following:
- A NEON (Arm) version. (float only, since double is not available in NEON here)
- float 4x4 matrix multiplication (plain, SSE2, and NEON versions)
- Inverse matrix calculation via the closed-form formula (plain and AVX2_FMA versions)
- Matrix copy benchmarks (memcpy version, direct-assignment version, various SIMD versions)
- Multi-platform support
Source code
The whole project is on git:
https://github.com/blue777/NanoPi-NEO
The code relevant to this article is in the following two files. Compiling just PerfTest_Matrix.cpp is enough!
- PerfTest_Matrix.cpp
- common/multithread_tools.h
Compile switches
The instruction-set variants can be enabled or disabled by editing the defines at the top of the file. Toggle them to suit your compiler and target environment.
//#define ENABLE_SSE2
//#define ENABLE_AVX
//#define ENABLE_AVX_FMA
//#define ENABLE_AVX2_FMA
#define ENABLE_NEON
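For illustration, these switches presumably gate both the intrinsic headers and the corresponding implementations, roughly as sketched below (an assumption about the layout; the actual arrangement in PerfTest_Matrix.cpp may differ):
    // Sketch: how the ENABLE_* switches could gate headers and code.
    #if defined(ENABLE_SSE2) || defined(ENABLE_AVX) || defined(ENABLE_AVX_FMA) || defined(ENABLE_AVX2_FMA)
    #include <immintrin.h>   // x86 SIMD intrinsics (_mm_loadu_ps, ...)
    #endif
    #ifdef ENABLE_NEON
    #include <arm_neon.h>    // Arm NEON intrinsics (vld1q_f32, ...)
    #endif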
4x4 float matrix multiplication, SSE2 version
Each scalar of base is broadcast with _mm_load1_ps and multiplied against the four preloaded columns of mult; the four products are then summed pairwise.
void MultMatrixF_SSE2( float result[16], const float base[16], const float mult[16] )
{
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    // Preload the four columns of mult.
    xmm4 = _mm_loadu_ps( &mult[0] );
    xmm5 = _mm_loadu_ps( &mult[4] );
    xmm6 = _mm_loadu_ps( &mult[8] );
    xmm7 = _mm_loadu_ps( &mult[12] );

    // column0: broadcast each scalar of base[0..3], multiply with the
    // columns of mult, then sum the four products pairwise.
    xmm0 = _mm_load1_ps( &base[0] );
    xmm1 = _mm_load1_ps( &base[1] );
    xmm2 = _mm_load1_ps( &base[2] );
    xmm3 = _mm_load1_ps( &base[3] );
    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );
    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );
    _mm_storeu_ps( &result[0], xmm0 );

    // column1
    xmm0 = _mm_load1_ps( &base[4] );
    xmm1 = _mm_load1_ps( &base[5] );
    xmm2 = _mm_load1_ps( &base[6] );
    xmm3 = _mm_load1_ps( &base[7] );
    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );
    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );
    _mm_storeu_ps( &result[4], xmm0 );

    // column2
    xmm0 = _mm_load1_ps( &base[8] );
    xmm1 = _mm_load1_ps( &base[9] );
    xmm2 = _mm_load1_ps( &base[10] );
    xmm3 = _mm_load1_ps( &base[11] );
    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );
    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );
    _mm_storeu_ps( &result[8], xmm0 );

    // column3
    xmm0 = _mm_load1_ps( &base[12] );
    xmm1 = _mm_load1_ps( &base[13] );
    xmm2 = _mm_load1_ps( &base[14] );
    xmm3 = _mm_load1_ps( &base[15] );
    xmm0 = _mm_mul_ps( xmm0, xmm4 );
    xmm1 = _mm_mul_ps( xmm1, xmm5 );
    xmm2 = _mm_mul_ps( xmm2, xmm6 );
    xmm3 = _mm_mul_ps( xmm3, xmm7 );
    xmm0 = _mm_add_ps( xmm0, xmm1 );
    xmm2 = _mm_add_ps( xmm2, xmm3 );
    xmm0 = _mm_add_ps( xmm0, xmm2 );
    _mm_storeu_ps( &result[12], xmm0 );
}
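For reference, here is a scalar sketch of what the SIMD code computes. MultMatrixF_Ref is my own name, not something from PerfTest_Matrix.cpp (the benchmark's MultMatrix_For*/Exp* templates play this role there); the indexing matches the intrinsics above, so whether you read it as base×mult or mult×base depends on your storage convention.
    // Scalar equivalent of MultMatrixF_SSE2: result[c*4 + r] is the dot
    // product of base's 4-element group c with lane r of mult's groups.
    void MultMatrixF_Ref( float result[16], const float base[16], const float mult[16] )
    {
        for ( int c = 0; c < 4; c++ )
            for ( int r = 0; r < 4; r++ )
            {
                float sum = 0.0f;
                for ( int k = 0; k < 4; k++ )
                    sum += base[c * 4 + k] * mult[k * 4 + r];
                result[c * 4 + r] = sum;
            }
    }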
4x4 float matrix multiplication, NEON version
Matrix multiplication written with almost the same instruction structure as the float SSE2 version. vmulq_n_f32 multiplies a whole vector by a scalar, so the SSE2 broadcast-then-multiply pair becomes a single intrinsic.
void MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0, r1, r2, r3;

    // Preload the four columns of mult.
    c0 = vld1q_f32( &mult[0] );
    c1 = vld1q_f32( &mult[4] );
    c2 = vld1q_f32( &mult[8] );
    c3 = vld1q_f32( &mult[12] );

    // column 0: scale each column of mult by a scalar from base,
    // then sum the four products pairwise.
    r0 = vmulq_n_f32( c0, base[0] );
    r1 = vmulq_n_f32( c1, base[1] );
    r2 = vmulq_n_f32( c2, base[2] );
    r3 = vmulq_n_f32( c3, base[3] );
    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );
    vst1q_f32( &result[0], r0 );

    // column 1
    r0 = vmulq_n_f32( c0, base[4] );
    r1 = vmulq_n_f32( c1, base[5] );
    r2 = vmulq_n_f32( c2, base[6] );
    r3 = vmulq_n_f32( c3, base[7] );
    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );
    vst1q_f32( &result[4], r0 );

    // column 2
    r0 = vmulq_n_f32( c0, base[8] );
    r1 = vmulq_n_f32( c1, base[9] );
    r2 = vmulq_n_f32( c2, base[10] );
    r3 = vmulq_n_f32( c3, base[11] );
    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );
    vst1q_f32( &result[8], r0 );

    // column 3
    r0 = vmulq_n_f32( c0, base[12] );
    r1 = vmulq_n_f32( c1, base[13] );
    r2 = vmulq_n_f32( c2, base[14] );
    r3 = vmulq_n_f32( c3, base[15] );
    r0 = vaddq_f32( r0, r1 );
    r2 = vaddq_f32( r2, r3 );
    r0 = vaddq_f32( r0, r2 );
    vst1q_f32( &result[12], r0 );
}
4x4 float matrix multiplication, NEON version Type 2
A variant using slightly different instructions: vmlaq_n_f32 fuses the multiply and the accumulate into a single multiply-accumulate intrinsic. Fewer lines of code, but how will it perform?
void MultMatrixF_NEON_type2( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0;

    // Preload the four columns of mult.
    c0 = vld1q_f32( &mult[0] );
    c1 = vld1q_f32( &mult[4] );
    c2 = vld1q_f32( &mult[8] );
    c3 = vld1q_f32( &mult[12] );

    // column 0: one multiply followed by three multiply-accumulates.
    r0 = vmulq_n_f32( c0, base[0] );
    r0 = vmlaq_n_f32( r0, c1, base[1] );
    r0 = vmlaq_n_f32( r0, c2, base[2] );
    r0 = vmlaq_n_f32( r0, c3, base[3] );
    vst1q_f32( &result[0], r0 );

    // column 1
    r0 = vmulq_n_f32( c0, base[4] );
    r0 = vmlaq_n_f32( r0, c1, base[5] );
    r0 = vmlaq_n_f32( r0, c2, base[6] );
    r0 = vmlaq_n_f32( r0, c3, base[7] );
    vst1q_f32( &result[4], r0 );

    // column 2
    r0 = vmulq_n_f32( c0, base[8] );
    r0 = vmlaq_n_f32( r0, c1, base[9] );
    r0 = vmlaq_n_f32( r0, c2, base[10] );
    r0 = vmlaq_n_f32( r0, c3, base[11] );
    vst1q_f32( &result[8], r0 );

    // column 3
    r0 = vmulq_n_f32( c0, base[12] );
    r0 = vmlaq_n_f32( r0, c1, base[13] );
    r0 = vmlaq_n_f32( r0, c2, base[14] );
    r0 = vmlaq_n_f32( r0, c3, base[15] );
    vst1q_f32( &result[12], r0 );
}
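To sanity-check the SIMD variants against each other, a tiny harness like the following can compare them with the scalar sketch above (illustrative only, not part of the repository):
    #include <cmath>
    #include <cstdio>

    int main()
    {
        float a[16], b[16], ref[16], simd[16];
        for ( int i = 0; i < 16; i++ ) { a[i] = (float)(i + 1); b[i] = (float)(16 - i); }

        MultMatrixF_Ref( ref, a, b );            // scalar sketch from earlier
        MultMatrixF_NEON_type2( simd, a, b );    // or MultMatrixF_NEON / MultMatrixF_SSE2

        for ( int i = 0; i < 16; i++ )
            if ( std::fabs( ref[i] - simd[i] ) > 1e-3f )
                printf( "mismatch at %d: %f vs %f\n", i, ref[i], simd[i] );
        printf( "done\n" );
        return 0;
    }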
Results
I measured on the machines at hand. There are about six more boards I could test, but for now only the main environments. In the tables below, Performance(MT) is derived from the multi-thread rate: one 4x4 multiply costs 64 multiplications plus 48 additions, i.e. 112 flops, so Gflops = mul/s × 112; for the copy tests, GB/sec = cpy/s × 128 bytes (one 4x4 double matrix). MT/ST is the multi-thread speedup over a single thread.
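The actual benchmark drives each function through common/multithread_tools.h, whose API I won't reproduce here; a plain std::thread equivalent of the timing loop would look roughly like this:
    #include <chrono>
    #include <thread>
    #include <vector>

    // Hypothetical timing loop: run fn in N threads and return total ops/sec.
    template <typename Fn>
    double MeasureOpsPerSec( Fn fn, int threads, long itersPerThread )
    {
        auto start = std::chrono::steady_clock::now();

        std::vector<std::thread> pool;
        for ( int t = 0; t < threads; t++ )
            pool.emplace_back( [&]{ for ( long i = 0; i < itersPerThread; i++ ) fn(); } );
        for ( auto& th : pool )
            th.join();

        std::chrono::duration<double> sec = std::chrono::steady_clock::now() - start;
        return (double)threads * (double)itersPerThread / sec.count();
    }
Calling this with threads = 1 versus one thread per core gives the Single/Multi Thread columns; multiplying the result by 112 flops recovers the Gflops figures.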
NanoPi-NEO
A relative of the Raspberry Pi.
CPU : Allwinner H3 (Cortex-A7 1.2GHz x 4)
MEM : 512 MB
OS : Ubuntu 16.04 LTS 32bit (Official rom image, kernel=3.4.39)
Compile Command : g++ -O3 -pthread -std=c++11 -mfpu=neon PerfTest_Matrix.cpp
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 2.3 M mul/s | 9.0 M mul/s | 1.006 Gflops | 396 % |
MultMatrix_ForR<float> | 2.3 M mul/s | 9.2 M mul/s | 1.029 Gflops | 400 % |
MultMatrix_ExpC<float> | 2.9 M mul/s | 11.4 M mul/s | 1.274 Gflops | 393 % |
MultMatrix_ExpR<float> | 2.9 M mul/s | 11.6 M mul/s | 1.296 Gflops | 401 % |
MultMatrixF_NEON | 5.8 M mul/s | 22.5 M mul/s | 2.518 Gflops | 385 % |
MultMatrixF_NEON_type2 | 6.0 M mul/s | 23.0 M mul/s | 2.579 Gflops | 386 % |
MultMatrix_ForC<double> | 1.7 M mul/s | 6.7 M mul/s | 0.749 Gflops | 403 % |
MultMatrix_ForR<double> | 1.7 M mul/s | 6.7 M mul/s | 0.747 Gflops | 400 % |
MultMatrix_ExpC<double> | 2.0 M mul/s | 8.1 M mul/s | 0.907 Gflops | 397 % |
MultMatrix_ExpR<double> | 2.0 M mul/s | 8.1 M mul/s | 0.912 Gflops | 400 % |
CalcInverse<float> | 1.9 M inv/s | 7.5 M inv/s | 1.842 Gflops | 399 % |
CalcInverse<double> | 1.2 M inv/s | 4.6 M inv/s | 1.131 Gflops | 397 % |
CopyMatrix_memcpy<double> | 13.6 M cpy/s | 55.8 M cpy/s | 7.140 GB/sec | 411 % |
CopyMatrix_Expand<double> | 27.7 M cpy/s | 111.1 M cpy/s | 14.224 GB/sec | 401 % |
CopyMatrixD_NEON | 21.0 M cpy/s | 83.9 M cpy/s | 10.738 GB/sec | 400 % |
NanoPi NEO2
A relative of the Raspberry Pi.
CPU : Allwinner H5 (Cortex-A53 1.0 GHz x 4)
MEM : 512 MB
OS : Ubuntu 16.04 LTS 64bit (Official rom image, kernel=4.11.2)
Compile Command : g++ -std=c++11 -pthread -O3 PerfTest_Matrix.cpp
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 5.8 M mul/s | 24.4 M mul/s | 2.729 Gflops | 421 % |
MultMatrix_ForR<float> | 2.5 M mul/s | 10.1 M mul/s | 1.134 Gflops | 397 % |
MultMatrix_ExpC<float> | 2.7 M mul/s | 11.2 M mul/s | 1.253 Gflops | 413 % |
MultMatrix_ExpR<float> | 2.8 M mul/s | 11.2 M mul/s | 1.253 Gflops | 404 % |
MultMatrixF_NEON | 10.2 M mul/s | 41.2 M mul/s | 4.619 Gflops | 406 % |
MultMatrixF_NEON_type2 | 9.1 M mul/s | 36.7 M mul/s | 4.110 Gflops | 403 % |
MultMatrix_ForC<double> | 2.6 M mul/s | 10.2 M mul/s | 1.144 Gflops | 397 % |
MultMatrix_ForR<double> | 2.3 M mul/s | 9.3 M mul/s | 1.042 Gflops | 404 % |
MultMatrix_ExpC<double> | 2.6 M mul/s | 10.2 M mul/s | 1.140 Gflops | 398 % |
MultMatrix_ExpR<double> | 2.5 M mul/s | 10.2 M mul/s | 1.139 Gflops | 403 % |
CalcInverse<float> | 2.1 M inv/s | 8.6 M inv/s | 2.115 Gflops | 403 % |
CalcInverse<double> | 1.8 M inv/s | 7.5 M inv/s | 1.854 Gflops | 409 % |
CopyMatrix_memcpy<double> | 20.1 M cpy/s | 79.8 M cpy/s | 10.209 GB/sec | 397 % |
CopyMatrix_Expand<double> | 29.5 M cpy/s | 117.2 M cpy/s | 14.998 GB/sec | 397 % |
CopyMatrixD_NEON | 23.3 M cpy/s | 92.7 M cpy/s | 11.869 GB/sec | 397 % |
NanoPi NEO4
A relative of the Raspberry Pi.
CPU : RockChip RK3399 (Cortex-A72 1.8GHz x2 + Cortex-A53 1.4GHz x4)
MEM : 1024 MB
OS : Ubuntu 18.04 64bit (Official rom image FriendlyDesktop)
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 27.6 M mul/s | 92.9 M mul/s | 10.407 Gflops | 336 % |
MultMatrix_ForR<float> | 18.4 M mul/s | 50.6 M mul/s | 5.671 Gflops | 275 % |
MultMatrix_ExpC<float> | 16.0 M mul/s | 48.6 M mul/s | 5.442 Gflops | 304 % |
MultMatrix_ExpR<float> | 15.8 M mul/s | 48.4 M mul/s | 5.416 Gflops | 306 % |
MultMatrixF_NEON | 53.2 M mul/s | 169.8 M mul/s | 19.022 Gflops | 319 % |
MultMatrixF_NEON_type2 | 39.7 M mul/s | 132.3 M mul/s | 14.815 Gflops | 333 % |
MultMatrix_ForC<double> | 12.7 M mul/s | 40.6 M mul/s | 4.553 Gflops | 320 % |
MultMatrix_ForR<double> | 16.9 M mul/s | 52.0 M mul/s | 5.819 Gflops | 307 % |
MultMatrix_ExpC<double> | 14.9 M mul/s | 45.3 M mul/s | 5.068 Gflops | 303 % |
MultMatrix_ExpR<double> | 15.1 M mul/s | 45.5 M mul/s | 5.091 Gflops | 302 % |
CalcInverse<float> | 10.2 M inv/s | 39.5 M inv/s | 9.766 Gflops | 388 % |
CalcInverse<double> | 4.7 M inv/s | 23.0 M inv/s | 5.683 Gflops | 492 % |
CopyMatrix_memcpy<double> | 81.1 M cpy/s | 286.2 M cpy/s | 36.628 GB/sec | 353 % |
CopyMatrix_Expand<double> | 104.4 M cpy/s | 320.5 M cpy/s | 41.025 GB/sec | 307 % |
CopyMatrixD_NEON | 112.0 M cpy/s | 353.5 M cpy/s | 45.249 GB/sec | 316 % |
Rock Pi S
CPU : ARM Cortex-A35 (quad-core) @ ?.?? GHz
MEM : 512 MB
OS : Debian
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 4.4 M mul/s | 17.8 M mul/s | 1.991 Gflops | 400 % |
MultMatrix_ForR<float> | 2.2 M mul/s | 8.8 M mul/s | 0.990 Gflops | 400 % |
MultMatrix_ExpC<float> | 2.5 M mul/s | 9.9 M mul/s | 1.106 Gflops | 400 % |
MultMatrix_ExpR<float> | 2.5 M mul/s | 9.8 M mul/s | 1.102 Gflops | 399 % |
MultMatrixF_NEON | 8.8 M mul/s | 35.1 M mul/s | 3.927 Gflops | 400 % |
MultMatrixF_NEON_type2 | 7.9 M mul/s | 31.5 M mul/s | 3.526 Gflops | 400 % |
MultMatrix_ForC<double> | 2.4 M mul/s | 9.8 M mul/s | 1.093 Gflops | 400 % |
MultMatrix_ForR<double> | 2.2 M mul/s | 8.8 M mul/s | 0.989 Gflops | 400 % |
MultMatrix_ExpC<double> | 2.5 M mul/s | 9.8 M mul/s | 1.103 Gflops | 400 % |
MultMatrix_ExpR<double> | 2.5 M mul/s | 9.8 M mul/s | 1.103 Gflops | 400 % |
CalcInverse<float> | 1.7 M inv/s | 6.9 M inv/s | 1.694 Gflops | 400 % |
CalcInverse<double> | 1.4 M inv/s | 5.5 M inv/s | 1.355 Gflops | 400 % |
CopyMatrix_memcpy<double> | 17.8 M cpy/s | 71.3 M cpy/s | 9.121 GB/sec | 400 % |
CopyMatrix_Expand<double> | 19.2 M cpy/s | 76.8 M cpy/s | 9.828 GB/sec | 400 % |
CopyMatrixD_NEON | 27.7 M cpy/s | 110.8 M cpy/s | 14.187 GB/sec | 400 % |
Jetson Nano
NVIDIA's SBC for deep learning.
For some reason its pin layout follows the Raspberry Pi's, so it could be counted as a Raspberry Pi relative too.
CPU : ARM Cortex-A57 (quad-core) @ 1.43GHz
MEM : 4096 MB
OS : The official image for the Jetson.
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 20.6 M mul/s | 82.0 M mul/s | 9.179 Gflops | 397 % |
MultMatrix_ForR<float> | 9.8 M mul/s | 38.8 M mul/s | 4.347 Gflops | 397 % |
MultMatrix_ExpC<float> | 10.0 M mul/s | 39.5 M mul/s | 4.429 Gflops | 397 % |
MultMatrix_ExpR<float> | 10.0 M mul/s | 39.5 M mul/s | 4.427 Gflops | 397 % |
MultMatrixF_NEON | 39.9 M mul/s | 158.8 M mul/s | 17.780 Gflops | 398 % |
MultMatrixF_NEON_type2 | 27.9 M mul/s | 111.1 M mul/s | 12.441 Gflops | 398 % |
MultMatrix_ForC<double> | 8.0 M mul/s | 31.9 M mul/s | 3.573 Gflops | 398 % |
MultMatrix_ForR<double> | 9.7 M mul/s | 38.7 M mul/s | 4.333 Gflops | 397 % |
MultMatrix_ExpC<double> | 9.8 M mul/s | 39.0 M mul/s | 4.370 Gflops | 397 % |
MultMatrix_ExpR<double> | 9.7 M mul/s | 38.7 M mul/s | 4.333 Gflops | 397 % |
CalcInverse<float> | 6.0 M inv/s | 24.0 M inv/s | 5.937 Gflops | 400 % |
CalcInverse<double> | 2.3 M inv/s | 9.0 M inv/s | 2.232 Gflops | 397 % |
CopyMatrix_memcpy<double> | 55.5 M cpy/s | 229.7 M cpy/s | 29.398 GB/sec | 414 % |
CopyMatrix_Expand<double> | 72.7 M cpy/s | 306.2 M cpy/s | 39.194 GB/sec | 421 % |
CopyMatrixD_NEON | 74.8 M cpy/s | 311.5 M cpy/s | 39.872 GB/sec | 416 % |
Intel Atom z3795
CPU : Intel Atom z3795 (2.4 GHz? x 4)
MEM : 4 GB
OS : Windows10 Professional (32bit)
Compile : VisualStudio 2015, speed-optimized build
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 7.4 M mul/s | 28.1 M mul/s | 3.143 Gflops | 378 % |
MultMatrix_ForR<float> | 7.5 M mul/s | 28.3 M mul/s | 3.169 Gflops | 378 % |
MultMatrix_ExpC<float> | 10.0 M mul/s | 37.4 M mul/s | 4.190 Gflops | 376 % |
MultMatrix_ExpR<float> | 9.9 M mul/s | 37.4 M mul/s | 4.191 Gflops | 376 % |
MultMatrixF_SSE2 | 26.8 M mul/s | 100.8 M mul/s | 11.292 Gflops | 377 % |
MultMatrix_ForC<double> | 7.0 M mul/s | 26.4 M mul/s | 2.951 Gflops | 375 % |
MultMatrix_ForR<double> | 7.1 M mul/s | 26.7 M mul/s | 2.988 Gflops | 375 % |
MultMatrix_ExpC<double> | 8.8 M mul/s | 32.8 M mul/s | 3.677 Gflops | 374 % |
MultMatrix_ExpR<double> | 8.8 M mul/s | 33.0 M mul/s | 3.700 Gflops | 376 % |
MultMatrixD_SSE2 | 11.0 M mul/s | 40.9 M mul/s | 4.576 Gflops | 373 % |
CalcInverse<float> | 3.5 M inv/s | 13.1 M inv/s | 3.230 Gflops | 378 % |
CalcInverse<double> | 2.5 M inv/s | 9.5 M inv/s | 2.350 Gflops | 377 % |
CopyMatrix_memcpy<double> | 24.9 M cpy/s | 86.8 M cpy/s | 11.110 GB/sec | 348 % |
CopyMatrix_Expand<double> | 48.8 M cpy/s | 184.3 M cpy/s | 23.585 GB/sec | 378 % |
CopyMatrixD_SSE2 | 87.8 M cpy/s | 316.7 M cpy/s | 40.540 GB/sec | 361 % |
Intel Pentium 4415Y @ 1.6GHz
PC : Microsoft SurfaceGo
CPU : Intel Pentium 4415Y @ 1.6GHz (1.6 GHz? x 2)
MEM : 8 GB
OS : Windows10 Home (64bit)
Compile : VisualStudio 2017, speed-optimized build
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 19.7 M mul/s | 38.1 M mul/s | 4.266 Gflops | 194 % |
MultMatrix_ForR<float> | 19.6 M mul/s | 37.8 M mul/s | 4.233 Gflops | 193 % |
MultMatrix_ExpC<float> | 19.5 M mul/s | 38.8 M mul/s | 4.351 Gflops | 199 % |
MultMatrix_ExpR<float> | 19.7 M mul/s | 39.5 M mul/s | 4.420 Gflops | 200 % |
MultMatrixF_SSE2 | 78.7 M mul/s | 157.2 M mul/s | 17.601 Gflops | 200 % |
MultMatrix_ForC<double> | 19.5 M mul/s | 37.0 M mul/s | 4.147 Gflops | 190 % |
MultMatrix_ForR<double> | 19.3 M mul/s | 38.0 M mul/s | 4.261 Gflops | 197 % |
MultMatrix_ExpC<double> | 19.6 M mul/s | 37.5 M mul/s | 4.197 Gflops | 191 % |
MultMatrix_ExpR<double> | 19.6 M mul/s | 39.1 M mul/s | 4.379 Gflops | 200 % |
MultMatrixD_SSE2 | 41.9 M mul/s | 83.5 M mul/s | 9.355 Gflops | 199 % |
CalcInverse<float> | 11.9 M inv/s | 27.2 M inv/s | 6.715 Gflops | 228 % |
CalcInverse<double> | 10.5 M inv/s | 25.9 M inv/s | 6.394 Gflops | 246 % |
CopyMatrix_memcpy<double> | 157.9 M cpy/s | 274.7 M cpy/s | 35.167 GB/sec | 174 % |
CopyMatrix_Expand<double> | 93.1 M cpy/s | 186.5 M cpy/s | 23.868 GB/sec | 200 % |
CopyMatrixD_SSE2 | 158.3 M cpy/s | 315.2 M cpy/s | 40.342 GB/sec | 199 % |
Intel Core i7-6700K
CPU : Intel Core i7-6700K (stock clocks, Single=4.2GHz Multi=4.0GHz, Logical=8core, Physical=4core)
MEM : 16GB (DDR4-3000 DualChannel, Overclocked)
OS : Windows10 Professional (64bit)
Compile : VisualStudio 2015, speed-optimized build
Function name | Single Thread | Multi Thread | Performance(MT) | MT/ST |
---|---|---|---|---|
MultMatrix_ForC<float> | 55.5 M mul/s | 215.7 M mul/s | 24.160 Gflops | 389 % |
MultMatrix_ForR<float> | 55.6 M mul/s | 214.4 M mul/s | 24.017 Gflops | 385 % |
MultMatrix_ExpC<float> | 60.6 M mul/s | 232.6 M mul/s | 26.049 Gflops | 384 % |
MultMatrix_ExpR<float> | 59.5 M mul/s | 232.4 M mul/s | 26.028 Gflops | 391 % |
MultMatrixF_SSE2 | 207.2 M mul/s | 794.2 M mul/s | 88.955 Gflops | 383 % |
MultMatrix_ForC<double> | 55.9 M mul/s | 213.0 M mul/s | 23.861 Gflops | 381 % |
MultMatrix_ForR<double> | 55.8 M mul/s | 215.7 M mul/s | 24.163 Gflops | 387 % |
MultMatrix_ExpC<double> | 60.4 M mul/s | 229.3 M mul/s | 25.687 Gflops | 380 % |
MultMatrix_ExpR<double> | 60.4 M mul/s | 230.8 M mul/s | 25.854 Gflops | 382 % |
MultMatrixD_SSE2 | 110.6 M mul/s | 422.0 M mul/s | 47.266 Gflops | 381 % |
MultMatrixD_AVX | 220.9 M mul/s | 859.0 M mul/s | 96.203 Gflops | 389 % |
MultMatrixD_AVX_FMA | 247.0 M mul/s | 959.3 M mul/s | 107.445 Gflops | 388 % |
MultMatrixD_AVX_FMA_type2 | 266.1 M mul/s | 1025.5 M mul/s | 114.857 Gflops | 385 % |
CalcInverse<float> | 30.7 M inv/s | 133.3 M inv/s | 32.928 Gflops | 434 % |
CalcInverse<double> | 27.1 M inv/s | 130.0 M inv/s | 32.109 Gflops | 480 % |
CalcInverseD_AVX2_FMA | 61.8 M inv/s | 262.6 M inv/s | 64.873 Gflops | 425 % |
CopyMatrix_memcpy<double> | 461.7 M cpy/s | 1670.1 M cpy/s | 213.766 GB/sec | 362 % |
CopyMatrix_Expand<double> | 240.3 M cpy/s | 923.4 M cpy/s | 118.198 GB/sec | 384 % |
CopyMatrixD_SSE2 | 462.2 M cpy/s | 1775.9 M cpy/s | 227.311 GB/sec | 384 % |
CopyMatrixD_AVX | 652.0 M cpy/s | 2290.2 M cpy/s | 293.147 GB/sec | 351 % |
Closing remarks
A summary of the results:
- On ARM, g++'s -O3 option sometimes auto-vectorizes code and yields a dramatic speedup (MultMatrix_ForC<float> on the NanoPi NEO2).
- However, vectorization can also fail, leaving performance flat (MultMatrix_ForR<float> on the NanoPi NEO2).
- Hand-written NEON code is fast, as expected (heh).
- For copies of small amounts of data, direct assignment is often faster than memcpy (a sketch of both styles follows this list).
- Per clock, the Arm Cortex-A53 and the Atom z3795 look roughly equal in performance.
- The performance gap between the i7-6700K and the Cortex-A53 is around 20x.
- The closed-form inverse calculation doesn't get that much faster even with SIMD...
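As a postscript to the memcpy-versus-assignment point, here is a minimal sketch of the two copy styles. It assumes CopyMatrix_Expand is essentially an unrolled element-wise assignment; the real versions in PerfTest_Matrix.cpp are templates and may differ in detail.
    #include <cstring>

    // memcpy-based copy of one 4x4 double matrix (128 bytes).
    void CopyMatrix_memcpy( double dst[16], const double src[16] )
    {
        memcpy( dst, src, sizeof( double ) * 16 );
    }

    // Direct-assignment copy. For a fixed, small size the compiler can
    // fully unroll this and skip memcpy's call and size-dispatch overhead,
    // which is the likely reason this style wins on most boards above.
    void CopyMatrix_Expand( double dst[16], const double src[16] )
    {
        for ( int i = 0; i < 16; i++ )
            dst[i] = src[i];
    }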