Tags: C, neon, AVX2, nanopineo, nanopi-neo4

Speeding Up 4x4 Matrix Multiplication ~Part 2~

Overview

This is the sequel to "Speeding Up 4x4 Matrix Multiplication".

The following have been newly added this time:

  • NEON (Arm) version. (double is not available, so float only)
  • float 4x4 matrix multiplication (plain, SSE2, and NEON versions)
  • Inverse matrix via the closed-form formula (plain and AVX2_FMA versions)
  • Matrix copy measurements (memcpy, assignment, and various SIMD versions)
  • Multi-platform support

ソースコード

全体はgit.
https://github.com/blue777/NanoPi-NEO

今回の関連するコードは以下の2つ。
コンパイルはPerfTest_Matrix.cppだけをやればOK!

  • PerfTest_Matrix.cpp
  • common/multithread_tools.h

Compile Switches

The various instruction-set versions can be enabled or disabled by editing the defines at the top of the file; a sketch of how such switches typically guard the code follows the listing below.
Switch them as appropriate for the compiler and execution environment you use.

PerfTest_Matrix.cpp
//#define ENABLE_SSE2
//#define ENABLE_AVX
//#define ENABLE_AVX_FMA
//#define ENABLE_AVX2_FMA
#define ENABLE_NEON
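
For reference, a minimal sketch of how switches like these are typically used to guard the SIMD headers and implementations. This is an assumption for illustration; the exact structure inside PerfTest_Matrix.cpp may differ.

#ifdef ENABLE_NEON
#include <arm_neon.h>    // NEON intrinsics: float32x4_t, vld1q_f32, vmulq_n_f32, ...
#endif

#if defined(ENABLE_SSE2) || defined(ENABLE_AVX) || defined(ENABLE_AVX_FMA) || defined(ENABLE_AVX2_FMA)
#include <immintrin.h>   // x86 SIMD intrinsics: __m128, _mm_loadu_ps, _mm_mul_ps, ...
#endif

#ifdef ENABLE_NEON
// NEON variants are only compiled (and benchmarked) when the switch is on.
void    MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] );
void    MultMatrixF_NEON_type2( float result[16], const float base[16], const float mult[16] );
#endif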

4x4 float Matrix Multiplication: SSE2 Version

MultMatrixF_SSE2
void    MultMatrixF_SSE2( float result[16], const float base[16], const float mult[16] )
{
    __m128  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    xmm4    = _mm_loadu_ps( &mult[0] );
    xmm5    = _mm_loadu_ps( &mult[4] );
    xmm6    = _mm_loadu_ps( &mult[8] );
    xmm7    = _mm_loadu_ps( &mult[12] );

    // column0
    xmm0    = _mm_load1_ps( &base[0] );
    xmm1    = _mm_load1_ps( &base[1] );
    xmm2    = _mm_load1_ps( &base[2] );
    xmm3    = _mm_load1_ps( &base[3] );

    xmm0    = _mm_mul_ps( xmm0, xmm4 );
    xmm1    = _mm_mul_ps( xmm1, xmm5 );
    xmm2    = _mm_mul_ps( xmm2, xmm6 );
    xmm3    = _mm_mul_ps( xmm3, xmm7 );

    xmm0    = _mm_add_ps( xmm0, xmm1 );
    xmm2    = _mm_add_ps( xmm2, xmm3 );
    xmm0    = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[0], xmm0 );

    // column1
    xmm0    = _mm_load1_ps( &base[4] );
    xmm1    = _mm_load1_ps( &base[5] );
    xmm2    = _mm_load1_ps( &base[6] );
    xmm3    = _mm_load1_ps( &base[7] );

    xmm0    = _mm_mul_ps( xmm0, xmm4 );
    xmm1    = _mm_mul_ps( xmm1, xmm5 );
    xmm2    = _mm_mul_ps( xmm2, xmm6 );
    xmm3    = _mm_mul_ps( xmm3, xmm7 );

    xmm0    = _mm_add_ps( xmm0, xmm1 );
    xmm2    = _mm_add_ps( xmm2, xmm3 );
    xmm0    = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[4], xmm0 );

    // column2
    xmm0    = _mm_load1_ps( &base[8] );
    xmm1    = _mm_load1_ps( &base[9] );
    xmm2    = _mm_load1_ps( &base[10] );
    xmm3    = _mm_load1_ps( &base[11] );

    xmm0    = _mm_mul_ps( xmm0, xmm4 );
    xmm1    = _mm_mul_ps( xmm1, xmm5 );
    xmm2    = _mm_mul_ps( xmm2, xmm6 );
    xmm3    = _mm_mul_ps( xmm3, xmm7 );

    xmm0    = _mm_add_ps( xmm0, xmm1 );
    xmm2    = _mm_add_ps( xmm2, xmm3 );
    xmm0    = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[8], xmm0 );

    // column3
    xmm0    = _mm_load1_ps( &base[12] );
    xmm1    = _mm_load1_ps( &base[13] );
    xmm2    = _mm_load1_ps( &base[14] );
    xmm3    = _mm_load1_ps( &base[15] );

    xmm0    = _mm_mul_ps( xmm0, xmm4 );
    xmm1    = _mm_mul_ps( xmm1, xmm5 );
    xmm2    = _mm_mul_ps( xmm2, xmm6 );
    xmm3    = _mm_mul_ps( xmm3, xmm7 );

    xmm0    = _mm_add_ps( xmm0, xmm1 );
    xmm2    = _mm_add_ps( xmm2, xmm3 );
    xmm0    = _mm_add_ps( xmm0, xmm2 );

    _mm_storeu_ps( &result[12], xmm0 );
}
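
For comparison, the same computation written as scalar code with the same memory layout: element [4*c + i] of the result is the sum over k of base[4*c + k] * mult[4*k + i], which is exactly what the broadcast-multiply-add pattern above evaluates four elements at a time. MultMatrixF_Ref is just an illustrative name, not a function from the repository.

// Scalar reference with the same memory layout as the SIMD versions.
// result[4*c + i] = sum over k of base[4*c + k] * mult[4*k + i]
void    MultMatrixF_Ref( float result[16], const float base[16], const float mult[16] )
{
    for ( int c = 0; c < 4; ++c )
    {
        for ( int i = 0; i < 4; ++i )
        {
            float sum = 0.0f;
            for ( int k = 0; k < 4; ++k )
            {
                sum += base[4 * c + k] * mult[4 * k + i];
            }
            result[4 * c + i] = sum;
        }
    }
}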

4x4 float Matrix Multiplication: NEON Version

A matrix multiplication written with almost the same instruction structure as the SSE2 float version.

MultMatrixF_NEON
void    MultMatrixF_NEON( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0, r1, r2, r3;

    c0  = vld1q_f32( &mult[0] );
    c1  = vld1q_f32( &mult[4] );
    c2  = vld1q_f32( &mult[8] );
    c3  = vld1q_f32( &mult[12] );

    // column 0
    r0  = vmulq_n_f32( c0, base[0] ); 
    r1  = vmulq_n_f32( c1, base[1] ); 
    r2  = vmulq_n_f32( c2, base[2] ); 
    r3  = vmulq_n_f32( c3, base[3] ); 

    r0  = vaddq_f32( r0, r1 );
    r2  = vaddq_f32( r2, r3 );
    r0  = vaddq_f32( r0, r2 );

    vst1q_f32( &result[0], r0 );

    // column 1
    r0  = vmulq_n_f32( c0, base[4] ); 
    r1  = vmulq_n_f32( c1, base[5] ); 
    r2  = vmulq_n_f32( c2, base[6] ); 
    r3  = vmulq_n_f32( c3, base[7] ); 

    r0  = vaddq_f32( r0, r1 );
    r2  = vaddq_f32( r2, r3 );
    r0  = vaddq_f32( r0, r2 );

    vst1q_f32( &result[4], r0 );

    // column 2
    r0  = vmulq_n_f32( c0, base[8] ); 
    r1  = vmulq_n_f32( c1, base[9] ); 
    r2  = vmulq_n_f32( c2, base[10] ); 
    r3  = vmulq_n_f32( c3, base[11] ); 

    r0  = vaddq_f32( r0, r1 );
    r2  = vaddq_f32( r2, r3 );
    r0  = vaddq_f32( r0, r2 );

    vst1q_f32( &result[8], r0 );

    // column 3
    r0  = vmulq_n_f32( c0, base[12] ); 
    r1  = vmulq_n_f32( c1, base[13] ); 
    r2  = vmulq_n_f32( c2, base[14] ); 
    r3  = vmulq_n_f32( c3, base[15] ); 

    r0  = vaddq_f32( r0, r1 );
    r2  = vaddq_f32( r2, r3 );
    r0  = vaddq_f32( r0, r2 );

    vst1q_f32( &result[12], r0 );
}

4x4 float Matrix Multiplication: NEON Version Type 2

A version that uses slightly different instructions: the separate multiply and add are folded into vmlaq_n_f32 (multiply-accumulate).
The line count goes down, but will it actually be faster?

MultMatrixF_NEON_type2
void    MultMatrixF_NEON_type2( float result[16], const float base[16], const float mult[16] )
{
    float32x4_t c0, c1, c2, c3;
    float32x4_t r0, r1, r2, r3;

    c0  = vld1q_f32( &mult[0] );
    c1  = vld1q_f32( &mult[4] );
    c2  = vld1q_f32( &mult[8] );
    c3  = vld1q_f32( &mult[12] );

    // column 0
    r0  = vmulq_n_f32(     c0, base[0] ); 
    r0  = vmlaq_n_f32( r0, c1, base[1] );
    r0  = vmlaq_n_f32( r0, c2, base[2] );
    r0  = vmlaq_n_f32( r0, c3, base[3] );

    vst1q_f32( &result[0], r0 );

    // column 1
    r0  = vmulq_n_f32(     c0, base[4] ); 
    r0  = vmlaq_n_f32( r0, c1, base[5] );
    r0  = vmlaq_n_f32( r0, c2, base[6] );
    r0  = vmlaq_n_f32( r0, c3, base[7] );

    vst1q_f32( &result[4], r0 );

    // column 2
    r0  = vmulq_n_f32(     c0, base[8] ); 
    r0  = vmlaq_n_f32( r0, c1, base[9] );
    r0  = vmlaq_n_f32( r0, c2, base[10] );
    r0  = vmlaq_n_f32( r0, c3, base[11] );

    vst1q_f32( &result[8], r0 );

    // column 3
    r0  = vmulq_n_f32(     c0, base[12] ); 
    r0  = vmlaq_n_f32( r0, c1, base[13] );
    r0  = vmlaq_n_f32( r0, c2, base[14] );
    r0  = vmlaq_n_f32( r0, c3, base[15] );

    vst1q_f32( &result[12], r0 );
}
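
A minimal usage sketch for checking the functions above: multiplying by the identity matrix must return the input unchanged. This test harness is my own illustration (assuming it is appended after the definitions above and compiled with ENABLE_NEON); it is not part of the original benchmark.

#include <cstdio>
#include <cmath>

int main()
{
    const float identity[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1 };
    float       m[16], r[16];

    for ( int i = 0; i < 16; ++i )
        m[i] = (float)( i + 1 );

    // base * identity should reproduce base exactly.
    MultMatrixF_NEON( r, m, identity );

    for ( int i = 0; i < 16; ++i )
    {
        if ( std::fabs( r[i] - m[i] ) > 1e-6f )
        {
            std::printf( "mismatch at element %d\n", i );
            return 1;
        }
    }
    std::printf( "OK\n" );
    return 0;
}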

Measurement Results

I measured on the environments I have at hand.
There are about six more machines I could measure, but for now, only the main environments.
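
How to read the Performance (MT) column: the values appear to be derived by counting one 4x4 multiplication as 64 multiplications + 48 additions = 112 flops, and one double-precision matrix copy as 16 × 8 = 128 bytes. For example, 9.0 M mul/s × 112 ≈ 1.008 Gflops and 55.8 M cpy/s × 128 B ≈ 7.14 GB/sec, consistent with the first table below.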

NanoPi-NEO

A board in the same family as the Raspberry Pi.

CPU : Allwinner H3 (Cortex-A7 1.2GHz x 4)
MEM : 512 MB
OS : Ubuntu 16.04 LTS 32bit (official ROM image, kernel=3.4.39)

Compile Command : g++ -O3 -pthread -std=c++11 -mfpu=neon PerfTest_Matrix.cpp

| Function name | Single Thread | Multi Thread | Performance (MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 2.3 M mul/s | 9.0 M mul/s | 1.006 Gflops | 396 % |
| MultMatrix_ForR<float> | 2.3 M mul/s | 9.2 M mul/s | 1.029 Gflops | 400 % |
| MultMatrix_ExpC<float> | 2.9 M mul/s | 11.4 M mul/s | 1.274 Gflops | 393 % |
| MultMatrix_ExpR<float> | 2.9 M mul/s | 11.6 M mul/s | 1.296 Gflops | 401 % |
| MultMatrixF_NEON | 5.8 M mul/s | 22.5 M mul/s | 2.518 Gflops | 385 % |
| MultMatrixF_NEON_type2 | 6.0 M mul/s | 23.0 M mul/s | 2.579 Gflops | 386 % |
| MultMatrix_ForC<double> | 1.7 M mul/s | 6.7 M mul/s | 0.749 Gflops | 403 % |
| MultMatrix_ForR<double> | 1.7 M mul/s | 6.7 M mul/s | 0.747 Gflops | 400 % |
| MultMatrix_ExpC<double> | 2.0 M mul/s | 8.1 M mul/s | 0.907 Gflops | 397 % |
| MultMatrix_ExpR<double> | 2.0 M mul/s | 8.1 M mul/s | 0.912 Gflops | 400 % |
| CalcInverse<float> | 1.9 M inv/s | 7.5 M inv/s | 1.842 Gflops | 399 % |
| CalcInverse<double> | 1.2 M inv/s | 4.6 M inv/s | 1.131 Gflops | 397 % |
| CopyMatrix_memcpy<double> | 13.6 M cpy/s | 55.8 M cpy/s | 7.140 GB/sec | 411 % |
| CopyMatrix_Expand<double> | 27.7 M cpy/s | 111.1 M cpy/s | 14.224 GB/sec | 401 % |
| CopyMatrixD_NEON | 21.0 M cpy/s | 83.9 M cpy/s | 10.738 GB/sec | 400 % |

NanoPi NEO2

A board in the same family as the Raspberry Pi.

CPU : Allwinner H5 (Cortex-A53 1.0 GHz * 4)
MEM : 512 MB
OS : Ubuntu 16.04 LTS 64bit (official ROM image, kernel=4.11.2)

Compile Command : g++ -std=c++11 -pthread -O3 PerfTest_Matrix.cpp

| Function name | Single Thread | Multi Thread | Performance (MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 5.8 M mul/s | 24.4 M mul/s | 2.729 Gflops | 421 % |
| MultMatrix_ForR<float> | 2.5 M mul/s | 10.1 M mul/s | 1.134 Gflops | 397 % |
| MultMatrix_ExpC<float> | 2.7 M mul/s | 11.2 M mul/s | 1.253 Gflops | 413 % |
| MultMatrix_ExpR<float> | 2.8 M mul/s | 11.2 M mul/s | 1.253 Gflops | 404 % |
| MultMatrixF_NEON | 10.2 M mul/s | 41.2 M mul/s | 4.619 Gflops | 406 % |
| MultMatrixF_NEON_type2 | 9.1 M mul/s | 36.7 M mul/s | 4.110 Gflops | 403 % |
| MultMatrix_ForC<double> | 2.6 M mul/s | 10.2 M mul/s | 1.144 Gflops | 397 % |
| MultMatrix_ForR<double> | 2.3 M mul/s | 9.3 M mul/s | 1.042 Gflops | 404 % |
| MultMatrix_ExpC<double> | 2.6 M mul/s | 10.2 M mul/s | 1.140 Gflops | 398 % |
| MultMatrix_ExpR<double> | 2.5 M mul/s | 10.2 M mul/s | 1.139 Gflops | 403 % |
| CalcInverse<float> | 2.1 M inv/s | 8.6 M inv/s | 2.115 Gflops | 403 % |
| CalcInverse<double> | 1.8 M inv/s | 7.5 M inv/s | 1.854 Gflops | 409 % |
| CopyMatrix_memcpy<double> | 20.1 M cpy/s | 79.8 M cpy/s | 10.209 GB/sec | 397 % |
| CopyMatrix_Expand<double> | 29.5 M cpy/s | 117.2 M cpy/s | 14.998 GB/sec | 397 % |
| CopyMatrixD_NEON | 23.3 M cpy/s | 92.7 M cpy/s | 11.869 GB/sec | 397 % |

NanoPi NEO4

A board in the same family as the Raspberry Pi.

CPU : RockChip RK3399 (Cortex-A72 1.8GHz x2 + Cortex-A53 1.4GHz x4)
MEM : 1024 MB
OS : Ubuntu 18.04 64bit (official ROM image, FriendlyDesktop)

| Function name | Single Thread | Multi Thread | Performance (MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 27.6 M mul/s | 92.9 M mul/s | 10.407 Gflops | 336 % |
| MultMatrix_ForR<float> | 18.4 M mul/s | 50.6 M mul/s | 5.671 Gflops | 275 % |
| MultMatrix_ExpC<float> | 16.0 M mul/s | 48.6 M mul/s | 5.442 Gflops | 304 % |
| MultMatrix_ExpR<float> | 15.8 M mul/s | 48.4 M mul/s | 5.416 Gflops | 306 % |
| MultMatrixF_NEON | 53.2 M mul/s | 169.8 M mul/s | 19.022 Gflops | 319 % |
| MultMatrixF_NEON_type2 | 39.7 M mul/s | 132.3 M mul/s | 14.815 Gflops | 333 % |
| MultMatrix_ForC<double> | 12.7 M mul/s | 40.6 M mul/s | 4.553 Gflops | 320 % |
| MultMatrix_ForR<double> | 16.9 M mul/s | 52.0 M mul/s | 5.819 Gflops | 307 % |
| MultMatrix_ExpC<double> | 14.9 M mul/s | 45.3 M mul/s | 5.068 Gflops | 303 % |
| MultMatrix_ExpR<double> | 15.1 M mul/s | 45.5 M mul/s | 5.091 Gflops | 302 % |
| CalcInverse<float> | 10.2 M inv/s | 39.5 M inv/s | 9.766 Gflops | 388 % |
| CalcInverse<double> | 4.7 M inv/s | 23.0 M inv/s | 5.683 Gflops | 492 % |
| CopyMatrix_memcpy<double> | 81.1 M cpy/s | 286.2 M cpy/s | 36.628 GB/sec | 353 % |
| CopyMatrix_Expand<double> | 104.4 M cpy/s | 320.5 M cpy/s | 41.025 GB/sec | 307 % |
| CopyMatrixD_NEON | 112.0 M cpy/s | 353.5 M cpy/s | 45.249 GB/sec | 316 % |

Intel Atom z3795

CPU : Intel Atom z3795 (2.4 GHz? x 4)
MEM : 4 GB
OS : Windows10 Professional (32bit)

Compile: Visual Studio 2015, Maximize Speed optimization

| Function name | Single Thread | Multi Thread | Performance (MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 7.4 M mul/s | 28.1 M mul/s | 3.143 Gflops | 378 % |
| MultMatrix_ForR<float> | 7.5 M mul/s | 28.3 M mul/s | 3.169 Gflops | 378 % |
| MultMatrix_ExpC<float> | 10.0 M mul/s | 37.4 M mul/s | 4.190 Gflops | 376 % |
| MultMatrix_ExpR<float> | 9.9 M mul/s | 37.4 M mul/s | 4.191 Gflops | 376 % |
| MultMatrixF_SSE2 | 26.8 M mul/s | 100.8 M mul/s | 11.292 Gflops | 377 % |
| MultMatrix_ForC<double> | 7.0 M mul/s | 26.4 M mul/s | 2.951 Gflops | 375 % |
| MultMatrix_ForR<double> | 7.1 M mul/s | 26.7 M mul/s | 2.988 Gflops | 375 % |
| MultMatrix_ExpC<double> | 8.8 M mul/s | 32.8 M mul/s | 3.677 Gflops | 374 % |
| MultMatrix_ExpR<double> | 8.8 M mul/s | 33.0 M mul/s | 3.700 Gflops | 376 % |
| MultMatrixD_SSE2 | 11.0 M mul/s | 40.9 M mul/s | 4.576 Gflops | 373 % |
| CalcInverse<float> | 3.5 M inv/s | 13.1 M inv/s | 3.230 Gflops | 378 % |
| CalcInverse<double> | 2.5 M inv/s | 9.5 M inv/s | 2.350 Gflops | 377 % |
| CopyMatrix_memcpy<double> | 24.9 M cpy/s | 86.8 M cpy/s | 11.110 GB/sec | 348 % |
| CopyMatrix_Expand<double> | 48.8 M cpy/s | 184.3 M cpy/s | 23.585 GB/sec | 378 % |
| CopyMatrixD_SSE2 | 87.8 M cpy/s | 316.7 M cpy/s | 40.540 GB/sec | 361 % |

Intel Core i7-6700K

CPU : Intel Core i7-6700K (stock clocks, Single=4.2GHz, Multi=4.0GHz, Logical=8 cores, Physical=4 cores)
MEM : 16GB (DDR4-3000 Dual Channel, overclocked)
OS : Windows10 Professional (64bit)

Compile: Visual Studio 2015, Maximize Speed optimization

| Function name | Single Thread | Multi Thread | Performance (MT) | MT/ST |
| --- | --- | --- | --- | --- |
| MultMatrix_ForC<float> | 55.5 M mul/s | 215.7 M mul/s | 24.160 Gflops | 389 % |
| MultMatrix_ForR<float> | 55.6 M mul/s | 214.4 M mul/s | 24.017 Gflops | 385 % |
| MultMatrix_ExpC<float> | 60.6 M mul/s | 232.6 M mul/s | 26.049 Gflops | 384 % |
| MultMatrix_ExpR<float> | 59.5 M mul/s | 232.4 M mul/s | 26.028 Gflops | 391 % |
| MultMatrixF_SSE2 | 207.2 M mul/s | 794.2 M mul/s | 88.955 Gflops | 383 % |
| MultMatrix_ForC<double> | 55.9 M mul/s | 213.0 M mul/s | 23.861 Gflops | 381 % |
| MultMatrix_ForR<double> | 55.8 M mul/s | 215.7 M mul/s | 24.163 Gflops | 387 % |
| MultMatrix_ExpC<double> | 60.4 M mul/s | 229.3 M mul/s | 25.687 Gflops | 380 % |
| MultMatrix_ExpR<double> | 60.4 M mul/s | 230.8 M mul/s | 25.854 Gflops | 382 % |
| MultMatrixD_SSE2 | 110.6 M mul/s | 422.0 M mul/s | 47.266 Gflops | 381 % |
| MultMatrixD_AVX | 220.9 M mul/s | 859.0 M mul/s | 96.203 Gflops | 389 % |
| MultMatrixD_AVX_FMA | 247.0 M mul/s | 959.3 M mul/s | 107.445 Gflops | 388 % |
| MultMatrixD_AVX_FMA_type2 | 266.1 M mul/s | 1025.5 M mul/s | 114.857 Gflops | 385 % |
| CalcInverse<float> | 30.7 M inv/s | 133.3 M inv/s | 32.928 Gflops | 434 % |
| CalcInverse<double> | 27.1 M inv/s | 130.0 M inv/s | 32.109 Gflops | 480 % |
| CalcInverseD_AVX2_FMA | 61.8 M inv/s | 262.6 M inv/s | 64.873 Gflops | 425 % |
| CopyMatrix_memcpy<double> | 461.7 M cpy/s | 1670.1 M cpy/s | 213.766 GB/sec | 362 % |
| CopyMatrix_Expand<double> | 240.3 M cpy/s | 923.4 M cpy/s | 118.198 GB/sec | 384 % |
| CopyMatrixD_SSE2 | 462.2 M cpy/s | 1775.9 M cpy/s | 227.311 GB/sec | 384 % |
| CopyMatrixD_AVX | 652.0 M cpy/s | 2290.2 M cpy/s | 293.147 GB/sec | 351 % |

Closing Remarks

A summary of the findings from these experiments:

  • On ARM, the g++ -O3 option can auto-vectorize code and sometimes yields a dramatic performance gain (see MultMatrix_ForC on the NanoPi NEO2).
  • However, there are also cases where vectorization does not work out and performance suffers (see MultMatrix_ForR on the NanoPi NEO2).
  • Hand-written NEON is, as expected, fast :)
  • For small data copies, direct assignment is often faster than memcpy (a sketch follows this list).
  • The Arm Cortex-A53 and the Atom z3795 appear to deliver roughly the same per-clock performance.
  • The performance gap between the i7-6700K and the Cortex-A53 is roughly 20x.
  • Inverse matrix computation (closed-form formula) does not get that much faster even with SIMD...
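
A minimal sketch of the kind of comparison behind the memcpy-versus-assignment observation. The function names here are illustrative only and are not necessarily identical to CopyMatrix_memcpy / CopyMatrix_Expand in the repository.

#include <cstring>

// Copy a 4x4 double matrix with memcpy.
void    CopyMatrix_memcpy_sketch( double dst[16], const double src[16] )
{
    memcpy( dst, src, sizeof(double) * 16 );
}

// Copy the same matrix with explicit assignments; for such a small,
// fixed-size block the compiler can keep everything in registers and
// vectorize it, which often beats a call into memcpy.
void    CopyMatrix_assign_sketch( double dst[16], const double src[16] )
{
    dst[0]  = src[0];   dst[1]  = src[1];   dst[2]  = src[2];   dst[3]  = src[3];
    dst[4]  = src[4];   dst[5]  = src[5];   dst[6]  = src[6];   dst[7]  = src[7];
    dst[8]  = src[8];   dst[9]  = src[9];   dst[10] = src[10];  dst[11] = src[11];
    dst[12] = src[12];  dst[13] = src[13];  dst[14] = src[14];  dst[15] = src[15];
}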