1
0

More than 3 years have passed since last update.

x86 SSE : _mm_div_ps (divps) の代替(除算命令を使わない除算)を試す

Last updated at Posted at 2021-09-01

近年(もう随分前になるのかな?)のプロセッサは除算をニュートン・ラフソン法で行っているらしいので、以前作った備忘録:除算命令を使わない除算を参考に divps の代替を作ってみます。

以下の環境

macOS Big Sur (11.5.2)
Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
gcc version 11.2.0 (GCC)

でコンパイルして実行します。

sample.c
#include <math.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <xmmintrin.h>

/*
 * DIVPS
 */
inline static __v4sf divps1(__v4sf a, __v4sf b)
{
    return _mm_div_ps(a, b);
}

/*
 * DIVPS の代替
 *
 *    x[1] = RCPPS(b)
 *    x[n+1] = x[n] * (2 - b * x[n])
 *
 */
inline static __v4sf divps2(__v4sf a, __v4sf b)
{
    __v4sf two = { 2, 2, 2, 2 };
    __v4sf x1 = _mm_rcp_ps(b);
    __v4sf x2 = x1 * (two - b * x1);
    return a * x2;
}

/*
 * 以下、動作確認用
 */
inline static uint64_t RDTSC()
{
#if defined(_MSC_VER) ||   \
    (defined(__clang__) && \
     (defined(__amd64) ||  \
      defined(__i386) ||   \
      defined(__x86) ||    \
      defined(__x86_64)))

    return __rdtsc();

#elif defined(__GNUC__) && \
    (defined(__amd64) ||   \
     defined(__i386) ||    \
     defined(__x86) ||     \
     defined(__x86_64))

    uint32_t eax, edx;
    __asm__ volatile ("RDTSC" : "=a" (eax), "=d" (edx));
    return ((uint64_t)edx << 32) | eax;

#else

    timespec c;
    clock_gettime(CLOCK_REALTIME, &c);
    return ((uint64_t)c.tv_sec * 1000 * 1000 * 1000 + c.tv_nsec);

#endif
}

static void print_tsc(uint64_t tsc, uint64_t cnt)
{
    uint64_t i = tsc / cnt;
    uint64_t f = (100 * (tsc % cnt) + (cnt / 2)) / cnt;

    printf("%" PRIu64 ".%02" PRIu64, i, f);
}

int main()
{
    __v4sf a = { 7, 7, 7, 7 };
    size_t i, fcount = (1 << 20);
    size_t vcount = fcount / 4;
    float *buffer = (float *)_mm_malloc(2 * fcount * sizeof(float), 16);
    __v4sf *p = (__v4sf *)buffer;
    __v4sf *q = (__v4sf *)(buffer + fcount);
    uint64_t tsc1, tsc2, tsc3;

    if (!buffer)
    {
        fprintf(stderr, "_mm_malloc 失敗\n");
        return 2;
    }
    for (i = 1; i <= fcount; i++)
    {
        buffer[i - 1] = (float)i;
        buffer[i - 1 + fcount] = (float)i;
    }

    tsc1 = RDTSC();

    for (i = 0; i < vcount; i += 8)
    {
        p[i + 0] = divps1(a, p[i + 0]);
        p[i + 1] = divps1(a, p[i + 1]);
        p[i + 2] = divps1(a, p[i + 2]);
        p[i + 3] = divps1(a, p[i + 3]);
        p[i + 4] = divps1(a, p[i + 4]);
        p[i + 5] = divps1(a, p[i + 5]);
        p[i + 6] = divps1(a, p[i + 6]);
        p[i + 7] = divps1(a, p[i + 7]);
    }

    tsc2 = RDTSC();

    for (i = 0; i < vcount; i += 8)
    {
        q[i + 0] = divps2(a, q[i + 0]);
        q[i + 1] = divps2(a, q[i + 1]);
        q[i + 2] = divps2(a, q[i + 2]);
        q[i + 3] = divps2(a, q[i + 3]);
        q[i + 4] = divps2(a, q[i + 4]);
        q[i + 5] = divps2(a, q[i + 5]);
        q[i + 6] = divps2(a, q[i + 6]);
        q[i + 7] = divps2(a, q[i + 7]);
    }

    tsc3 = RDTSC();

    _mm_free(buffer);

    printf("divps1=");
    print_tsc(tsc2 - tsc1, vcount);
    printf(", divps2=");
    print_tsc(tsc3 - tsc2, vcount);
    printf("\n");

    for (i = 0; i < 10; i++)
    {
        printf("%2d: (%f,%f,%f,%f)  (%f,%f,%f,%f)\n",
               (int)i,
               p[i][0], p[i][1], p[i][2], p[i][3],
               q[i][0], q[i][1], q[i][2], q[i][3]);
    }

    return 0;
}

実行結果

$ gcc -O3 -W -Wall sample.c && ./a.out
divps1=2.99, divps2=3.61
 0: (7.000000,3.500000,2.333333,1.750000)  (7.000000,3.500000,2.333333,1.750000)
 1: (1.400000,1.166667,1.000000,0.875000)  (1.400000,1.166667,1.000000,0.875000)
 2: (0.777778,0.700000,0.636364,0.583333)  (0.777778,0.700000,0.636364,0.583333)
 3: (0.538462,0.500000,0.466667,0.437500)  (0.538462,0.500000,0.466667,0.437500)
 4: (0.411765,0.388889,0.368421,0.350000)  (0.411765,0.388889,0.368421,0.350000)
 5: (0.333333,0.318182,0.304348,0.291667)  (0.333333,0.318182,0.304348,0.291667)
 6: (0.280000,0.269231,0.259259,0.250000)  (0.280000,0.269231,0.259259,0.250000)
 7: (0.241379,0.233333,0.225806,0.218750)  (0.241379,0.233333,0.225806,0.218750)
 8: (0.212121,0.205882,0.200000,0.194444)  (0.212121,0.205882,0.200000,0.194444)
 9: (0.189189,0.184211,0.179487,0.175000)  (0.189189,0.184211,0.179487,0.175000)

divps1 と divps2 の数値は、概ね __v4sf での割り算一回分(メモリ・アクセスを含む)の CPU サイクル数のハズです。アセンブリ出力を見ると、divps2 の呼び出し部は、divps1 の呼び出し部より2倍くらい長いのですが速度的には大差がありません。

アセンブリ出力(抜粋)
divps1呼び出し部
L5:
    movaps  %xmm0, %xmm1
    subq    $-128, %rax
    divps   -128(%rax), %xmm1
    movaps  %xmm1, -128(%rax)
    movaps  %xmm0, %xmm1
    divps   -112(%rax), %xmm1
    movaps  %xmm1, -112(%rax)
    movaps  %xmm0, %xmm1
    divps   -96(%rax), %xmm1
    movaps  %xmm1, -96(%rax)
    movaps  %xmm0, %xmm1
    divps   -80(%rax), %xmm1
    movaps  %xmm1, -80(%rax)
    movaps  %xmm0, %xmm1
    divps   -64(%rax), %xmm1
    movaps  %xmm1, -64(%rax)
    movaps  %xmm0, %xmm1
    divps   -48(%rax), %xmm1
    movaps  %xmm1, -48(%rax)
    movaps  %xmm0, %xmm1
    divps   -32(%rax), %xmm1
    movaps  %xmm1, -32(%rax)
    movaps  %xmm0, %xmm1
    divps   -16(%rax), %xmm1
    movaps  %xmm1, -16(%rax)
    cmpq    %rax, %rbx
    jne L5
divps2呼び出し部
L6:
    movaps  (%rax), %xmm2
    subq    $-128, %rax
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -128(%rax)
    movaps  -112(%rax), %xmm2
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -112(%rax)
    movaps  -96(%rax), %xmm2
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -96(%rax)
    movaps  -80(%rax), %xmm2
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -80(%rax)
    movaps  -64(%rax), %xmm2
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -64(%rax)
    movaps  -48(%rax), %xmm2
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -48(%rax)
    movaps  -32(%rax), %xmm2
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -32(%rax)
    movaps  -16(%rax), %xmm2
    rcpps   %xmm2, %xmm4
    movaps  %xmm2, %xmm3
    movaps  %xmm1, %xmm2
    mulps   %xmm4, %xmm3
    subps   %xmm3, %xmm2
    mulps   %xmm4, %xmm2
    mulps   %xmm0, %xmm2
    movaps  %xmm2, -16(%rax)
    cmpq    %rax, %rdx
    jne L6


逆数なら、a を掛ける必要がなくなったり、演算精度を落としてもいいのなら n2 の計算を省ぶことも考えられます。すると、divps を使うより良くなる余地が出てきます。ということで rcpps/rcpss が用意されているようです。(rsqrtps/rsqrtss も同様)

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0