More than 3 years have passed since last update.

x86 SSE : _mm_div_ps (divps) の代替（除算命令を使わない除算）を試す

Last updated at 2021-09-03Posted at 2021-09-01

近年(もう随分前になるのかな?)のプロセッサは除算をニュートン・ラフソン法で行っているらしいので、以前作った備忘録：除算命令を使わない除算を参考に divps の代替を作ってみます。

以下の環境

macOS Big Sur (11.5.2)
Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
gcc version 11.2.0 (GCC)

でコンパイルして実行します。

sample.c

# include <math.h>
# include <inttypes.h>
# include <stdio.h>
# include <stdint.h>
# include <string.h>
# include <time.h>
# include <xmmintrin.h>

/*
 * DIVPS
 */
inline static __v4sf divps1(__v4sf a, __v4sf b)
{
    return _mm_div_ps(a, b);
}

/*
 * DIVPS の代替
 *
 *    x[1] = RCPPS(b)
 *    x[n+1] = x[n] * (2 - b * x[n])
 *
 */
inline static __v4sf divps2(__v4sf a, __v4sf b)
{
    __v4sf two = { 2, 2, 2, 2 };
    __v4sf x1 = _mm_rcp_ps(b);
    __v4sf x2 = x1 * (two - b * x1);
    return a * x2;
}

/*
 * 以下、動作確認用
 */
inline static uint64_t RDTSC()
{
# if defined(_MSC_VER) ||   \
    (defined(__clang__) && \
     (defined(__amd64) ||  \
      defined(__i386) ||   \
      defined(__x86) ||    \
      defined(__x86_64)))

    return __rdtsc();

# elif defined(__GNUC__) && \
    (defined(__amd64) ||   \
     defined(__i386) ||    \
     defined(__x86) ||     \
     defined(__x86_64))

    uint32_t eax, edx;
    __asm__ volatile ("RDTSC" : "=a" (eax), "=d" (edx));
    return ((uint64_t)edx << 32) | eax;

# else

    timespec c;
    clock_gettime(CLOCK_REALTIME, &c);
    return ((uint64_t)c.tv_sec * 1000 * 1000 * 1000 + c.tv_nsec);

# endif
}

static void print_tsc(uint64_t tsc, uint64_t cnt)
{
    uint64_t i = tsc / cnt;
    uint64_t f = (100 * (tsc % cnt) + (cnt / 2)) / cnt;

    printf("%" PRIu64 ".%02" PRIu64, i, f);
}

int main()
{
    __v4sf a = { 7, 7, 7, 7 };
    size_t i, fcount = (1 << 20);
    size_t vcount = fcount / 4;
    float *buffer = (float *)_mm_malloc(2 * fcount * sizeof(float), 16);
    __v4sf *p = (__v4sf *)buffer;
    __v4sf *q = (__v4sf *)(buffer + fcount);
    uint64_t tsc1, tsc2, tsc3;

    if (!buffer)
    {
        fprintf(stderr, "_mm_malloc 失敗\n");
        return 2;
    }
    for (i = 1; i <= fcount; i++)
    {
        buffer[i - 1] = (float)i;
        buffer[i - 1 + fcount] = (float)i;
    }

    tsc1 = RDTSC();

    for (i = 0; i < vcount; i += 8)
    {
        p[i + 0] = divps1(a, p[i + 0]);
        p[i + 1] = divps1(a, p[i + 1]);
        p[i + 2] = divps1(a, p[i + 2]);
        p[i + 3] = divps1(a, p[i + 3]);
        p[i + 4] = divps1(a, p[i + 4]);
        p[i + 5] = divps1(a, p[i + 5]);
        p[i + 6] = divps1(a, p[i + 6]);
        p[i + 7] = divps1(a, p[i + 7]);
    }

    tsc2 = RDTSC();

    for (i = 0; i < vcount; i += 8)
    {
        q[i + 0] = divps2(a, q[i + 0]);
        q[i + 1] = divps2(a, q[i + 1]);
        q[i + 2] = divps2(a, q[i + 2]);
        q[i + 3] = divps2(a, q[i + 3]);
        q[i + 4] = divps2(a, q[i + 4]);
        q[i + 5] = divps2(a, q[i + 5]);
        q[i + 6] = divps2(a, q[i + 6]);
        q[i + 7] = divps2(a, q[i + 7]);
    }

    tsc3 = RDTSC();

    _mm_free(buffer);

    printf("divps1=");
    print_tsc(tsc2 - tsc1, vcount);
    printf(", divps2=");
    print_tsc(tsc3 - tsc2, vcount);
    printf("\n");

    for (i = 0; i < 10; i++)
    {
        printf("%2d: (%f,%f,%f,%f)  (%f,%f,%f,%f)\n",
               (int)i,
               p[i][0], p[i][1], p[i][2], p[i][3],
               q[i][0], q[i][1], q[i][2], q[i][3]);
    }

    return 0;
}

実行結果

$ gcc -O3 -W -Wall sample.c && ./a.out
divps1=2.99, divps2=3.61
 0: (7.000000,3.500000,2.333333,1.750000)  (7.000000,3.500000,2.333333,1.750000)
 1: (1.400000,1.166667,1.000000,0.875000)  (1.400000,1.166667,1.000000,0.875000)
 2: (0.777778,0.700000,0.636364,0.583333)  (0.777778,0.700000,0.636364,0.583333)
 3: (0.538462,0.500000,0.466667,0.437500)  (0.538462,0.500000,0.466667,0.437500)
 4: (0.411765,0.388889,0.368421,0.350000)  (0.411765,0.388889,0.368421,0.350000)
 5: (0.333333,0.318182,0.304348,0.291667)  (0.333333,0.318182,0.304348,0.291667)
 6: (0.280000,0.269231,0.259259,0.250000)  (0.280000,0.269231,0.259259,0.250000)
 7: (0.241379,0.233333,0.225806,0.218750)  (0.241379,0.233333,0.225806,0.218750)
 8: (0.212121,0.205882,0.200000,0.194444)  (0.212121,0.205882,0.200000,0.194444)
 9: (0.189189,0.184211,0.179487,0.175000)  (0.189189,0.184211,0.179487,0.175000)

divps1 と divps2 の数値は、概ね __v4sf での割り算一回分（メモリ・アクセスを含む）の CPU サイクル数のハズです。アセンブリ出力を見ると、divps2 の呼び出し部は、divps1 の呼び出し部より２倍くらい長いのですが速度的には大差がありません。

アセンブリ出力（抜粋）

divps1呼び出し部

L5:
	movaps	%xmm0, %xmm1
	subq	$-128, %rax
	divps	-128(%rax), %xmm1
	movaps	%xmm1, -128(%rax)
	movaps	%xmm0, %xmm1
	divps	-112(%rax), %xmm1
	movaps	%xmm1, -112(%rax)
	movaps	%xmm0, %xmm1
	divps	-96(%rax), %xmm1
	movaps	%xmm1, -96(%rax)
	movaps	%xmm0, %xmm1
	divps	-80(%rax), %xmm1
	movaps	%xmm1, -80(%rax)
	movaps	%xmm0, %xmm1
	divps	-64(%rax), %xmm1
	movaps	%xmm1, -64(%rax)
	movaps	%xmm0, %xmm1
	divps	-48(%rax), %xmm1
	movaps	%xmm1, -48(%rax)
	movaps	%xmm0, %xmm1
	divps	-32(%rax), %xmm1
	movaps	%xmm1, -32(%rax)
	movaps	%xmm0, %xmm1
	divps	-16(%rax), %xmm1
	movaps	%xmm1, -16(%rax)
	cmpq	%rax, %rbx
	jne	L5

divps2呼び出し部

L6:
	movaps	(%rax), %xmm2
	subq	$-128, %rax
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -128(%rax)
	movaps	-112(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -112(%rax)
	movaps	-96(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -96(%rax)
	movaps	-80(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -80(%rax)
	movaps	-64(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -64(%rax)
	movaps	-48(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -48(%rax)
	movaps	-32(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -32(%rax)
	movaps	-16(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -16(%rax)
	cmpq	%rax, %rdx
	jne	L6

逆数なら、a を掛ける必要がなくなったり、演算精度を落としてもいいのなら n2 の計算を省ぶことも考えられます。すると、divps を使うより良くなる余地が出てきます。ということで rcpps/rcpss が用意されているようです。(rsqrtps/rsqrtss も同様)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up