1
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

x86 SSE : _mm_div_ps (divps) の代替(除算命令を使わない除算)を試す

Last updated at Posted at 2021-09-01

近年(もう随分前になるのかな?)のプロセッサは除算をニュートン・ラフソン法で行っているらしいので、以前作った備忘録:除算命令を使わない除算を参考に divps の代替を作ってみます。

以下の環境

macOS Big Sur (11.5.2)
Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
gcc version 11.2.0 (GCC)

でコンパイルして実行します。

sample.c
#include <math.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <xmmintrin.h>

/*
 * DIVPS
 */
inline static __v4sf divps1(__v4sf a, __v4sf b)
{
    return _mm_div_ps(a, b);
}

/*
 * DIVPS の代替
 *
 *    x[1] = RCPPS(b)
 *    x[n+1] = x[n] * (2 - b * x[n])
 *
 */
inline static __v4sf divps2(__v4sf a, __v4sf b)
{
    __v4sf two = { 2, 2, 2, 2 };
    __v4sf x1 = _mm_rcp_ps(b);
    __v4sf x2 = x1 * (two - b * x1);
    return a * x2;
}

/*
 * 以下、動作確認用
 */
inline static uint64_t RDTSC()
{
#if defined(_MSC_VER) ||   \
    (defined(__clang__) && \
     (defined(__amd64) ||  \
      defined(__i386) ||   \
      defined(__x86) ||    \
      defined(__x86_64)))

    return __rdtsc();

#elif defined(__GNUC__) && \
    (defined(__amd64) ||   \
     defined(__i386) ||    \
     defined(__x86) ||     \
     defined(__x86_64))

    uint32_t eax, edx;
    __asm__ volatile ("RDTSC" : "=a" (eax), "=d" (edx));
    return ((uint64_t)edx << 32) | eax;

#else

    timespec c;
    clock_gettime(CLOCK_REALTIME, &c);
    return ((uint64_t)c.tv_sec * 1000 * 1000 * 1000 + c.tv_nsec);

#endif
}

static void print_tsc(uint64_t tsc, uint64_t cnt)
{
    uint64_t i = tsc / cnt;
    uint64_t f = (100 * (tsc % cnt) + (cnt / 2)) / cnt;

    printf("%" PRIu64 ".%02" PRIu64, i, f);
}

int main()
{
    __v4sf a = { 7, 7, 7, 7 };
    size_t i, fcount = (1 << 20);
    size_t vcount = fcount / 4;
    float *buffer = (float *)_mm_malloc(2 * fcount * sizeof(float), 16);
    __v4sf *p = (__v4sf *)buffer;
    __v4sf *q = (__v4sf *)(buffer + fcount);
    uint64_t tsc1, tsc2, tsc3;

    if (!buffer)
    {
        fprintf(stderr, "_mm_malloc 失敗\n");
        return 2;
    }
    for (i = 1; i <= fcount; i++)
    {
        buffer[i - 1] = (float)i;
        buffer[i - 1 + fcount] = (float)i;
    }

    tsc1 = RDTSC();

    for (i = 0; i < vcount; i += 8)
    {
        p[i + 0] = divps1(a, p[i + 0]);
        p[i + 1] = divps1(a, p[i + 1]);
        p[i + 2] = divps1(a, p[i + 2]);
        p[i + 3] = divps1(a, p[i + 3]);
        p[i + 4] = divps1(a, p[i + 4]);
        p[i + 5] = divps1(a, p[i + 5]);
        p[i + 6] = divps1(a, p[i + 6]);
        p[i + 7] = divps1(a, p[i + 7]);
    }

    tsc2 = RDTSC();

    for (i = 0; i < vcount; i += 8)
    {
        q[i + 0] = divps2(a, q[i + 0]);
        q[i + 1] = divps2(a, q[i + 1]);
        q[i + 2] = divps2(a, q[i + 2]);
        q[i + 3] = divps2(a, q[i + 3]);
        q[i + 4] = divps2(a, q[i + 4]);
        q[i + 5] = divps2(a, q[i + 5]);
        q[i + 6] = divps2(a, q[i + 6]);
        q[i + 7] = divps2(a, q[i + 7]);
    }

    tsc3 = RDTSC();

    _mm_free(buffer);

    printf("divps1=");
    print_tsc(tsc2 - tsc1, vcount);
    printf(", divps2=");
    print_tsc(tsc3 - tsc2, vcount);
    printf("\n");

    for (i = 0; i < 10; i++)
    {
        printf("%2d: (%f,%f,%f,%f)  (%f,%f,%f,%f)\n",
               (int)i,
               p[i][0], p[i][1], p[i][2], p[i][3],
               q[i][0], q[i][1], q[i][2], q[i][3]);
    }

    return 0;
}

実行結果

$ gcc -O3 -W -Wall sample.c && ./a.out
divps1=2.99, divps2=3.61
 0: (7.000000,3.500000,2.333333,1.750000)  (7.000000,3.500000,2.333333,1.750000)
 1: (1.400000,1.166667,1.000000,0.875000)  (1.400000,1.166667,1.000000,0.875000)
 2: (0.777778,0.700000,0.636364,0.583333)  (0.777778,0.700000,0.636364,0.583333)
 3: (0.538462,0.500000,0.466667,0.437500)  (0.538462,0.500000,0.466667,0.437500)
 4: (0.411765,0.388889,0.368421,0.350000)  (0.411765,0.388889,0.368421,0.350000)
 5: (0.333333,0.318182,0.304348,0.291667)  (0.333333,0.318182,0.304348,0.291667)
 6: (0.280000,0.269231,0.259259,0.250000)  (0.280000,0.269231,0.259259,0.250000)
 7: (0.241379,0.233333,0.225806,0.218750)  (0.241379,0.233333,0.225806,0.218750)
 8: (0.212121,0.205882,0.200000,0.194444)  (0.212121,0.205882,0.200000,0.194444)
 9: (0.189189,0.184211,0.179487,0.175000)  (0.189189,0.184211,0.179487,0.175000)

divps1 と divps2 の数値は、概ね __v4sf での割り算一回分(メモリ・アクセスを含む)の CPU サイクル数のハズです。アセンブリ出力を見ると、divps2 の呼び出し部は、divps1 の呼び出し部より2倍くらい長いのですが速度的には大差がありません。

アセンブリ出力(抜粋)
divps1呼び出し部
L5:
	movaps	%xmm0, %xmm1
	subq	$-128, %rax
	divps	-128(%rax), %xmm1
	movaps	%xmm1, -128(%rax)
	movaps	%xmm0, %xmm1
	divps	-112(%rax), %xmm1
	movaps	%xmm1, -112(%rax)
	movaps	%xmm0, %xmm1
	divps	-96(%rax), %xmm1
	movaps	%xmm1, -96(%rax)
	movaps	%xmm0, %xmm1
	divps	-80(%rax), %xmm1
	movaps	%xmm1, -80(%rax)
	movaps	%xmm0, %xmm1
	divps	-64(%rax), %xmm1
	movaps	%xmm1, -64(%rax)
	movaps	%xmm0, %xmm1
	divps	-48(%rax), %xmm1
	movaps	%xmm1, -48(%rax)
	movaps	%xmm0, %xmm1
	divps	-32(%rax), %xmm1
	movaps	%xmm1, -32(%rax)
	movaps	%xmm0, %xmm1
	divps	-16(%rax), %xmm1
	movaps	%xmm1, -16(%rax)
	cmpq	%rax, %rbx
	jne	L5
divps2呼び出し部
L6:
	movaps	(%rax), %xmm2
	subq	$-128, %rax
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -128(%rax)
	movaps	-112(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -112(%rax)
	movaps	-96(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -96(%rax)
	movaps	-80(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -80(%rax)
	movaps	-64(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -64(%rax)
	movaps	-48(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -48(%rax)
	movaps	-32(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -32(%rax)
	movaps	-16(%rax), %xmm2
	rcpps	%xmm2, %xmm4
	movaps	%xmm2, %xmm3
	movaps	%xmm1, %xmm2
	mulps	%xmm4, %xmm3
	subps	%xmm3, %xmm2
	mulps	%xmm4, %xmm2
	mulps	%xmm0, %xmm2
	movaps	%xmm2, -16(%rax)
	cmpq	%rax, %rdx
	jne	L6

逆数なら、a を掛ける必要がなくなったり、演算精度を落としてもいいのなら n2 の計算を省ぶことも考えられます。すると、divps を使うより良くなる余地が出てきます。ということで rcpps/rcpss が用意されているようです。(rsqrtps/rsqrtss も同様)

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?