近年(もう随分前になるのかな?)のプロセッサは除算をニュートン・ラフソン法で行っているらしいので、以前作った備忘録:除算命令を使わない除算を参考に divps の代替を作ってみます。
以下の環境
macOS Big Sur (11.5.2)
Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
gcc version 11.2.0 (GCC)
でコンパイルして実行します。
sample.c
#include <math.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <xmmintrin.h>
/*
* DIVPS
*/
inline static __v4sf divps1(__v4sf a, __v4sf b)
{
return _mm_div_ps(a, b);
}
/*
* DIVPS の代替
*
* x[1] = RCPPS(b)
* x[n+1] = x[n] * (2 - b * x[n])
*
*/
inline static __v4sf divps2(__v4sf a, __v4sf b)
{
__v4sf two = { 2, 2, 2, 2 };
__v4sf x1 = _mm_rcp_ps(b);
__v4sf x2 = x1 * (two - b * x1);
return a * x2;
}
/*
* 以下、動作確認用
*/
inline static uint64_t RDTSC()
{
#if defined(_MSC_VER) || \
(defined(__clang__) && \
(defined(__amd64) || \
defined(__i386) || \
defined(__x86) || \
defined(__x86_64)))
return __rdtsc();
#elif defined(__GNUC__) && \
(defined(__amd64) || \
defined(__i386) || \
defined(__x86) || \
defined(__x86_64))
uint32_t eax, edx;
__asm__ volatile ("RDTSC" : "=a" (eax), "=d" (edx));
return ((uint64_t)edx << 32) | eax;
#else
timespec c;
clock_gettime(CLOCK_REALTIME, &c);
return ((uint64_t)c.tv_sec * 1000 * 1000 * 1000 + c.tv_nsec);
#endif
}
static void print_tsc(uint64_t tsc, uint64_t cnt)
{
uint64_t i = tsc / cnt;
uint64_t f = (100 * (tsc % cnt) + (cnt / 2)) / cnt;
printf("%" PRIu64 ".%02" PRIu64, i, f);
}
int main()
{
__v4sf a = { 7, 7, 7, 7 };
size_t i, fcount = (1 << 20);
size_t vcount = fcount / 4;
float *buffer = (float *)_mm_malloc(2 * fcount * sizeof(float), 16);
__v4sf *p = (__v4sf *)buffer;
__v4sf *q = (__v4sf *)(buffer + fcount);
uint64_t tsc1, tsc2, tsc3;
if (!buffer)
{
fprintf(stderr, "_mm_malloc 失敗\n");
return 2;
}
for (i = 1; i <= fcount; i++)
{
buffer[i - 1] = (float)i;
buffer[i - 1 + fcount] = (float)i;
}
tsc1 = RDTSC();
for (i = 0; i < vcount; i += 8)
{
p[i + 0] = divps1(a, p[i + 0]);
p[i + 1] = divps1(a, p[i + 1]);
p[i + 2] = divps1(a, p[i + 2]);
p[i + 3] = divps1(a, p[i + 3]);
p[i + 4] = divps1(a, p[i + 4]);
p[i + 5] = divps1(a, p[i + 5]);
p[i + 6] = divps1(a, p[i + 6]);
p[i + 7] = divps1(a, p[i + 7]);
}
tsc2 = RDTSC();
for (i = 0; i < vcount; i += 8)
{
q[i + 0] = divps2(a, q[i + 0]);
q[i + 1] = divps2(a, q[i + 1]);
q[i + 2] = divps2(a, q[i + 2]);
q[i + 3] = divps2(a, q[i + 3]);
q[i + 4] = divps2(a, q[i + 4]);
q[i + 5] = divps2(a, q[i + 5]);
q[i + 6] = divps2(a, q[i + 6]);
q[i + 7] = divps2(a, q[i + 7]);
}
tsc3 = RDTSC();
_mm_free(buffer);
printf("divps1=");
print_tsc(tsc2 - tsc1, vcount);
printf(", divps2=");
print_tsc(tsc3 - tsc2, vcount);
printf("\n");
for (i = 0; i < 10; i++)
{
printf("%2d: (%f,%f,%f,%f) (%f,%f,%f,%f)\n",
(int)i,
p[i][0], p[i][1], p[i][2], p[i][3],
q[i][0], q[i][1], q[i][2], q[i][3]);
}
return 0;
}
実行結果
$ gcc -O3 -W -Wall sample.c && ./a.out
divps1=2.99, divps2=3.61
0: (7.000000,3.500000,2.333333,1.750000) (7.000000,3.500000,2.333333,1.750000)
1: (1.400000,1.166667,1.000000,0.875000) (1.400000,1.166667,1.000000,0.875000)
2: (0.777778,0.700000,0.636364,0.583333) (0.777778,0.700000,0.636364,0.583333)
3: (0.538462,0.500000,0.466667,0.437500) (0.538462,0.500000,0.466667,0.437500)
4: (0.411765,0.388889,0.368421,0.350000) (0.411765,0.388889,0.368421,0.350000)
5: (0.333333,0.318182,0.304348,0.291667) (0.333333,0.318182,0.304348,0.291667)
6: (0.280000,0.269231,0.259259,0.250000) (0.280000,0.269231,0.259259,0.250000)
7: (0.241379,0.233333,0.225806,0.218750) (0.241379,0.233333,0.225806,0.218750)
8: (0.212121,0.205882,0.200000,0.194444) (0.212121,0.205882,0.200000,0.194444)
9: (0.189189,0.184211,0.179487,0.175000) (0.189189,0.184211,0.179487,0.175000)
divps1 と divps2 の数値は、概ね __v4sf での割り算一回分(メモリ・アクセスを含む)の CPU サイクル数のハズです。アセンブリ出力を見ると、divps2 の呼び出し部は、divps1 の呼び出し部より2倍くらい長いのですが速度的には大差がありません。
アセンブリ出力(抜粋)
divps1呼び出し部
L5:
movaps %xmm0, %xmm1
subq $-128, %rax
divps -128(%rax), %xmm1
movaps %xmm1, -128(%rax)
movaps %xmm0, %xmm1
divps -112(%rax), %xmm1
movaps %xmm1, -112(%rax)
movaps %xmm0, %xmm1
divps -96(%rax), %xmm1
movaps %xmm1, -96(%rax)
movaps %xmm0, %xmm1
divps -80(%rax), %xmm1
movaps %xmm1, -80(%rax)
movaps %xmm0, %xmm1
divps -64(%rax), %xmm1
movaps %xmm1, -64(%rax)
movaps %xmm0, %xmm1
divps -48(%rax), %xmm1
movaps %xmm1, -48(%rax)
movaps %xmm0, %xmm1
divps -32(%rax), %xmm1
movaps %xmm1, -32(%rax)
movaps %xmm0, %xmm1
divps -16(%rax), %xmm1
movaps %xmm1, -16(%rax)
cmpq %rax, %rbx
jne L5
divps2呼び出し部
L6:
movaps (%rax), %xmm2
subq $-128, %rax
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -128(%rax)
movaps -112(%rax), %xmm2
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -112(%rax)
movaps -96(%rax), %xmm2
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -96(%rax)
movaps -80(%rax), %xmm2
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -80(%rax)
movaps -64(%rax), %xmm2
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -64(%rax)
movaps -48(%rax), %xmm2
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -48(%rax)
movaps -32(%rax), %xmm2
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -32(%rax)
movaps -16(%rax), %xmm2
rcpps %xmm2, %xmm4
movaps %xmm2, %xmm3
movaps %xmm1, %xmm2
mulps %xmm4, %xmm3
subps %xmm3, %xmm2
mulps %xmm4, %xmm2
mulps %xmm0, %xmm2
movaps %xmm2, -16(%rax)
cmpq %rax, %rdx
jne L6
逆数なら、a を掛ける必要がなくなったり、演算精度を落としてもいいのなら n2 の計算を省ぶことも考えられます。すると、divps を使うより良くなる余地が出てきます。ということで rcpps/rcpss が用意されているようです。(rsqrtps/rsqrtss も同様)