概要
paizaで、C言語のインラインアセンブラを試してみた。
サンプルコードを見つけたので、実行してみた。
参考にしたページ
サンプルコード
#include <time.h>
#include <sys/time.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
/* Assumed CPU clock frequency in GHz. This is a hard-coded constant, not a
 * measured value; print_GFLOPS uses it to turn cycle counts into seconds and
 * flops/clock into GFLOPS. */
#define GHz 2.00
/**
 * Read the CPU time-stamp counter with the x86 RDTSC instruction.
 *
 * RDTSC delivers the 64-bit counter in two halves: the high 32 bits in EDX
 * and the low 32 bits in EAX (on both 32-bit and 64-bit x86). The halves are
 * bound straight to C variables through the "a" and "d" register constraints
 * and merged in C, so a single code path serves both word sizes and no
 * hand-written shift/or sequence or clobber list is required.
 *
 * The previous 64-bit path loaded the constant 32 into RDX instead of
 * shifting RDX left by 32, which threw away the high half of the counter.
 *
 * Note: RDTSC is not a serializing instruction, so surrounding instructions
 * may still be in flight when it executes; that is acceptable for the coarse
 * loop timing done here.
 *
 * @return the current 64-bit time-stamp counter value.
 */
static inline uint64_t rdtsc(void) {
    uint32_t lo;
    uint32_t hi;

    __asm__ volatile ("rdtsc" : "=a"(lo), "=d"(hi));

    return ((uint64_t) hi << 32) | lo;
}
/**
 * Print a floating-point throughput summary for a timed run.
 *
 * Reports flops per clock, the GFLOPS figure obtained by scaling with the
 * GHz constant, and the elapsed time in seconds derived from the same
 * constant.
 *
 * @param flops  number of floating-point operations performed.
 * @param cycles number of clock cycles the run took.
 */
void print_GFLOPS(double flops, uint64_t cycles) {
    const double clocks = (double) cycles;
    const double gflops = flops * GHz / clocks;
    const double seconds = clocks * 1e-9 / GHz;
    const double flops_per_clock = flops / clocks;

    printf("GFLOPS @ %.2fGHz:\n %.3f [flops/clock] = %.3f [GFLOPS] (%.0f flops in %" PRIu64 " clock = %f sec)\n",
           GHz, flops_per_clock, gflops, flops, cycles, seconds);
}
/**
 * Print how many instructions were retired per clock cycle.
 *
 * @param instructions number of instructions counted for the run.
 * @param cycles       number of clock cycles the run took.
 */
void print_throughput(uint64_t instructions, uint64_t cycles) {
    const double per_clock = (double) instructions / (double) cycles;

    printf("Throughput:\n %.3f [instructions/clock] (%" PRIu64 " instrucions in %" PRIu64 " clock)\n",
           per_clock, instructions, cycles);
}
/*
 * Zero SSE registers xmm0..xmm7 by xor-ing each register with itself.
 *
 * Written as extended asm (note the doubled '%' on register names) so the
 * eight registers can be named in the clobber list. Without that list the
 * compiler is not told the registers are overwritten and may keep a live
 * floating-point value in one of them across the statement.
 *
 * Registers xmm8 and above are not touched by this macro.
 */
#define zero_all_xmm() \
    do { \
        __asm__ volatile ( \
            "xorps %%xmm0, %%xmm0\n\t" \
            "xorps %%xmm1, %%xmm1\n\t" \
            "xorps %%xmm2, %%xmm2\n\t" \
            "xorps %%xmm3, %%xmm3\n\t" \
            "xorps %%xmm4, %%xmm4\n\t" \
            "xorps %%xmm5, %%xmm5\n\t" \
            "xorps %%xmm6, %%xmm6\n\t" \
            "xorps %%xmm7, %%xmm7\n\t" \
            : /* no outputs */ \
            : /* no inputs */ \
            : "xmm0", "xmm1", "xmm2", "xmm3", \
              "xmm4", "xmm5", "xmm6", "xmm7" \
        ); \
    } while (0)
/* Number of iterations of the timed benchmark loop (2^21). */
#define LOOP (1 << 21)
/**
 * Micro-benchmark: time LOOP iterations of an unrolled SSE sequence and
 * print the resulting flops/clock and instructions/clock.
 *
 * Each iteration runs eight groups of
 *     movups (load 16 bytes) -> shufps $0 (broadcast lane 0) -> mulps -> addps
 * over four small stack arrays (presumably resident in L1, as the function
 * name suggests). Only the 8 mulps + 8 addps are counted as "instructions"
 * (instructions_per_loop = 16, 4 single-precision flops each); the movups
 * and shufps are executed but not included in the reported counts.
 *
 * x86-64 only: the sequence accumulates into xmm8..xmm11.
 */
void sse_mulps_addps_movups_shufps_fromL1() {
    printf("-- sse_mulps_addps_movups_shufps_fromL1 --\n");

    const int instructions_per_loop = 16;
    const int flops_per_instruction = 4;
    const double flops = flops_per_instruction * instructions_per_loop * LOOP;

    float __attribute__ ((aligned(16))) a[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float __attribute__ ((aligned(16))) b[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float __attribute__ ((aligned(16))) c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float __attribute__ ((aligned(16))) d[4] = {0.0f, 0.0f, 0.0f, 0.0f};

    /* Clear the registers only after the arrays are set up: the compiler is
     * free to use xmm registers for the initializers above. */
    zero_all_xmm();

    /* The accumulators xmm8..xmm11 are outside zero_all_xmm()'s range. Clear
     * them too so the timed loop does not start from leftover register
     * contents (stale denormal values may slow addps on some CPUs). */
    __asm__ volatile (
        "xorps %%xmm8, %%xmm8\n\t"
        "xorps %%xmm9, %%xmm9\n\t"
        "xorps %%xmm10, %%xmm10\n\t"
        "xorps %%xmm11, %%xmm11\n\t"
        : /* no outputs */
        : /* no inputs */
        : "xmm8", "xmm9", "xmm10", "xmm11"
    );

    const uint64_t clk0 = rdtsc();
    for (int i = 0; i < LOOP; ++i) {
        __asm__ volatile (
            "movups %0, %%xmm4\n\t"
            "shufps $0, %%xmm4, %%xmm4\n\t"
            "mulps %%xmm4, %%xmm0\n\t"
            "addps %%xmm0, %%xmm8\n\t"
            "movups %1, %%xmm5\n\t"
            "shufps $0, %%xmm5, %%xmm5\n\t"
            "mulps %%xmm5, %%xmm1\n\t"
            "addps %%xmm1, %%xmm9\n\t"
            "movups %2, %%xmm6\n\t"
            "shufps $0, %%xmm6, %%xmm6\n\t"
            "mulps %%xmm6, %%xmm2\n\t"
            "addps %%xmm2, %%xmm10\n\t"
            "movups %3, %%xmm7\n\t"
            "shufps $0, %%xmm7, %%xmm7\n\t"
            "mulps %%xmm7, %%xmm3\n\t"
            "addps %%xmm3, %%xmm11\n\t"
            "movups %0, %%xmm4\n\t"
            "shufps $0, %%xmm4, %%xmm4\n\t"
            "mulps %%xmm4, %%xmm0\n\t"
            "addps %%xmm0, %%xmm8\n\t"
            "movups %1, %%xmm5\n\t"
            "shufps $0, %%xmm5, %%xmm5\n\t"
            "mulps %%xmm5, %%xmm1\n\t"
            "addps %%xmm1, %%xmm9\n\t"
            "movups %2, %%xmm6\n\t"
            "shufps $0, %%xmm6, %%xmm6\n\t"
            "mulps %%xmm6, %%xmm2\n\t"
            "addps %%xmm2, %%xmm10\n\t"
            "movups %3, %%xmm7\n\t"
            "shufps $0, %%xmm7, %%xmm7\n\t"
            "mulps %%xmm7, %%xmm3\n\t"
            "addps %%xmm3, %%xmm11\n\t"
            : /* no outputs */
            /* movups reads 16 bytes, so describe each whole 4-float array to
             * the compiler rather than just its first element. */
            : "m"(*(const float (*)[4]) a), "m"(*(const float (*)[4]) b),
              "m"(*(const float (*)[4]) c), "m"(*(const float (*)[4]) d)
            /* Every xmm register the sequence writes. */
            : "xmm0", "xmm1", "xmm2", "xmm3",
              "xmm4", "xmm5", "xmm6", "xmm7",
              "xmm8", "xmm9", "xmm10", "xmm11"
        );
    }
    const uint64_t clk1 = rdtsc();

    const uint64_t cycles = clk1 - clk0;
    print_GFLOPS(flops, cycles);
    print_throughput((uint64_t) instructions_per_loop * LOOP, cycles);
}
/* Program entry point: run the single SSE benchmark, then report success. */
int main() {
    sse_mulps_addps_movups_shufps_fromL1();

    return EXIT_SUCCESS;
}
実行結果
-- sse_mulps_addps_movups_shufps_fromL1 --
GFLOPS @ 2.00GHz:
7.522 [flops/clock] = 15.044 [GFLOPS] (134217728 flops in 17843454 clock = 0.008922 sec)
Throughput:
1.880 [instructions/clock] (33554432 instrucions in 17843454 clock)
成果物
以上。