0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

paizaでインラインアセンブラ その5

Posted at

概要

paizaで、インラインアセンブラやってみた。
コード見つけたので、やってみた。

参考にしたページ

サンプルコード

#include <time.h>
#include <sys/time.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumed CPU clock frequency in GHz; hard-coded, used to convert clock counts into seconds and GFLOPS. */
#define GHz 2.00

/*
 * Read the CPU time-stamp counter with the x86 RDTSC instruction.
 *
 * RDTSC delivers the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX. Both halves are taken as register outputs and combined in C, so
 * the same code serves 32-bit and 64-bit x86 builds and no scratch register
 * has to be listed as clobbered.
 *
 * Returns the full 64-bit counter value.
 */
static inline uint64_t rdtsc() {
    uint32_t lo;
    uint32_t hi;

    __asm__ volatile (
        "rdtsc"
        : "=a"(lo), "=d"(hi)
    );
    /* Widen before shifting so the high half is not lost in 32-bit arithmetic. */
    return ((uint64_t) hi << 32) | (uint64_t) lo;
}

/*
 * Print the floating-point rate of a measured run to stdout.
 *
 * flops  - number of floating-point operations performed
 * cycles - number of clocks the run took
 *
 * Reports flops per clock, GFLOPS and elapsed seconds; the last two are
 * derived from the GHz constant rather than from a measured frequency.
 */
void print_GFLOPS(double flops, uint64_t cycles) {
    const double clocks = (double) cycles;
    const double flops_per_clock = flops / clocks;
    const double gflops = flops * GHz / clocks;
    const double seconds = clocks * 1e-9 / GHz;

    printf("GFLOPS @ %.2fGHz:\n"
           "  %.3f [flops/clock] = %.3f [GFLOPS]  (%.0f flops in %" PRIu64 " clock = %f sec)\n",
           GHz, flops_per_clock, gflops, flops, cycles, seconds);
}

/*
 * Print the instruction throughput of a measured run to stdout.
 *
 * instructions - number of instructions counted for the run
 * cycles       - number of clocks the run took
 */
void print_throughput(uint64_t instructions, uint64_t cycles) {
    const double per_clock = (double) instructions / (double) cycles;

    printf("Throughput:\n"
           "  %.3f [instructions/clock]   (%" PRIu64 " instrucions in %" PRIu64 " clock)\n",
           per_clock, instructions, cycles);
}

/*
 * zero_all_xmm() - clear the SSE registers by XORing each with itself.
 *
 * Written as extended asm with a clobber list so the compiler knows the
 * registers are modified. On x86-64 all sixteen registers (xmm0-xmm15) are
 * cleared; other x86 builds only have xmm0-xmm7, so only those are cleared.
 * Expands to a single statement, so it can be used wherever a statement is
 * expected.
 */
#if defined(__x86_64__)
#define zero_all_xmm() \
    do { \
        __asm__ volatile ( \
            "xorps %%xmm0, %%xmm0\n\t" \
            "xorps %%xmm1, %%xmm1\n\t" \
            "xorps %%xmm2, %%xmm2\n\t" \
            "xorps %%xmm3, %%xmm3\n\t" \
            "xorps %%xmm4, %%xmm4\n\t" \
            "xorps %%xmm5, %%xmm5\n\t" \
            "xorps %%xmm6, %%xmm6\n\t" \
            "xorps %%xmm7, %%xmm7\n\t" \
            "xorps %%xmm8, %%xmm8\n\t" \
            "xorps %%xmm9, %%xmm9\n\t" \
            "xorps %%xmm10, %%xmm10\n\t" \
            "xorps %%xmm11, %%xmm11\n\t" \
            "xorps %%xmm12, %%xmm12\n\t" \
            "xorps %%xmm13, %%xmm13\n\t" \
            "xorps %%xmm14, %%xmm14\n\t" \
            "xorps %%xmm15, %%xmm15\n\t" \
            : \
            : \
            : "xmm0", "xmm1", "xmm2", "xmm3", \
              "xmm4", "xmm5", "xmm6", "xmm7", \
              "xmm8", "xmm9", "xmm10", "xmm11", \
              "xmm12", "xmm13", "xmm14", "xmm15" \
        ); \
    } while (0)
#else
#define zero_all_xmm() \
    do { \
        __asm__ volatile ( \
            "xorps %%xmm0, %%xmm0\n\t" \
            "xorps %%xmm1, %%xmm1\n\t" \
            "xorps %%xmm2, %%xmm2\n\t" \
            "xorps %%xmm3, %%xmm3\n\t" \
            "xorps %%xmm4, %%xmm4\n\t" \
            "xorps %%xmm5, %%xmm5\n\t" \
            "xorps %%xmm6, %%xmm6\n\t" \
            "xorps %%xmm7, %%xmm7\n\t" \
            : \
            : \
            : "xmm0", "xmm1", "xmm2", "xmm3", \
              "xmm4", "xmm5", "xmm6", "xmm7" \
        ); \
    } while (0)
#endif

/* Number of iterations of the timed benchmark loop (2^21). */
#define LOOP (1 << 21)

/*
 * Time LOOP iterations of a fixed SSE instruction mix and print the result.
 *
 * Each iteration runs the same four-instruction group (movups load, shufps
 * broadcast, mulps, addps) eight times over four small local arrays, with
 * xmm8-xmm11 as accumulators. The elapsed time-stamp-counter difference is
 * passed to print_GFLOPS() and print_throughput().
 *
 * Only the mulps/addps instructions are counted (8 + 8 = 16 per iteration,
 * 4 single-precision lanes each); the movups and shufps instructions are
 * not included in the reported instruction count.
 *
 * Uses xmm8-xmm11, so this only assembles for x86-64.
 * NOTE(review): the register state is assumed to survive between the
 * separate asm statements; nothing in the language guarantees this, it
 * relies on the compiler not using SSE registers in between.
 */
void sse_mulps_addps_movups_shufps_fromL1() {
    const int instructions_per_loop = 16;
    const int flops_per_instruction = 4;
    const double flops = flops_per_instruction * instructions_per_loop * LOOP;
    float __attribute__ ((aligned(16))) a[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float __attribute__ ((aligned(16))) b[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float __attribute__ ((aligned(16))) c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float __attribute__ ((aligned(16))) d[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    uint64_t clk0;
    uint64_t clk1;
    uint64_t cycles;

    printf("-- sse_mulps_addps_movups_shufps_fromL1 --\n");
    zero_all_xmm();
    /* Start the accumulators from zero as well, so the timed loop does not
     * operate on whatever values xmm8-xmm11 happened to hold. */
    __asm__ volatile (
        "xorps %%xmm8, %%xmm8\n\t"
        "xorps %%xmm9, %%xmm9\n\t"
        "xorps %%xmm10, %%xmm10\n\t"
        "xorps %%xmm11, %%xmm11\n\t"
        :
        :
        : "xmm8", "xmm9", "xmm10", "xmm11"
    );

    clk0 = rdtsc();
    for (int i = 0; i < LOOP; ++i) {
        __asm__ volatile (
            "movups %0, %%xmm4\n\t"
            "shufps $0, %%xmm4, %%xmm4\n\t"
            "mulps  %%xmm4, %%xmm0\n\t"
            "addps  %%xmm0, %%xmm8\n\t"
            "movups %1, %%xmm5\n\t"
            "shufps $0, %%xmm5, %%xmm5\n\t"
            "mulps  %%xmm5, %%xmm1\n\t"
            "addps  %%xmm1, %%xmm9\n\t"
            "movups %2, %%xmm6\n\t"
            "shufps $0, %%xmm6, %%xmm6\n\t"
            "mulps  %%xmm6, %%xmm2\n\t"
            "addps  %%xmm2, %%xmm10\n\t"
            "movups %3, %%xmm7\n\t"
            "shufps $0, %%xmm7, %%xmm7\n\t"
            "mulps  %%xmm7, %%xmm3\n\t"
            "addps  %%xmm3, %%xmm11\n\t"
            "movups %0, %%xmm4\n\t"
            "shufps $0, %%xmm4, %%xmm4\n\t"
            "mulps  %%xmm4, %%xmm0\n\t"
            "addps  %%xmm0, %%xmm8\n\t"
            "movups %1, %%xmm5\n\t"
            "shufps $0, %%xmm5, %%xmm5\n\t"
            "mulps  %%xmm5, %%xmm1\n\t"
            "addps  %%xmm1, %%xmm9\n\t"
            "movups %2, %%xmm6\n\t"
            "shufps $0, %%xmm6, %%xmm6\n\t"
            "mulps  %%xmm6, %%xmm2\n\t"
            "addps  %%xmm2, %%xmm10\n\t"
            "movups %3, %%xmm7\n\t"
            "shufps $0, %%xmm7, %%xmm7\n\t"
            "mulps  %%xmm7, %%xmm3\n\t"
            "addps  %%xmm3, %%xmm11\n\t"
            :
            /* movups reads 16 bytes, so each operand covers the whole array. */
            : "m"(*(const float (*)[4]) a), "m"(*(const float (*)[4]) b),
              "m"(*(const float (*)[4]) c), "m"(*(const float (*)[4]) d)
            /* Every register the block writes must be declared as clobbered. */
            : "xmm0", "xmm1", "xmm2", "xmm3",
              "xmm4", "xmm5", "xmm6", "xmm7",
              "xmm8", "xmm9", "xmm10", "xmm11"
        );
    }
    clk1 = rdtsc();

    cycles = clk1 - clk0;
    print_GFLOPS(flops, cycles);
    print_throughput(instructions_per_loop * LOOP, cycles);
}

/* Program entry point: run the single SSE benchmark and exit successfully. */
int main()
{
    sse_mulps_addps_movups_shufps_fromL1();

    return EXIT_SUCCESS;
}


実行結果

-- sse_mulps_addps_movups_shufps_fromL1 --
GFLOPS @ 2.00GHz:
  7.522 [flops/clock] = 15.044 [GFLOPS]  (134217728 flops in 17843454 clock = 0.008922 sec)
Throughput:
  1.880 [instructions/clock]   (33554432 instrucions in 17843454 clock)

成果物

以上。

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do by signing up
0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?