paizaでインラインアセンブラその5

Posted at 2024-09-03

概要

paizaで、インラインアセンブラやってみた。
コード見つけたので、やってみた。

参考にしたページ

サンプルコード

#include <time.h>
#include <sys/time.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

/* Clock frequency in GHz; print_GFLOPS() uses it to turn a cycle count into
 * seconds and flops-per-cycle into GFLOPS.
 * NOTE(review): presumably this should match the rate at which the time-stamp
 * counter ticks on the machine running the benchmark -- confirm. */
#define GHz 2.00

/**
 * Read the CPU time-stamp counter using the x86 RDTSC instruction.
 *
 * RDTSC delivers the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX. Both halves are received through register output constraints and
 * combined in C, so the same code works for 32-bit and 64-bit x86 targets
 * and needs no type punning or explicit clobber list.
 *
 * @return the full 64-bit time-stamp counter value.
 */
static inline uint64_t rdtsc() {
    uint32_t lo;
    uint32_t hi;
    /* volatile: every call must really re-read the counter. */
    __asm__ volatile (
        "rdtsc"
        : "=a"(lo), "=d"(hi)
    );
    /* Widen before shifting so the high half is not shifted out of 32 bits. */
    return ((uint64_t) hi << 32) | (uint64_t) lo;
}

/**
 * Print floating-point throughput for a measured run.
 *
 * Reports flops per clock, GFLOPS at the GHz clock rate, and the elapsed
 * time in seconds derived from the cycle count and GHz.
 *
 * @param flops   number of floating-point operations performed.
 * @param cycles  number of clock cycles the run took.
 */
void print_GFLOPS(double flops, uint64_t cycles) {
    const double clocks = (double) cycles;
    const double flops_per_clock = flops / clocks;
    const double gflops = flops * GHz / clocks;
    /* cycles * 1e-9 / GHz: one cycle lasts 1/GHz nanoseconds. */
    const double elapsed_sec = clocks * 1e-9 / GHz;

    printf("GFLOPS @ %.2fGHz:\n"
           "  %.3f [flops/clock] = %.3f [GFLOPS]"
           "  (%.0f flops in %" PRIu64 " clock = %f sec)\n",
           GHz, flops_per_clock, gflops, flops, cycles, elapsed_sec);
}

/**
 * Print instruction throughput for a measured run.
 *
 * @param instructions  number of instructions counted for the run.
 * @param cycles        number of clock cycles the run took.
 */
void print_throughput(uint64_t instructions, uint64_t cycles) {
    const double per_clock = (double) instructions / (double) cycles;

    printf("Throughput:\n"
           "  %.3f [instructions/clock]"
           "   (%" PRIu64 " instrucions in %" PRIu64 " clock)\n",
           per_clock, instructions, cycles);
}

/*
 * Clear SSE registers xmm0..xmm7 by XOR-ing each register with itself.
 *
 * Written as extended asm with an explicit clobber list so the compiler
 * knows these eight registers are overwritten; without it, any value the
 * compiler was keeping in one of them would be destroyed silently.
 * Only xmm0..xmm7 are touched; higher-numbered registers are left as-is.
 */
#define zero_all_xmm() \
    do { \
        __asm__ volatile ( \
            "xorps %%xmm0, %%xmm0\n\t" \
            "xorps %%xmm1, %%xmm1\n\t" \
            "xorps %%xmm2, %%xmm2\n\t" \
            "xorps %%xmm3, %%xmm3\n\t" \
            "xorps %%xmm4, %%xmm4\n\t" \
            "xorps %%xmm5, %%xmm5\n\t" \
            "xorps %%xmm6, %%xmm6\n\t" \
            "xorps %%xmm7, %%xmm7\n\t" \
            : \
            : \
            : "xmm0", "xmm1", "xmm2", "xmm3", \
              "xmm4", "xmm5", "xmm6", "xmm7" \
        ); \
    } while (0)

/* Number of iterations of the timed benchmark loop: 1 << 21 = 2,097,152. */
#define LOOP (1 << 21)

/**
 * Micro-benchmark: time LOOP iterations of an unrolled SSE sequence of
 * movups (load) / shufps / mulps / addps and print the resulting GFLOPS
 * and instruction throughput.
 *
 * Each iteration runs the four-instruction group eight times (twice for each
 * of the four source arrays), i.e. 8 mulps + 8 addps on 4 floats each.
 * instructions_per_loop counts only those 16 arithmetic instructions; the
 * movups/shufps instructions are not included in the reported throughput.
 *
 * The asm uses xmm8..xmm11, so this function only builds for x86-64.
 * NOTE(review): xmm8..xmm11 are used as accumulators but zero_all_xmm()
 * only clears xmm0..xmm7, so their starting contents are whatever the
 * registers held before -- confirm whether that matters for the timing.
 */
void sse_mulps_addps_movups_shufps_fromL1() {
    printf("-- sse_mulps_addps_movups_shufps_fromL1 --\n");
    uint64_t clk0;
    uint64_t clk1;
    uint64_t cycles;
    const int instructions_per_loop = 16;
    const int flops_per_instruction = 4;
    const double flops = flops_per_instruction * instructions_per_loop * LOOP;
    int i;
    float __attribute__ ((aligned(16))) a[4] = {0.0, 0.0, 0.0, 0.0};
    float __attribute__ ((aligned(16))) b[4] = {0.0, 0.0, 0.0, 0.0};
    float __attribute__ ((aligned(16))) c[4] = {0.0, 0.0, 0.0, 0.0};
    float __attribute__ ((aligned(16))) d[4] = {0.0, 0.0, 0.0, 0.0};
    /* Clear the registers only after all locals are initialised, so nothing
     * the compiler emits for the initialisers runs between the clearing and
     * the timed region. */
    zero_all_xmm();
    clk0 = rdtsc();
    for (i = 0; i < LOOP; ++i)
    {
        __asm__ volatile (
            "movups %0, %%xmm4\n\t"
            "shufps $0, %%xmm4, %%xmm4\n\t"
            "mulps  %%xmm4, %%xmm0\n\t"
            "addps  %%xmm0, %%xmm8\n\t"
            "movups %1, %%xmm5\n\t"
            "shufps $0, %%xmm5, %%xmm5\n\t"
            "mulps  %%xmm5, %%xmm1\n\t"
            "addps  %%xmm1, %%xmm9\n\t"
            "movups %2, %%xmm6\n\t"
            "shufps $0, %%xmm6, %%xmm6\n\t"
            "mulps  %%xmm6, %%xmm2\n\t"
            "addps  %%xmm2, %%xmm10\n\t"
            "movups %3, %%xmm7\n\t"
            "shufps $0, %%xmm7, %%xmm7\n\t"
            "mulps  %%xmm7, %%xmm3\n\t"
            "addps  %%xmm3, %%xmm11\n\t"
            "movups %0, %%xmm4\n\t"
            "shufps $0, %%xmm4, %%xmm4\n\t"
            "mulps  %%xmm4, %%xmm0\n\t"
            "addps  %%xmm0, %%xmm8\n\t"
            "movups %1, %%xmm5\n\t"
            "shufps $0, %%xmm5, %%xmm5\n\t"
            "mulps  %%xmm5, %%xmm1\n\t"
            "addps  %%xmm1, %%xmm9\n\t"
            "movups %2, %%xmm6\n\t"
            "shufps $0, %%xmm6, %%xmm6\n\t"
            "mulps  %%xmm6, %%xmm2\n\t"
            "addps  %%xmm2, %%xmm10\n\t"
            "movups %3, %%xmm7\n\t"
            "shufps $0, %%xmm7, %%xmm7\n\t"
            "mulps  %%xmm7, %%xmm3\n\t"
            "addps  %%xmm3, %%xmm11\n\t"
            :
            /* Whole arrays as operands: movups reads all 16 bytes, so the
             * compiler must see the full object as an input, not just [0]. */
            :"m"(a), "m"(b), "m"(c), "m"(d)
            /* Every register the sequence writes must be declared. */
            :"xmm0", "xmm1", "xmm2", "xmm3",
             "xmm4", "xmm5", "xmm6", "xmm7",
             "xmm8", "xmm9", "xmm10", "xmm11"
        );
    }
    clk1 = rdtsc();
    cycles = clk1 - clk0;
    print_GFLOPS(flops, cycles);
    print_throughput(instructions_per_loop * LOOP, cycles);
}

/* Program entry point: run the single SSE benchmark and report success. */
int main() {
    sse_mulps_addps_movups_shufps_fromL1();

    return EXIT_SUCCESS;
}

実行結果

-- sse_mulps_addps_movups_shufps_fromL1 --
GFLOPS @ 2.00GHz:
  7.522 [flops/clock] = 15.044 [GFLOPS]  (134217728 flops in 17843454 clock = 0.008922 sec)
Throughput:
  1.880 [instructions/clock]   (33554432 instrucions in 17843454 clock)

成果物

以上。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

paizaでインラインアセンブラ その5

概要

参考にしたページ

サンプルコード

実行結果

成果物

paizaでインラインアセンブラその5