avx512
More than 3 years have passed since last update.

_mm512_shuffle_ps は、src1, src2 の 512bit を 128bit ごとのレーンに区切り、各レーンに含まれる 32bit 値 4 個を imm の指定に従ってそれぞれまぜまぜして、src1 を混ぜた結果と、src2 を混ぜた結果をあわせます。結果の各レーンの下位 2 要素は src1 から、上位 2 要素は src2 から選ばれます。x86の伝統に従い、128bit 境界は超えられません。

_mm512_shuffle_epi32 と _mm512_shuffle_ps は似た挙動をしそうですが、あんまり似てないです。おいAVX512キモいぞ、と思ったのですが、SSE2からの伝統でした。問題無いですね。

#include <immintrin.h>
#include <stdio.h>

/* First shuffle operand: 100..115, written one 128-bit lane (4 floats) per row. */
float in0[16] = {
    100.0f, 101.0f, 102.0f, 103.0f,
    104.0f, 105.0f, 106.0f, 107.0f,
    108.0f, 109.0f, 110.0f, 111.0f,
    112.0f, 113.0f, 114.0f, 115.0f,
};

/* Second shuffle operand: 200..215, written one 128-bit lane (4 floats) per row. */
float in1[16] = {
    200.0f, 201.0f, 202.0f, 203.0f,
    204.0f, 205.0f, 206.0f, 207.0f,
    208.0f, 209.0f, 210.0f, 211.0f,
    212.0f, 213.0f, 214.0f, 215.0f,
};

/* Destination buffer: test() stores the 16 shuffled floats here before printing them. */
float out[16];

/*
 * Shuffle in0 and in1 with _mm512_shuffle_ps under the selector `imm`,
 * store the 16 resulting floats in `out`, and print every element.
 *
 * imm: 8-bit shuffle control made of four 2-bit fields. Each printed line
 *      shows those fields as four digits, highest field (bits 7:6) first.
 *
 * NOTE(review): the intrinsic expects an immediate operand; the forced
 * inlining presumably exists so the constant passed by the caller reaches
 * it -- confirm before calling this with a non-constant argument.
 */
static inline __attribute__((always_inline)) void
test(const int imm)
{
    const __m512 a = _mm512_loadu_ps(in0);
    const __m512 b = _mm512_loadu_ps(in1);
    const __m512 shuffled = _mm512_shuffle_ps(a, b, imm);

    _mm512_storeu_ps(out, shuffled);

    /* The selector digits do not depend on the element, so split them once. */
    const int sel3 = (imm >> 6) & 0x3;
    const int sel2 = (imm >> 4) & 0x3;
    const int sel1 = (imm >> 2) & 0x3;
    const int sel0 = (imm >> 0) & 0x3;

    for (int idx = 0; idx < 16; ++idx) {
        printf("imm=%d%d%d%d, out[%d] = %f\n",
               sel3, sel2, sel1, sel0,
               idx,
               out[idx]);
    }
}

/*
 * Run the shuffle demo for three selector patterns, in this order:
 * (3,2,1,0), (2,2,0,0) and (0,0,0,0).
 */
int
main(void)
{
    test(_MM_SHUFFLE(3, 2, 1, 0));
    test(_MM_SHUFFLE(2, 2, 0, 0));
    test(_MM_SHUFFLE(0, 0, 0, 0));

    return 0;
}
imm=3210, out[0] = 100.000000
imm=3210, out[1] = 101.000000
imm=3210, out[2] = 202.000000
imm=3210, out[3] = 203.000000
imm=3210, out[4] = 104.000000
imm=3210, out[5] = 105.000000
imm=3210, out[6] = 206.000000
imm=3210, out[7] = 207.000000
imm=3210, out[8] = 108.000000
imm=3210, out[9] = 109.000000
imm=3210, out[10] = 210.000000
imm=3210, out[11] = 211.000000
imm=3210, out[12] = 112.000000
imm=3210, out[13] = 113.000000
imm=3210, out[14] = 214.000000
imm=3210, out[15] = 215.000000
imm=2200, out[0] = 100.000000
imm=2200, out[1] = 100.000000
imm=2200, out[2] = 202.000000
imm=2200, out[3] = 202.000000
imm=2200, out[4] = 104.000000
imm=2200, out[5] = 104.000000
imm=2200, out[6] = 206.000000
imm=2200, out[7] = 206.000000
imm=2200, out[8] = 108.000000
imm=2200, out[9] = 108.000000
imm=2200, out[10] = 210.000000
imm=2200, out[11] = 210.000000
imm=2200, out[12] = 112.000000
imm=2200, out[13] = 112.000000
imm=2200, out[14] = 214.000000
imm=2200, out[15] = 214.000000
imm=0000, out[0] = 100.000000
imm=0000, out[1] = 100.000000
imm=0000, out[2] = 200.000000
imm=0000, out[3] = 200.000000
imm=0000, out[4] = 104.000000
imm=0000, out[5] = 104.000000
imm=0000, out[6] = 204.000000
imm=0000, out[7] = 204.000000
imm=0000, out[8] = 108.000000
imm=0000, out[9] = 108.000000
imm=0000, out[10] = 208.000000
imm=0000, out[11] = 208.000000
imm=0000, out[12] = 112.000000
imm=0000, out[13] = 112.000000
imm=0000, out[14] = 212.000000
imm=0000, out[15] = 212.000000

そろそろ読んでる人達も飽きてきたと思うので、次回、@tanakmura が、ついに登場した最強のshuffleである vpermt2b について書いてshuffleを終わらせようと思います。