LoginSignup
1
1

More than 5 years have passed since last update.

vshufps

Posted at

src1, src2 それぞれについて、512bit を 128bit ごとに区切った各ブロック内の 32bit 値 4 個から imm で指定した要素を選びます。結果の各 128bit ブロックの下位 2 要素には src1 から選んだ値が、上位 2 要素には src2 から選んだ値が入ります。x86の伝統に従い、128bit 境界は超えられません。

_mm512_shuffle_epi32 と _mm512_shuffle_ps は似た挙動をしそうですが、あんまり似てないです。おいAVX512キモいぞ、と思ったのですが、SSE2からの伝統でした。問題無いですね。

#include <immintrin.h>
#include <stdio.h>

/*
 * First shuffle source. Element i holds 100 + i, so any value in the 100s
 * seen in the printed result came from this array. Laid out four floats
 * per row.
 */
float in0[16] = {
    100, 101, 102, 103,
    104, 105, 106, 107,
    108, 109, 110, 111,
    112, 113, 114, 115,
};

/*
 * Second shuffle source. Element i holds 200 + i, so any value in the 200s
 * seen in the printed result came from this array. Laid out four floats
 * per row.
 */
float in1[16] = {
    200, 201, 202, 203,
    204, 205, 206, 207,
    208, 209, 210, 211,
    212, 213, 214, 215,
};

/* Destination buffer: test() stores the shuffled vector here and prints it. */
float out[16];

/*
 * Shuffle in0 and in1 with _mm512_shuffle_ps using the selector `imm`,
 * store the 16 resulting floats into `out`, and print every element.
 *
 * Each printed line starts with the four 2-bit fields of `imm`, highest
 * field first, followed by the element index and its value.
 *
 * NOTE(review): the always_inline attribute is presumably here so that
 * `imm` reaches the intrinsic as the constant written at the call site;
 * confirm against the compiler in use.
 */
static inline __attribute__((always_inline)) void
test(const int imm)
{
    /* The selector fields do not change per element; split them once. */
    const int sel3 = (imm >> 6) & 0x3;
    const int sel2 = (imm >> 4) & 0x3;
    const int sel1 = (imm >> 2) & 0x3;
    const int sel0 = (imm >> 0) & 0x3;

    const __m512 src1 = _mm512_loadu_ps(in0);
    const __m512 src2 = _mm512_loadu_ps(in1);

    _mm512_storeu_ps(out, _mm512_shuffle_ps(src1, src2, imm));

    for (int idx = 0; idx < 16; idx++) {
        printf("imm=%d%d%d%d, out[%d] = %f\n",
               sel3, sel2, sel1, sel0, idx, out[idx]);
    }
}

/*
 * Run the shuffle demo with three selectors. Every call passes a literal
 * _MM_SHUFFLE(...) so the selector is a compile-time constant.
 */
int
main(void)
{
    /* 3,2,1,0: elements 0-1 of each group of four come from in0, 2-3 from in1. */
    test(_MM_SHUFFLE(3, 2, 1, 0));

    /* 2,2,0,0: element 0 of in0 twice, then element 2 of in1 twice. */
    test(_MM_SHUFFLE(2, 2, 0, 0));

    /* 0,0,0,0: element 0 of in0 twice, then element 0 of in1 twice. */
    test(_MM_SHUFFLE(0, 0, 0, 0));

    return 0;
}
imm=3210, out[0] = 100.000000
imm=3210, out[1] = 101.000000
imm=3210, out[2] = 202.000000
imm=3210, out[3] = 203.000000
imm=3210, out[4] = 104.000000
imm=3210, out[5] = 105.000000
imm=3210, out[6] = 206.000000
imm=3210, out[7] = 207.000000
imm=3210, out[8] = 108.000000
imm=3210, out[9] = 109.000000
imm=3210, out[10] = 210.000000
imm=3210, out[11] = 211.000000
imm=3210, out[12] = 112.000000
imm=3210, out[13] = 113.000000
imm=3210, out[14] = 214.000000
imm=3210, out[15] = 215.000000
imm=2200, out[0] = 100.000000
imm=2200, out[1] = 100.000000
imm=2200, out[2] = 202.000000
imm=2200, out[3] = 202.000000
imm=2200, out[4] = 104.000000
imm=2200, out[5] = 104.000000
imm=2200, out[6] = 206.000000
imm=2200, out[7] = 206.000000
imm=2200, out[8] = 108.000000
imm=2200, out[9] = 108.000000
imm=2200, out[10] = 210.000000
imm=2200, out[11] = 210.000000
imm=2200, out[12] = 112.000000
imm=2200, out[13] = 112.000000
imm=2200, out[14] = 214.000000
imm=2200, out[15] = 214.000000
imm=0000, out[0] = 100.000000
imm=0000, out[1] = 100.000000
imm=0000, out[2] = 200.000000
imm=0000, out[3] = 200.000000
imm=0000, out[4] = 104.000000
imm=0000, out[5] = 104.000000
imm=0000, out[6] = 204.000000
imm=0000, out[7] = 204.000000
imm=0000, out[8] = 108.000000
imm=0000, out[9] = 108.000000
imm=0000, out[10] = 208.000000
imm=0000, out[11] = 208.000000
imm=0000, out[12] = 112.000000
imm=0000, out[13] = 112.000000
imm=0000, out[14] = 212.000000
imm=0000, out[15] = 212.000000

そろそろ読んでる人達も飽きてきたと思うので、次回は、ついに登場した最強のshuffleである vpermt2b について @tanakmura が書いて、shuffleを終わらせようと思います。

1
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
1