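src1, src2 をそれぞれ 128bit レーンごとに区切り、各レーンに含まれる 32bit 値 4 個の中から imm で選んだ要素を組み合わせます。結果の各レーンの下位 2 要素は src1 の同じレーンから(imm の bit[1:0] と bit[3:2] で選択)、上位 2 要素は src2 の同じレーンから(imm の bit[5:4] と bit[7:6] で選択)取られます。x86の伝統に従い、128bit 境界は超えられません。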
_mm512_shuffle_epi32 と _mm512_shuffle_ps は似た挙動をしそうですが、あんまり似てないです。おいAVX512キモいぞ、と思ったのですが、SSE2からの伝統でした。問題無いですね。
#include <immintrin.h>
#include <stdio.h>
/*
 * First shuffle source. Values are 100 + element index, so any value printed
 * from the result shows both its source (1xx) and the element it came from.
 * Each row below is one group of four consecutive floats (128 bits).
 */
float in0[16] = {
    100, 101, 102, 103,
    104, 105, 106, 107,
    108, 109, 110, 111,
    112, 113, 114, 115,
};

/* Second shuffle source. Values are 200 + element index (2xx marks this source). */
float in1[16] = {
    200, 201, 202, 203,
    204, 205, 206, 207,
    208, 209, 210, 211,
    212, 213, 214, 215,
};

/* Destination buffer that receives the 16 shuffled floats. */
float out[16];
/*
 * Print the 16 floats currently stored in the global `out` buffer, one per
 * line. Each line is prefixed with the four 2-bit selector fields of `imm`,
 * highest field (bits 7:6) first, so the output reads like the arguments of
 * _MM_SHUFFLE.
 */
static void
print_shuffle_result(const int imm)
{
    int i;

    for (i = 0; i < 16; i++) {
        printf("imm=%d%d%d%d, out[%d] = %f\n",
               (imm >> 6) & 0x3,
               (imm >> 4) & 0x3,
               (imm >> 2) & 0x3,
               (imm >> 0) & 0x3,
               i,
               out[i]);
    }
}

/*
 * test(imm): load in0 and in1, combine them with _mm512_shuffle_ps using the
 * selector `imm`, store the result into `out`, and print it.
 *
 * This is a function-like macro rather than a function because the selector
 * of _mm512_shuffle_ps must be a compile-time constant at the intrinsic call
 * site. Passing it through a function parameter (even an always_inline one)
 * only works when the optimizer happens to propagate the constant; it fails
 * to compile with GCC at -O0 and with Clang at any optimization level.
 *
 * `imm` must be an integer constant expression (e.g. _MM_SHUFFLE(...)); it is
 * expanded twice, so it must not have side effects.
 */
#define test(imm)                                                          \
    do {                                                                   \
        const __m512 test_src0_ = _mm512_loadu_ps(in0);                    \
        const __m512 test_src1_ = _mm512_loadu_ps(in1);                    \
        _mm512_storeu_ps(out,                                              \
                         _mm512_shuffle_ps(test_src0_, test_src1_, (imm))); \
        print_shuffle_result(imm);                                         \
    } while (0)
/*
 * Run the shuffle demo with three selectors and print each result.
 * The per-selector notes describe elements within every group of four floats.
 */
int
main(void)
{
    /* Elements 0,1 come from in0 and elements 2,3 from in1, in order. */
    test(_MM_SHUFFLE(3,2,1,0));

    /* in0 element 0 twice, then in1 element 2 twice. */
    test(_MM_SHUFFLE(2,2,0,0));

    /* in0 element 0 twice, then in1 element 0 twice. */
    test(_MM_SHUFFLE(0,0,0,0));

    return 0;
}
imm=3210, out[0] = 100.000000
imm=3210, out[1] = 101.000000
imm=3210, out[2] = 202.000000
imm=3210, out[3] = 203.000000
imm=3210, out[4] = 104.000000
imm=3210, out[5] = 105.000000
imm=3210, out[6] = 206.000000
imm=3210, out[7] = 207.000000
imm=3210, out[8] = 108.000000
imm=3210, out[9] = 109.000000
imm=3210, out[10] = 210.000000
imm=3210, out[11] = 211.000000
imm=3210, out[12] = 112.000000
imm=3210, out[13] = 113.000000
imm=3210, out[14] = 214.000000
imm=3210, out[15] = 215.000000
imm=2200, out[0] = 100.000000
imm=2200, out[1] = 100.000000
imm=2200, out[2] = 202.000000
imm=2200, out[3] = 202.000000
imm=2200, out[4] = 104.000000
imm=2200, out[5] = 104.000000
imm=2200, out[6] = 206.000000
imm=2200, out[7] = 206.000000
imm=2200, out[8] = 108.000000
imm=2200, out[9] = 108.000000
imm=2200, out[10] = 210.000000
imm=2200, out[11] = 210.000000
imm=2200, out[12] = 112.000000
imm=2200, out[13] = 112.000000
imm=2200, out[14] = 214.000000
imm=2200, out[15] = 214.000000
imm=0000, out[0] = 100.000000
imm=0000, out[1] = 100.000000
imm=0000, out[2] = 200.000000
imm=0000, out[3] = 200.000000
imm=0000, out[4] = 104.000000
imm=0000, out[5] = 104.000000
imm=0000, out[6] = 204.000000
imm=0000, out[7] = 204.000000
imm=0000, out[8] = 108.000000
imm=0000, out[9] = 108.000000
imm=0000, out[10] = 208.000000
imm=0000, out[11] = 208.000000
imm=0000, out[12] = 112.000000
imm=0000, out[13] = 112.000000
imm=0000, out[14] = 212.000000
imm=0000, out[15] = 212.000000
そろそろ読んでいる人達も飽きてきたと思うので、次回は @tanakmura が、ついに登場した最強の shuffle である vpermt2b について書いて、shuffle の話を終わらせようと思います。