scatter も実装されました。
レジスタに入っている16個のインデクスをそれぞれスケール倍(ここでは要素サイズの4バイト)し、配列先頭アドレスに加算して、出てきた16個のアドレスに、入力値をストアします。
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#define N 4096
/*
 * Scatter destination. As a file-scope object it starts out all zero, so
 * any nonzero element found after the scatter was written by it.
 */
unsigned int out[N];

/* The 16 element indices into out[]; filled in by main(). */
unsigned int index0[16];

/* The 16 values to scatter, one per 32-bit lane. */
int in0[16] = {
    100, 101, 102, 103, 104, 105, 106, 107,
    108, 109, 110, 111, 112, 113, 114, 115
};
/*
 * Demonstrates the AVX-512 32-bit scatter store.
 *
 * Picks 16 pseudo-random element indices in [0, N), scatters the 16 values
 * of in0[] to out[] at those indices with a single
 * _mm512_i32scatter_epi32(), then prints the index/value pairs followed by
 * every nonzero element of out[].
 *
 * Returns 0.
 */
int
main(void)
{
    enum { LANES = 16 }; /* a 512-bit vector holds sixteen 32-bit elements */

    /* rand() is never seeded, so every run picks the same indices. */
    for (int i = 0; i < LANES; i++) {
        index0[i] = (unsigned int)(rand() % N);
    }

    /* Unaligned loads: neither array is declared with 64-byte alignment. */
    __m512i in = _mm512_loadu_si512(in0);
    __m512i index = _mm512_loadu_si512(index0);

    /*
     * Scale 4: each index is multiplied by 4 bytes (the size of one out[]
     * element) before being added to the base address.
     */
    _mm512_i32scatter_epi32(out, index, in, 4);

    for (int i = 0; i < LANES; i++) {
        /* index0[] is unsigned int, so it is printed with %u. */
        printf("index[%2d] = %4u, val[%2d] = %4d\n", i, index0[i], i, in0[i]);
    }

    /*
     * All scattered values (100..115) are nonzero and out[] started zeroed,
     * so a nonzero element marks a slot the scatter wrote.
     */
    for (int i = 0; i < N; i++) {
        if (out[i] != 0) {
            printf("out[%d] = %u\n", i, out[i]);
        }
    }

    return 0;
}
index[ 0] = 1383, val[ 0] = 100
index[ 1] = 966, val[ 1] = 101
index[ 2] = 2153, val[ 2] = 102
index[ 3] = 2163, val[ 3] = 103
index[ 4] = 3153, val[ 4] = 104
index[ 5] = 3327, val[ 5] = 105
index[ 6] = 1098, val[ 6] = 106
index[ 7] = 2284, val[ 7] = 107
index[ 8] = 3881, val[ 8] = 108
index[ 9] = 3277, val[ 9] = 109
index[10] = 2234, val[10] = 110
index[11] = 1963, val[11] = 111
index[12] = 498, val[12] = 112
index[13] = 3835, val[13] = 113
index[14] = 2531, val[14] = 114
index[15] = 326, val[15] = 115
out[326] = 115
out[498] = 112
out[966] = 101
out[1098] = 106
out[1383] = 100
out[1963] = 111
out[2153] = 102
out[2163] = 103
out[2234] = 110
out[2284] = 107
out[2531] = 114
out[3153] = 104
out[3277] = 109
out[3327] = 105
out[3835] = 113
out[3881] = 108
以上です。