前回、C#/SIMDでソフトウェア無線向けのヘテロダイン変換を行ったので、CICフィルタもSIMDで実装してみた。
シンプルforで32million毎秒、SIMDで40million/sec(約25%高速化)、並列SIMDで94.5million毎秒(約3倍高速化)、という結果(前回と同じ環境で、R8, N3)。あまり早くならなかった。
SIMD実装は128bit幅のSIMDを使用し、longで計算している。そのためCICの段数を増やしても飽和する危険性が少ない。
並列SIMD実装は256bit幅のSIMDを使用し、intで計算している。CIC飽和の余裕が少ないが、4chを同時に処理できる。あくまでも4本のデータを同じように処理する程度の並列化なので、メモリアクセスがランダムになる。おそらくここがボトルネックになって遅いんだと思う。シーケンシャルにアクセスできるようにすればある程度早くなりそう。
class CIC_for
{
private readonly int cicR_;
public int CicR { get => cicR_; }
private readonly (int Re, int Im)[] cicBuff1;
private readonly (int Re, int Im)[] cicBuff2;
private int cicIndex;
public CIC_for(int CicR, int CicN)
{
cicR_ = CicR;
cicBuff1 = new (int Re, int Im)[CicN];
cicBuff2 = new (int Re, int Im)[CicN];
cicIndex = CicR;
}
public void Process((short Re, short Im)[] src, (int Re, int Im)[] dst)
{
if (src.Length != dst.Length * cicR_)
{ throw new Exception(); }
for (int i = 0, j = 0; i < src.Length; i++)
{
var cicAcc = (Re: (int)src[i].Re, Im: (int)src[i].Im);
for (int k = 0; k < cicBuff1.Length; k++)
{
cicAcc.Re += cicBuff1[k].Re;
cicAcc.Im += cicBuff1[k].Im;
cicBuff1[k] = cicAcc;
}
if (--cicIndex == 0)
{
cicIndex = cicR_;
for (int k = 0; k < cicBuff2.Length; k++)
{
var tmp = cicAcc;
cicAcc.Re -= cicBuff2[k].Re;
cicAcc.Im -= cicBuff2[k].Im;
cicBuff2[k] = tmp;
}
dst[j++] = cicAcc;
}
}
}
}
class CIC_SIMD
{
private readonly int cicR_;
public int CicR { get => cicR_; }
private readonly Vector128<long>[] cicBuff1;
private readonly Vector128<long>[] cicBuff2;
private int cicIndex;
public CIC_SIMD(int CicR, int CicN)
{
cicR_ = CicR;
cicBuff1 = new Vector128<long>[CicN];
cicBuff2 = new Vector128<long>[CicN];
cicIndex = CicR;
}
public void Process((short Re, short Im)[] src, (long Re, long Im)[] dst)
{
if (src.Length != dst.Length * cicR_)
{ throw new Exception(); }
unsafe
{
fixed ((short, short)* ptr1 = src)
fixed ((long, long)* ptr2 = dst)
{
var p1 = (short*)ptr1;
var p2 = (long*)ptr2;
for (int i = 0, j = 0; i < src.Length; i++)
{
var a = Avx2.ConvertToVector128Int64(p1 + i * 2);
for (int k = 0; k < cicBuff1.Length; k++)
{
a = cicBuff1[k] = Avx2.Add(a, cicBuff1[k]);
}
if (--cicIndex == 0)
{
cicIndex = cicR_;
for (int k = 0; k < cicBuff2.Length; k++)
{
var b = a;
a = Avx2.Subtract(a, cicBuff2[k]);
cicBuff2[k] = b;
}
Avx2.Store(p2 + j++ * 2, a);
}
}
}
}
}
}
class CIC_SIMD_parallel
{
private readonly int cicR_;
public int CicR { get => cicR_; }
private readonly Vector256<int>[] cicBuff1;
private readonly Vector256<int>[] cicBuff2;
private int cicIndex;
public CIC_SIMD_parallel(int CicR, int CicN)
{
cicR_ = CicR;
cicBuff1 = new Vector256<int>[CicN];
cicBuff2 = new Vector256<int>[CicN];
cicIndex = CicR;
}
public void Process(
(short Re, short Im)[] src1,
(short Re, short Im)[] src2,
(short Re, short Im)[] src3,
(short Re, short Im)[] src4,
(int Re, int Im)[] dst1,
(int Re, int Im)[] dst2,
(int Re, int Im)[] dst3,
(int Re, int Im)[] dst4)
{
if (src1.Length != src2.Length ||
src1.Length != src3.Length ||
src1.Length != src4.Length ||
src1.Length != dst1.Length * cicR_ ||
src1.Length != dst2.Length * cicR_ ||
src1.Length != dst3.Length * cicR_ ||
src1.Length != dst4.Length * cicR_)
{ throw new Exception(); }
unsafe
{
var foo = new (short, short)[4];
var bar = new (int, int)[4];
fixed ((short, short)* p1 = foo)
fixed ((int, int)* p2 = bar)
{
for (int i = 0, j = 0; i < src1.Length; i++)
{
foo[0] = src1[i];
foo[1] = src2[i];
foo[2] = src3[i];
foo[3] = src4[i];
var a = Avx2.ConvertToVector256Int32((short*)p1);
for (int k = 0; k < cicBuff1.Length; k++)
{
a = cicBuff1[k] = Avx2.Add(a, cicBuff1[k]);
}
if (--cicIndex == 0)
{
cicIndex = cicR_;
for (int k = 0; k < cicBuff2.Length; k++)
{
var b = a;
a = Avx2.Subtract(a, cicBuff2[k]);
cicBuff2[k] = b;
}
Avx2.Store((int*)p2, a);
dst1[j] = bar[0];
dst2[j] = bar[1];
dst3[j] = bar[2];
dst4[j] = bar[3];
j++;
}
}
}
}
}
}