前々回はヘテロダイン変換を、前回はCICフィルタを作った。満を持してFIRフィルタの実装を行う。
今回はfloat複素数入出力のforとSIMD、double複素数入出力のforとSIMDを実装した。
float SIMDは同時に8ポイントを並列で処理し、係数も8の倍数である必要がある(足りない場合はゼロで埋めた係数を渡す)。コンストラクタでバッファの大きさ(デフォルトは係数の2倍の長さ)を渡すが、これが大きければバッファコピーの頻度が減って早くなる代わりに、メモリ使用量が増える。早くなると言ってもせいぜい数%程度の差でしかないようだが。
double SIMDは1ポイントずつ処理する簡単な実装。当然、遅い。
float/double共にforでは0.3million毎秒前後(128タップ)、double SIMDは0.57million毎秒(同)、float SIMDは2.5million毎秒(同)、程度の速度。やはりSIMDで並列化すると数倍早くなる。実行するたびに速度がバラバラなのであまり信頼性はないが、最悪でも数倍の高速化はできる。
FIR_C32_SIMD.Processのtmpはnewでヒープから確保しているが、これをstackallocで確保すると、目に見えて遅くなる。まれにヒープより早くなることもあるけど、ほとんどの場合は遅くなる。ということで今回はヒープから取るようにしてある。もしかしたら環境によってはスタックから取ったほうが早いかもしれない。
class FIR_C32_for
{
private readonly Complex32[] firCoeff;
private readonly Complex32[] firBuff;
private int firBuffIndex;
public FIR_C32_for(Complex32[] firCoeff)
{
this.firCoeff = firCoeff;
firBuff = new Complex32[1 << (int)Math.Ceiling(Math.Log(firCoeff.Length, 2))];
}
public void Process(Complex32[] src, Complex32[] dst)
{
if (src.Length != dst.Length)
{ throw new Exception(); }
var firMask = firBuff.Length - 1;
for (int i = 0; i < src.Length; i++)
{
if (firBuffIndex == 0) { firBuffIndex = firBuff.Length; }
firBuff[--firBuffIndex] = src[i];
var firAcc = Complex32.Zero;
for (int k = 0; k < firCoeff.Length; k++)
{
firAcc += firBuff[(firBuffIndex + k) & firMask] * firCoeff[k];
}
dst[i] = firAcc;
}
}
}
class FIR_C32_SIMD
{
private readonly Vector256<float>[] coeff;
private readonly float[] buff;
private readonly int buffTmpLen;
private int index;
public FIR_C32_SIMD(Complex32[] coeff, int capacity = 2)
{
if (coeff.Length % 8 != 0) { throw new Exception("coeff.Length % 8 != 0"); }
if (capacity < 2) { throw new Exception("capacity"); }
this.coeff = new Vector256<float>[coeff.Length / 4];
unsafe
{
fixed (Complex32* p1 = coeff)
{
for (int i = 0; i < this.coeff.Length; i++)
{
this.coeff[i] = Avx2.LoadVector256((float*)p1 + i * 8);
}
}
}
buffTmpLen = coeff.Length * 2;
buff = new float[buffTmpLen * capacity];
}
public void Process(Complex32[] src, Complex32[] dst)
{
if (src.Length != dst.Length)
{ throw new Exception(); }
var tmp = new float[8];
unsafe
{
fixed (float* p1 = buff)
fixed (float* p2 = tmp)
{
for (int i = 0; i < src.Length; i++)
{
if (index == 0)
{
index = buff.Length - buffTmpLen;
Array.Copy(buff, 0, buff, index, buffTmpLen);
}
buff[--index] = src[i].Imaginary;
buff[--index] = src[i].Real;
var acc = Vector256<float>.Zero;
for (int k = 0; k < coeff.Length; k += 4)
{
var b1 = Avx.LoadVector256(index + k * 8 + 0 + p1);
var b2 = Avx.LoadVector256(index + k * 8 + 8 + p1);
var b3 = Avx.LoadVector256(index + k * 8 + 16 + p1);
var b4 = Avx.LoadVector256(index + k * 8 + 24 + p1);
var c1 = coeff[k + 0];
var c2 = coeff[k + 1];
var c3 = coeff[k + 2];
var c4 = coeff[k + 3];
acc = Avx.Add(acc, Avx.HorizontalAdd(
Avx.Add(
Avx.HorizontalSubtract(
Avx.Multiply(b1, c1),
Avx.Multiply(b2, c2)),
Avx.HorizontalSubtract(
Avx.Multiply(b3, c3),
Avx.Multiply(b4, c4))),
Avx.Add(
Avx.HorizontalAdd(
Avx.Multiply(b1, Avx.Shuffle(c1, c1, 0xB1)),
Avx.Multiply(b2, Avx.Shuffle(c2, c2, 0xB1))),
Avx.HorizontalAdd(
Avx.Multiply(b3, Avx.Shuffle(c3, c3, 0xB1)),
Avx.Multiply(b4, Avx.Shuffle(c4, c4, 0xB1))))));
}
Avx.Store(p2, Avx.HorizontalAdd(acc, Vector256<float>.Zero));
dst[i] = new Complex32(tmp[0] + tmp[4], tmp[1] + tmp[5]);
}
}
}
}
}
class FIR_C64_for
{
private readonly Complex[] firCoeff;
private readonly Complex[] firBuff;
private int firBuffIndex;
public FIR_C64_for(Complex[] firCoeff)
{
this.firCoeff = firCoeff;
firBuff = new Complex[1 << (int)Math.Ceiling(Math.Log(firCoeff.Length, 2))];
}
public void Process(Complex[] src, Complex[] dst)
{
if (src.Length != dst.Length)
{ throw new Exception(); }
var firMask = firBuff.Length - 1;
for (int i = 0; i < src.Length; i++)
{
if (firBuffIndex == 0) { firBuffIndex = firBuff.Length; }
firBuff[--firBuffIndex] = src[i];
var firAcc = Complex.Zero;
for (int k = 0; k < firCoeff.Length; k++)
{
firAcc += firBuff[(firBuffIndex + k) & firMask] * firCoeff[k];
}
dst[i] = firAcc;
}
}
}
class FIR_C64_SIMD
{
private readonly Vector128<double>[] coeff;
private readonly Vector128<double>[] buff;
private int index;
public FIR_C64_SIMD(Complex[] coeff)
{
this.coeff = new Vector128<double>[coeff.Length];
buff = new Vector128<double>[1 << (int)Math.Ceiling(Math.Log(coeff.Length, 2))];
unsafe
{
fixed (Complex* p1 = coeff)
{
for (int i = 0; i < coeff.Length; i++)
{
this.coeff[i] = Sse2.LoadVector128((double*)p1 + i * 2);
}
}
}
}
public void Process(Complex[] src, Complex[] dst)
{
if (src.Length != dst.Length)
{ throw new Exception(); }
var mask = buff.Length - 1;
unsafe
{
fixed (Complex* p1 = src)
fixed (Complex* p2 = dst)
{
for (int i = 0; i < src.Length; i++)
{
if (index == 0) { index = buff.Length; }
buff[--index] = Sse2.LoadVector128((double*)p1 + i * 2);
var acc = Vector128<double>.Zero;
for (int j = 0; j < coeff.Length; j++)
{
var a = buff[(index + j) & mask];
var b = coeff[j];
var c = Sse2.Multiply(a, b);
var d = Sse2.Multiply(a, Sse2.Shuffle(b, b, 1));
acc = Sse2.Add(acc, Sse2.UnpackLow(
Sse3.HorizontalSubtract(c, c),
Sse3.HorizontalAdd(d, d)));
}
Sse2.Store((double*)p2 + i * 2, acc);
}
}
}
}
}