処理を高速化したい

C++で書いた暗号化処理をなるべく高速化したいと考えています。
高速化するにあたり、ループの回数を減らしたり、OSやライブラリで提供されている処理を使用することで高速化したいと考えます。

現状のソースコード

下記のソースコードはNISTの論文通りに実装したソースコードです。

SHA256.cpp

void SHA256::compute(unsigned char** block, unsigned int* H) {

    //  メッセージの個数をカウントする
    int N = 0;
    while (block[N] != NULL) {
        N++;
    }

    unsigned int W[MESSAGE_BLOCK_SIZE];

    //  Hを初期化する
    memcpy(H, H0, sizeof(int) * INIT_HASH_LENGTH);

    //  6.2.2 SHA-256 Hash Computation
    for (int i = 0; i < N; i++) {
        //  変数定義
        unsigned int a, b, c, d, e, f, g, h, s0, s1, T1, T2;

        char* msg = (char*)block[i];

        //  1. Prepare the message schedule, { Wt }:
        for (int t = 0; t < MESSAGE_BLOCK_SIZE; t++) {
            if (t < 16) {
                int p = t * 4;
                W[t] = (unsigned int)((msg[p] & 0xff) << 24) | (unsigned int)((msg[p + 1] & 0xff) << 16) | (unsigned int)((msg[p + 2] & 0xff) << 8) | (unsigned int)(msg[p + 3] & 0xff);
            }
            else {
                W[t] = sigma1(W[(t - 2)]) + W[(t - 7)] + sigma0(W[(t - 15)]) + W[(t - 16)];
            }
        }

        //  2. Initialize the eight working variables, a, b, c, d, e, f, g, and h, with the (i-1)st hash value:
        a = H[0];
        b = H[1];
        c = H[2];
        d = H[3];
        e = H[4];
        f = H[5];
        g = H[6];
        h = H[7];

        //  3. For t=0 to 63:
        for (int t = 0; t < MESSAGE_BLOCK_SIZE; t++) {
            T1 = h + SIGMA1(e) + Ch(e, f, g) + K[t] + W[t];
            T2 = SIGMA0(a) + Maj(a, b, c);

            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
        }

        //  4. Compute the ith intermediate hash value H(i):
        H[0] = (a + H[0]) & 0xffffffff;
        H[1] = (b + H[1]) & 0xffffffff;
        H[2] = (c + H[2]) & 0xffffffff;
        H[3] = (d + H[3]) & 0xffffffff;
        H[4] = (e + H[4]) & 0xffffffff;
        H[5] = (f + H[5]) & 0xffffffff;
        H[6] = (g + H[6]) & 0xffffffff;
        H[7] = (h + H[7]) & 0xffffffff;
    }
}

この状態で１秒間に何回暗号化処理を行うことができるか計測しました。

Hashrate(0):171801H/s
Hashrate(1):185725H/s
Hashrate(2):192288H/s
Hashrate(3):190813H/s
Hashrate(4):186704H/s
Hashrate(5):191354H/s
Hashrate(6):191056H/s
Hashrate(7):181171H/s
Hashrate(8):185610H/s
Hashrate(9):190519H/s
Average Hashrate:186704H/s

検証環境としては下記のスペックとなっています。

高速化方法の検討

下記のソースコードは openssl の SHA256 処理ですが、高速化するためにループの数を減らして高速化を実現しています。

openssl

static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
                                    size_t num)
{
    unsigned MD32_REG_T a, b, c, d, e, f, g, h, s0, s1, T1, T2;
    SHA_LONG X[16], l;
    int i;
    const unsigned char *data = in;
    printf( "data:%s", data );

    while (num--) {

        a = ctx->h[0];
        b = ctx->h[1];
        c = ctx->h[2];
        d = ctx->h[3];
        e = ctx->h[4];
        f = ctx->h[5];
        g = ctx->h[6];
        h = ctx->h[7];

        for (i = 0; i < 16; i++) {
            (void)HOST_c2l(data, l);
            printf( "l:%d", l );
            T1 = X[i] = l;
            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i];
            T2 = Sigma0(a) + Maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
        }

        for (; i < 64; i++) {
            s0 = X[(i + 1) & 0x0f];
            s0 = sigma0(s0);
            s1 = X[(i + 14) & 0x0f];
            s1 = sigma1(s1);

            T1 = X[i & 0xf] += s0 + s1 + X[(i + 9) & 0xf];
            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i];
            T2 = Sigma0(a) + Maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
        }

        ctx->h[0] += a;
        ctx->h[1] += b;
        ctx->h[2] += c;
        ctx->h[3] += d;
        ctx->h[4] += e;
        ctx->h[5] += f;
        ctx->h[6] += g;
        ctx->h[7] += h;

    }
}

W値を計算するループと a～h の8個の変数を計算するループは統合できそうです。
ループを統合したソースコードは下記のようになります。

SHA256.cpp

void SHA256::compute(unsigned char** block, unsigned int* H) {

    //  メッセージの個数をカウントする
    int N = 0;
    while (block[N] != NULL) {
        N++;
    }

    unsigned int W[MESSAGE_BLOCK_SIZE];

    //  Hを初期化する
    memcpy(H, H0, sizeof(int) * INIT_HASH_LENGTH);

    //  6.2.2 SHA-256 Hash Computation
    for (int i = 0; i < N; i++) {
        //  変数定義
        unsigned int a, b, c, d, e, f, g, h, s0, s1, T1, T2;

        char* msg = (char*)block[i];

        //  2. Initialize the eight working variables, a, b, c, d, e, f, g, and h, with the (i-1)st hash value:
        a = H[0];
        b = H[1];
        c = H[2];
        d = H[3];
        e = H[4];
        f = H[5];
        g = H[6];
        h = H[7];

        for (int t = 0; t < MESSAGE_BLOCK_SIZE; t++) {
            //  1. Prepare the message schedule, { Wt }:
            if (t < 16) {
                int p = t * 4;
                W[t] = (unsigned int)((msg[p] & 0xff) << 24) | (unsigned int)((msg[p + 1] & 0xff) << 16) | (unsigned int)((msg[p + 2] & 0xff) << 8) | (unsigned int)(msg[p + 3] & 0xff);
            }
            else {
                W[t] = sigma1(W[(t - 2)]) + W[(t - 7)] + sigma0(W[(t - 15)]) + W[(t - 16)];
            }

            //  3. For t=0 to 63:
            T1 = h + SIGMA1(e) + Ch(e, f, g) + K[t] + W[t];
            T2 = SIGMA0(a) + Maj(a, b, c);

            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
        }

        //  4. Compute the ith intermediate hash value H(i):
        H[0] = (a + H[0]) & 0xffffffff;
        H[1] = (b + H[1]) & 0xffffffff;
        H[2] = (c + H[2]) & 0xffffffff;
        H[3] = (d + H[3]) & 0xffffffff;
        H[4] = (e + H[4]) & 0xffffffff;
        H[5] = (f + H[5]) & 0xffffffff;
        H[6] = (g + H[6]) & 0xffffffff;
        H[7] = (h + H[7]) & 0xffffffff;
    }
}

この状態で計測を行うと下記のように20万回/秒を超える結果となりました。

Hashrate(0):179199H/s
Hashrate(1):207113H/s
Hashrate(2):204449H/s
Hashrate(3):207590H/s
Hashrate(4):206775H/s
Hashrate(5):206487H/s
Hashrate(6):202569H/s
Hashrate(7):203702H/s
Hashrate(8):204270H/s
Hashrate(9):207116H/s
Average Hashrate:202927H/s

さらにW値を計算する際の0～15個目と16個目以降で計算方法が異なるため、if文が挿入されていますが、このif文を削除するために、ループを0～15と16～64に分割してみたのですが、あまり効果は得られませんでした。

結論

処理の順番を変えることで１割ほど処理性能を向上させることができました。このソースコードを元にGPU側での処理を実装したいと思います。

SHA-256 暗号化処理の高速化

処理を高速化したい

現状のソースコード

高速化方法の検討

結論