More than 3 years have passed since last update.

立っている最上位ビットの桁数を求める

Last updated at 2021-04-02Posted at 2021-04-01

はじめに

次のような、1になっているビットのうち最上位にあるものが2進数で何桁目にあるかを求めたい。ただし、第0ビットを1桁目と数える。値がx=0のときは0桁とする。

int digit_higher_bit(uint32_t x){  
  int rank = 0;
  while(x){
    rank++;
    x >>= 1;
  }
  return rank;
}

int main(){
  return rank_higher_bit(125);
  //return 7//
}

この処理をビット演算で高速化できないか。

結論：追記1

コメント欄でstd::countl_zero()使おうとアドバイスいただいた。C++20でなくても__builtin_clz()使えるなら使おう。
ただし、テストしてみると最速ではないので、wandboxでハードウェア命令が使われているかも影響するのかも。
また、各自の環境で__builtin_clz()相当の組み込みが使えるかは不明。

ということで、以下の記事の意味は多少薄れたのですが、記録として残します。

NLZの方法を参考に

この方法に類似の問題がNAKAMURA Minoru氏のwebサイト¹でNLZの算出という項目として議論されており、そこのコードのベンチマークをwandbox²で行ってみる。特に、コメント欄の「さき氏」のコードにはバグがあるが、それを修正した「hir0氏」のコードをnzl5revと呼ぶことにする。
同webページのnlz_routines.cおよびnlz_test.cをベースにして多少改良したもので検証する。

これをwandboxで実行すると次のようになる。ただし計算回数はREPEAT=100000000とした。

# g++ prog.cc -Wall -Wextra -I/opt/wandbox/boost-1.73.0/gcc-head/include -std=gnu++2b

nlz1:   1.200948 sec
nlz1sft:   2.017283 sec
nlz2:   2.793658 sec
nlz3:   2.347450 sec
nlz4:   0.229859 sec
nlz5:   0.502514 sec
nlz5rev:   1.994522 sec
0
Finish

# g++ prog.cc -Wall -Wextra -O2 -march=native -I/opt/wandbox/boost-1.73.0/gcc-head/include -std=gnu++2b "-O3"

nlz1:   0.248584 sec, (198858565)
nlz1sft:   0.152740 sec, (198858565)
nlz2:   0.187586 sec, (198858565)
nlz3:   0.212117 sec, (198858565)
nlz4:   0.212739 sec, (198858565)
nlz5:   0.343369 sec, (3099893180)
nlz5rev:   0.266015 sec, (198858565)
0
Finish

nlz1はハードウェア命令のpopcountを利用するが実際に利用されているかはWandboxのため不明。nlz1sftはpopcountの代わりに同webページにあるnumofbits5を使ったもの。最適化オプションによって速さの序列はかなり変わってくる。記事を書き始めた時点ではnlz5revはgoodと思ったが、その後、最適化オプションを弄るうちに序列がかなり変わって混乱。

実機でやってなくて申し訳ない。実機は某NEC SX-Aurora TSUBASA機が年度切り替わりのメンテナンス中という雑な言い訳で許して。。。

目的の桁数を求める

さて、目的の最上位ビットの桁数は32-NLTである。nlz5revの最後で31とXOR演算をせずに1を加えたものになる（※NLZでは全bitが1の時に31を返す、つまり、0ビット目を0桁目としている。よってここでは1を加える）。nlz1の場合にはpopcountに渡す引数を~xとはせずにxを渡せばよい。これらをinv_nlz5rev、inv_nlz1、inv_nlz1sftとした。検証用のコードは次のようになる。

/*
 *  This code STRONGLY respects 
 *  http://www.nminoru.jp/~nminoru/programming/bitcount.html
 */

# include <stdio.h>
# include <stdlib.h>
# include <stdint.h>
# include <inttypes.h>
# include <sys/time.h>


int digit_higher_bit(uint32_t x){  
  int rank = 0;
  while(x){
    rank++;
    x >>= 1;
  }
  return rank;
}

int inv_nlz1(uint32_t x)
{
    x = x | ( x >>  1 );
    x = x | ( x >>  2 );
    x = x | ( x >>  4 );
    x = x | ( x >>  8 );
    x = x | ( x >> 16 );

    return __builtin_popcount(x);
}

int numofbits5(long bits)
{
    bits = (bits & 0x55555555) + (bits >> 1 & 0x55555555);
    bits = (bits & 0x33333333) + (bits >> 2 & 0x33333333);
    bits = (bits & 0x0f0f0f0f) + (bits >> 4 & 0x0f0f0f0f);
    bits = (bits & 0x00ff00ff) + (bits >> 8 & 0x00ff00ff);
    return (bits & 0x0000ffff) + (bits >>16 & 0x0000ffff);
}

int inv_nlz1sft(uint32_t x)
{
    x = x | ( x >>  1 );
    x = x | ( x >>  2 );
    x = x | ( x >>  4 );
    x = x | ( x >>  8 );
    x = x | ( x >> 16 );

    return numofbits5(x);
}

int inv_nlz5rev(uint32_t x)
{
int c = 0;
if (x == 0) return 0;//32;
if (x & 0xffff0000) { x &= 0xffff0000; c |= 0x10; }
if (x & 0xff00ff00) { x &= 0xff00ff00; c |= 0x08; }
if (x & 0xf0f0f0f0) { x &= 0xf0f0f0f0; c |= 0x04; }
if (x & 0xcccccccc) { x &= 0xcccccccc; c |= 0x02; }
if (x & 0xaaaaaaaa) { c |= 0x01; }
return c+1;
//return c ^ 31;
}

//(追記1)
int inv_clz1(uint32_t x)
{
    return 32 - __builtin_clz(x);
}

int dummy(uint32_t x)
{
    return 0;
}

static double getdiff(const struct timeval *before, const struct timeval *after)
{
    return (after->tv_sec - before->tv_sec) + 1.0 * (after->tv_usec - before->tv_usec) / 1000000;
}

int main(){
  //check of correction
    for (int x = 0; x < 123456789; ++x){
        const int d0 = digit_higher_bit(x);
        const int d1 = inv_nlz1(x);
        const int d1s = inv_nlz1sft(x);
        const int d5r = inv_nlz5rev(x);
        const int c1 = inv_clz1(x);
        if(d0 != d1){
          printf("ERROR1: x = %d = 0x%x -> %d, %d\n",x, x, d0, d1);
        }
        if(d0 != d1s){
          printf("ERROR1s: x = %d = 0x%x -> %d, %d\n",x, x, d0, d1s);
        }
        if(d0 != d5r){
          printf("ERROR5r: x = %d = 0x%x -> %d, %d\n",x, x, d0, d5r);
        }
        if(d0 != c1){
          printf("ERRORc1: x = %d = 0x%x -> %d, %d\n",x, x, d0, c1);
        }
    }
  
    const int REPEAT = 100000000;
    const int DATA_SIZE = 65536;
    uint32_t data[DATA_SIZE];
  
      // Prepare input datas
    for (int i=0 ; i<REPEAT ; i++) {
        data[i % DATA_SIZE] = rand();
    }

    // Warm up data cache
    uint64_t sum=0;
    for (int i=0 ; i<REPEAT ; i++) {
        sum += dummy(data[i % DATA_SIZE]);
    }

    struct timeval before, after;

    // dummy
    sum=0;
    gettimeofday(&before, NULL);
    for (int i=0 ; i<REPEAT ; i++) {
        sum += dummy(data[i % DATA_SIZE]);
    }
    gettimeofday(&after, NULL);
    double base_time = getdiff(&before, &after);

    // simple loop
    sum=0;
    gettimeofday(&before, NULL);
    for (int i=0 ; i<REPEAT ; i++) {
        sum += digit_higher_bit(data[i % DATA_SIZE]);
    }
    gettimeofday(&after, NULL);
    printf("simple_loop: %10.6f sec (%" PRId64 ")\n", getdiff(&before, &after) - base_time, sum);

    // inv_nlz1
    sum=0;
    gettimeofday(&before, NULL);
    for (int i=0 ; i<REPEAT ; i++) {
        sum += inv_nlz1(data[i % DATA_SIZE]);
    }
    gettimeofday(&after, NULL);
    printf("inv_nlz1: %10.6f sec (%" PRId64 ")\n", getdiff(&before, &after) - base_time, sum);

    // inv_nlz1sft
    sum = 0;
    gettimeofday(&before, NULL);
    for (int i=0 ; i<REPEAT ; i++) {
        sum += inv_nlz1sft(data[i % DATA_SIZE]);
    }
    gettimeofday(&after, NULL);
    printf("inv_nlz1sft: %10.6f sec (%" PRId64 ")\n", getdiff(&before, &after) - base_time, sum);

    // inv_nlz5rev
    sum = 0;
    gettimeofday(&before, NULL);
    for (int i=0 ; i<REPEAT ; i++) {
        sum += inv_nlz5rev(data[i % DATA_SIZE]);
    }
    gettimeofday(&after, NULL);
    printf("inv_nlz5rev: %10.6f sec (%" PRId64 ")\n", getdiff(&before, &after) - base_time, sum);
  
    // inv_clz1 (追記1)
    sum = 0;
    gettimeofday(&before, NULL);
    for (int i=0 ; i<REPEAT ; i++) {
        sum += inv_clz1(data[i % DATA_SIZE]);
    }
    gettimeofday(&after, NULL);
    printf("inv_clz1: %10.6f sec (%" PRId64 ")\n", getdiff(&before, &after) - base_time, sum);
 
  return 0;
}

同じくWandboxで実行した。ただし、最適化なしの場合には時間が掛かり過ぎてKillされるので最初の結果検証用の123456789は適当に小さくする。実行結果は次のようにな。

# g++ prog.cc -Wall -Wextra -I/opt/wandbox/boost-1.73.0/gcc-head/include -std=gnu++2b

simple_loop:   9.933384 sec (3001141435)
inv_nlz1:   1.156976 sec (3001141435)
inv_nlz1sft:   1.983428 sec (3001141435)
inv_nlz5rev:   1.991893 sec (3001141435)
0
Finish

# g++ prog.cc -Wall -Wextra -O2 -march=native -I/opt/wandbox/boost-1.73.0/gcc-head/include -std=gnu++2b "-O3"

simple_loop:   2.042543 sec (3001141435)
inv_nlz1:   0.233110 sec (3001141435)
inv_nlz1sft:   0.148162 sec (3001141435)
inv_nlz5rev:   0.234953 sec (3001141435)
inv_clz1:   0.173916 sec (3001141435)
0
Finish

どれも目的のオリジナルのループdigit_higher_bitと一致する(ERRORが出ていないの意)。
また、どのnlzを応用した処理もオリジナルに比べて10倍程度速い。最も速いのはnlz1sftというのはいまいち腑に落ちないが、この辺りは最適化オプションでかなり変わってくる。
(追記1) clz使えるならclz使おう。

さいごに

時間取れたら実機でやります。。。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up