More than 5 years have passed since last update.

OpenMPでLockを使う

Last updated at 2019-09-05Posted at 2019-09-05

C言語のOpenMPでlockを使う際のメモ。通常のOpenMPのようなスレッド並列モデルではあえてlockを使うことはまずないと思うが、section領域の中で同期を取ったりする際にはlockを使う必要がある。section構文を使う際にもlockを使わずに一旦閉じて暗黙のbarrier同期を呼び出したり、その他のスレッド並列でもatomic構文やcritical構文で代用できるはずなのでそちらを使ったほうがデッドロックなどの通常のスレッド並列ではあまり出くわさないバグを生み出さないので特別な理由がない限りlockを使わない書き方をおすすめする。

section領域の中で入れ子領域を作り、その中のsingle構文内でMPI通信をする際にどうしてもlockが必要になったので動作を調べた。下記のコードはOpenMP4.5で動作を確認している。サンプルコードに出てくるOpenMPの入れ子はここで説明。

使い方

OpenMPのlockにはomp_lockとomp_nest_lockの二種類ある。omp_lockは1階層のlock、omp_nest_lockは階層的なlockが可能。

`omp_lock`

omp_lock関係の関数

// Life cycle of a simple lock: init -> set/unset inside the parallel region -> destroy.
// func() is a placeholder for the work that must run one thread at a time.
omp_lock_t omp_lock;      // declare the lock variable
omp_init_lock(&omp_lock); // initialize it before first use

# pragma omp parallel 
{ 
omp_set_lock(&omp_lock);   // the first thread to arrive takes the lock; the other threads wait here
func();
omp_unset_lock(&omp_lock); // the owning thread releases the lock; a waiting thread can then acquire it
}
omp_destroy_lock(&omp_lock); // tear the lock down once it is no longer needed

下記の適当なコードで動作確認。コメントで補足を入れている。

omp_lock.c

# include <stdio.h>
# include <stdlib.h>
# include <omp.h>

//lockを簡単に切り替えるためのマクロを定義
//#define __OMP_LOCK__

// Demo for omp_lock: two sections each print n lines from a nested
// parallel loop. When compiled with -D__OMP_LOCK__ every section holds a
// simple lock while printing, so the output of the two sections is never
// interleaved. Without the macro the sections print concurrently.
// Returns EXIT_SUCCESS; the command-line arguments are not used.
int main(int argc, char **argv)
{
  (void)argc;
  (void)argv;

#ifdef __OMP_LOCK__
  omp_lock_t omp_lock;       // simple (non-nestable) lock shared by both sections
  omp_init_lock(&omp_lock);  // must be initialized before the first set
#endif

  // Nesting is not strictly needed for this example, but without it the
  // loop size n has to be very large before the output of the two sections
  // visibly mixes. Nested teams make it mix even for small n.
  // NOTE(review): omp_set_nested() is deprecated as of OpenMP 5.0; it is
  // kept because this sample targets OpenMP 4.5.
  omp_set_nested(1);

  int n = 8;  // loop size, also the requested size of each nested team

#pragma omp parallel sections
  {
#pragma omp section
    {
#ifdef __OMP_LOCK__
      // Whichever section gets here first takes the lock; the other waits.
      omp_set_lock(&omp_lock);
#endif

#pragma omp parallel for num_threads(n)
      for (int ix = 0; ix < n; ix++) {
        // Red text; the colour is reset at the end of every line so the
        // terminal is not left coloured after the program exits.
        printf("\x1b[31msection1 : %2d\x1b[0m\n", ix);
        fflush(stdout);
      }

#ifdef __OMP_LOCK__
      omp_unset_lock(&omp_lock);  // release so the other section can run
#endif
    }  // end section

#pragma omp section
    {
#ifdef __OMP_LOCK__
      // Whichever section gets here first takes the lock; the other waits.
      omp_set_lock(&omp_lock);
#endif

#pragma omp parallel for num_threads(n)
      for (int ix = 0; ix < n; ix++) {
        // Green text, reset at the end of every line (see section 1).
        printf("\x1b[32msection2 : %2d\x1b[0m\n", ix);
        fflush(stdout);
      }

#ifdef __OMP_LOCK__
      omp_unset_lock(&omp_lock);  // release so the other section can run
#endif
    }  // end section
  }  // end parallel sections

#ifdef __OMP_LOCK__
  omp_destroy_lock(&omp_lock);  // the lock is no longer needed
#endif

  return EXIT_SUCCESS;
}

__OMP_LOCK__マクロをオフにしたままロックを使わないで(何回か)実行すると、

$ gcc -fopenmp omp_lock.c
$ ./a.out
section2 :  5
section2 :  1
section2 :  3
section1 :  0
section1 :  3
section1 :  5
section2 :  4
section1 :  6
section2 :  2
section2 :  0
section2 :  6
section2 :  7
section1 :  1
section1 :  2
section1 :  7
section1 :  4

のように各セクションが並列に実行されるので出力がバラバラになる。ixの順序がバラバラなのは入れ子を使って並列にしているため。

$ gcc -fopenmp -D__OMP_LOCK__  omp_lock.c
$ ./a.out
section1 :  0
section1 :  1
section1 :  5
section1 :  7
section1 :  3
section1 :  2
section1 :  4
section1 :  6
section2 :  1
section2 :  3
section2 :  4
section2 :  6
section2 :  5
section2 :  0
section2 :  2
section2 :  7

__OMP_LOCK__フラグをオンにするとlock機構が働き、nを大きくしても各sectionの出力が混ざることはない。ただしどちらのsectionから出力されるかは不定。

`omp_nest_lock`

入れ子的なlockが可能とあるがいまいちどういう入れ子で使えるかわからないのでいろいろ試してみる。

omp_nest_lock関係の関数

// Life cycle of a nestable lock: init -> set/unset inside the parallel region -> destroy.
// func() is a placeholder for the work that must run one thread at a time.
omp_nest_lock_t omp_lock;      // declare the nestable lock variable
omp_init_nest_lock(&omp_lock); // initialize it before first use

# pragma omp parallel 
{ 
omp_set_nest_lock(&omp_lock);   // the first thread to arrive takes the lock; the other threads wait here
func();
omp_unset_nest_lock(&omp_lock); // the owning thread releases the lock; a waiting thread can then acquire it
}
omp_destroy_nest_lock(&omp_lock); // tear the lock down once it is no longer needed

同じスレッドで通常のomp_lockを続けて2回setするように並べて書くと、コンパイルは通るが実行時にデッドロックが起こる。

// Counter-example: setting a simple lock twice from the same thread.
// The lock routines take a pointer, so the lock is passed as &omp_lock.
omp_lock_t omp_lock;
omp_init_lock(&omp_lock);
omp_set_lock(&omp_lock);   // acquired: the lock is now held by this thread
omp_set_lock(&omp_lock);   // deadlock: a simple lock must not be set again by its owner, so this waits forever
func();                    // never reached
omp_unset_lock(&omp_lock);
omp_unset_lock(&omp_lock);

omp_nest_lockを並べて書くと入れ子状態の(後入れ先出しのスタックでの)lock処理が可能。

// A nestable lock may be set repeatedly by its owner; each set must be
// paired with an unset, released in last-in-first-out order.
omp_nest_lock_t omp_lock;
omp_init_nest_lock(&omp_lock);
omp_set_nest_lock(&omp_lock);  // first (outer) lock
omp_set_nest_lock(&omp_lock);  // second (inner) lock, taken by the same owner
func();
omp_unset_nest_lock(&omp_lock); // releases the second (inner) lock
omp_unset_nest_lock(&omp_lock); // releases the first (outer) lock

下記の適当なコードで動作確認。

omp_nest_lock.c

# include <stdio.h>
# include <stdlib.h>
# include <omp.h>


//ネストのレベル1,2でlockを簡単に切り替えるためのマクロを定義
//#define __OMP_NEST_1_LOCK__
//#define __OMP_NEST_2_LOCK__

// Forward declarations for the nested-lock demo.
// Level 1: outer sections and their lock.
void set_sections_nest1(int n, omp_nest_lock_t *omp_lock);
void lock_section_nest1(int n, omp_nest_lock_t *omp_lock);
// Level 2: three alternative ways (A/B/C) of running the inner sections.
void set_sections_nest2_A(int tid, int n, omp_nest_lock_t *omp_lock);
void set_sections_nest2_B(int tid, int n, omp_nest_lock_t *omp_lock);
void set_sections_nest2_C(int tid, int n);
void lock_section_nest2(int tid, int n, omp_nest_lock_t *omp_lock);
// Output helper shared by all variants.
void print_loop(int tid, int sub_tid, int n);

// Level 1 driver: opens a `parallel sections` region that requests two
// threads and runs lock_section_nest1() once in each of its two sections.
//   n        : passed through unchanged to lock_section_nest1()
//   omp_lock : nestable lock handed to both sections
void set_sections_nest1(int n, omp_nest_lock_t *omp_lock)
{
#pragma omp parallel sections num_threads(2)
  {
#pragma omp section
    {
      lock_section_nest1(n, omp_lock);
    }

#pragma omp section
    {
      lock_section_nest1(n, omp_lock);
    }
  }  // end parallel sections
}


// Body of one level-1 section. When __OMP_NEST_1_LOCK__ is defined the
// level-2 work runs while *omp_lock is held; otherwise omp_lock is unused.
//   n        : passed through to the level-2 driver
//   omp_lock : nestable lock guarding the level-2 work
void lock_section_nest1(int n, omp_nest_lock_t *omp_lock)
{
  // Thread number within the level-1 team, used to label the output.
  const int level1_id = omp_get_thread_num();

#ifdef __OMP_NEST_1_LOCK__
  omp_set_nest_lock(omp_lock);
#endif

  // Exactly one of the three level-2 variants is active at a time;
  // swap the commented lines to try the others.
  //set_sections_nest2_A(level1_id, n, omp_lock);
  //set_sections_nest2_B(level1_id, n, omp_lock);
  set_sections_nest2_C(level1_id, n);

#ifdef __OMP_NEST_1_LOCK__
  omp_unset_nest_lock(omp_lock);
#endif
}

// Variant A: the problematic one.
// Opens a nested `parallel sections` region and hands the caller's lock to
// both inner sections. With both __OMP_NEST_1_LOCK__ and
// __OMP_NEST_2_LOCK__ defined, the caller already holds *omp_lock when the
// inner sections try to set the same lock, and the program deadlocks.
// The lock is reached through a pointer, so the nested region cannot be
// given its own private lock.
// NOTE(review): presumably the inner team's threads are not treated as the
// owner of the lock and therefore wait, while the owner waits for the
// inner region to finish. Confirm against the OpenMP lock ownership rules.
//   tid      : level-1 thread number, used to label the output
//   n        : passed through to lock_section_nest2()
//   omp_lock : the level-1 nestable lock (shared with the caller)
void set_sections_nest2_A(int tid, int n, omp_nest_lock_t *omp_lock)
{
#pragma omp parallel sections num_threads(2)
  {
#pragma omp section
    {
      lock_section_nest2(tid, n, omp_lock);
    }

#pragma omp section
    {
      lock_section_nest2(tid, n, omp_lock);
    }
  }  // end inner sections
}


// Variant B: works, with a caveat.
// No nested parallel region is opened here, so the nestable lock can be
// set again by the caller that already holds it. Because the level-1 lock
// already serializes this code, the level-2 lock adds nothing in this
// example.
//   tid      : level-1 thread number, used to label the output
//   n        : passed through to lock_section_nest2()
//   omp_lock : the level-1 nestable lock, re-used for level 2
void set_sections_nest2_B(int tid, int n, omp_nest_lock_t *omp_lock)
{
  // Run the level-2 body twice, one after the other.
  for (int pass = 0; pass < 2; ++pass) {
    lock_section_nest2(tid, n, omp_lock);
  }
}


// Variant C: the recommended one.
// Creates a separate lock for level 2, completely independent of the
// level-1 lock, so the level-1 lock does not need to be passed in.
// Since no lock is set twice by the same owner at one nesting level, both
// levels could use a plain omp_lock instead of omp_nest_lock.
//   tid : level-1 thread number, used to label the output
//   n   : passed through to lock_section_nest2()
void set_sections_nest2_C(int tid, int n)
{
  omp_nest_lock_t level2_lock;  // local to this call, shared by the inner team
  omp_init_nest_lock(&level2_lock);

#pragma omp parallel sections num_threads(2)
  {
#pragma omp section
    {
      lock_section_nest2(tid, n, &level2_lock);
    }

#pragma omp section
    {
      lock_section_nest2(tid, n, &level2_lock);
    }
  }  // end inner sections

  omp_destroy_nest_lock(&level2_lock);
}


// Body of one level-2 section. When __OMP_NEST_2_LOCK__ is defined the
// output loop runs while *omp_lock is held; otherwise omp_lock is unused.
//   tid      : level-1 thread number, forwarded to print_loop()
//   n        : forwarded to print_loop()
//   omp_lock : nestable lock guarding the output loop
void lock_section_nest2(int tid, int n, omp_nest_lock_t *omp_lock)
{
  // Thread number within the current team, used as the level-2 label.
  const int level2_id = omp_get_thread_num();

#ifdef __OMP_NEST_2_LOCK__
  omp_set_nest_lock(omp_lock);
#endif

  print_loop(tid, level2_id, n);

#ifdef __OMP_NEST_2_LOCK__
  omp_unset_nest_lock(omp_lock);
#endif
}

// Prints n lines of the form "section-<tid>-<sub_tid> : <index>" from a
// parallel loop that requests n threads, flushing stdout after every line.
//   tid     : level-1 label
//   sub_tid : level-2 label
//   n       : number of lines, also the requested team size
void print_loop(int tid, int sub_tid, int n)
{
#pragma omp parallel num_threads(n)
  {
#pragma omp for
    for (int i = 0; i < n; ++i) {
      printf("section-%d-%d : %2d\n", tid, sub_tid, i);
      fflush(stdout);
    }
  }
}



// Entry point of the nestable-lock demo: prints which lock levels were
// compiled in, enables nested parallelism and runs the level-1 sections
// with a nestable lock. Returns EXIT_SUCCESS.
int main(int argc, char **argv)
{
  const int loop_size = 4;      // lines printed per level-2 section
  omp_nest_lock_t level1_lock;  // lock handed to the level-1 sections

  // Report the active compile-time switches.
#ifdef __OMP_NEST_1_LOCK__
  printf("__OMP_NEST_1_LOCK__\n");
#endif
#ifdef __OMP_NEST_2_LOCK__
  printf("__OMP_NEST_2_LOCK__\n");
#endif

  omp_set_nested(1);  // allow parallel regions inside parallel regions

  omp_init_nest_lock(&level1_lock);
  set_sections_nest1(loop_size, &level1_lock);
  omp_destroy_nest_lock(&level1_lock);

  return EXIT_SUCCESS;
}

set_sections_nest2_A()またはset_sections_nest2_C()のコメントアウトを外し、__OMP_NEST_1_LOCK__フラグのみオンにして実行すると

$ gcc -fopenmp -D__OMP_NEST_1_LOCK__  omp_nest_lock.c
$./a.out
__OMP_NEST_1_LOCK__
section-0-0 :  0      
section-0-0 :  2
section-0-0 :  1
section-0-1 :  1
section-0-1 :  0
section-0-0 :  3
section-0-1 :  3
section-0-1 :  2
section-1-0 :  3
section-1-1 :  1
section-1-1 :  0
section-1-1 :  2
section-1-1 :  3
section-1-0 :  2
section-1-0 :  1
section-1-0 :  0

section-x-yのレベル1でのスレッドIDであるxが連続して出力されることがわかる。yはバラバラ。この例では高確率でsection-0-yが先に来るが稀にsection-1-yが先に並ぶ。__OMP_NEST_2_LOCK__フラグのみオンにして実行すると

$ gcc -fopenmp -D__OMP_NEST_2_LOCK__  omp_nest_lock.c
$./a.out 
__OMP_NEST_2_LOCK__
section-0-0 :  3
section-0-0 :  1
section-0-0 :  0
section-1-0 :  3
section-0-0 :  2
section-1-0 :  0
section-1-0 :  2
section-1-0 :  1
section-1-1 :  1
section-1-1 :  0
section-1-1 :  3
section-1-1 :  2
section-0-1 :  2
section-0-1 :  3
section-0-1 :  0
section-0-1 :  1

section-x-yのレベル2でのスレッドIDであるyが連続して出力されることがわかる。xはバラバラ。この例では高確率でsection-x-0が先に来るが稀にsection-x-1が先に並ぶ。

__OMP_NEST_1_LOCK__ , __OMP_NEST_2_LOCK__フラグの両方をオンにして実行する。まずはset_sections_nest2_A()のコメントアウトを外して実行するとデッドロックを起こす。*omp_lock変数がポインタ変数であり、parallel領域を呼び出す際にprivate変数にできない(入れ子のparallel領域でも外側と同じlockを共有してしまう)のが原因。外側でlockを保持したまま生成された内側のスレッドはlockの所有者として扱われないため待ち状態になり、所有者である外側のスレッドも内側のparallel領域の終了を待ち続けるのでlockを解除できない。set_sections_nest2_B()のように同一階層のparallel領域で入れ子的なlockを取るとデッドロックを起こさないで動作するが、この例の場合は__OMP_NEST_2_LOCK__のlockをわざわざ使う必要はない(__OMP_NEST_1_LOCK__のロックでスレッドが占有されているため)。つまりomp_nest_lockは同一階層のparallel領域内で入れ子的に使うことはできるが、入れ子階層のparallel領域で入れ子的に使うことは想定されていない。

入れ子階層のparallel領域で入れ子的に使うには上位の階層で使っていたものと別のlock変数を並列領域内で新たに定義してあげると使用可能(set_sections_nest2_C())。この場合、同一階層で入れ子的にlockを使うわけではないのでomp_nest_lockの必要はなく通常のomp_lockでよい。サンプルコードはomp_nest_lockを使うようになっているがomp_lockに書き換えても正常動作する。

$ gcc -fopenmp -D__OMP_NEST_1_LOCK__ -D__OMP_NEST_2_LOCK__ omp_nest_lock.c
$ ./a.out                                                                  
__OMP_NEST_1_LOCK__
__OMP_NEST_2_LOCK__
section-0-0 :  0
section-0-0 :  3
section-0-0 :  1
section-0-0 :  2
section-0-1 :  2
section-0-1 :  0
section-0-1 :  1
section-0-1 :  3
section-1-0 :  0
section-1-0 :  2
section-1-0 :  3
section-1-0 :  1
section-1-1 :  1
section-1-1 :  0
section-1-1 :  2
section-1-1 :  3

section-x-yのxとyがともに連続して出力されることがわかる。この例では高確率で0-0,0-1,1-0,1-1の順番で並ぶが、nを大きくしたりすればそれ以外の順番になることもある。その場合でもxとyはともに連続して出力される。

まとめ

omp_lockとomp_nest_lockの使い方を紹介した。
omp_lockは比較的簡単。
omp_nest_lockは同一階層のparallel領域内で入れ子的に使えるが階層的parallel領域では使えない。
同一階層のparallel領域内で入れ子的にlock機構を使ったとしてもすでに外側のlockが働くので内側のlockがどういうときに使えるかがいまいちわからなかった。
階層的parallel領域では新たなomp_lockを使うと階層的なlockが実現可能。
OpenMPでlockを使うようなコードは極力書かないでいいようにし、(暗黙の)barrier同期、atomicやcriticalで置き換え可能ならそちらを使う。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up