More than 3 years have passed since last update.

PostgreSQLのrow-level lockの動きについて

Last updated at 2021-01-05 / Posted at 2021-01-05

この記事では、前回の記事PostgreSQLのrow-level lockの概要に引き続き、動きがわかりづらいPostgreSQLのrow-level lockについての解説を行います。
大まかな概要を掴むための記事なのですべての動作を網羅しているわけではないですし、一部解説に誤りが含まれていたり、正確ではない可能性があるのでご注意ください。

なお、クエリの実行例などはすべてPostgreSQL 13上での動作結果となります。
また、次の二つの拡張機能(pgrowlocks, pageinspect)を利用しています。

ロックに利用される領域について

row-level lockはタプルヘッダのt_infomask, t_infomask2領域にて管理されます。

htup_details.h

struct HeapTupleHeaderData
{
	union
	{
		HeapTupleFields t_heap;
		DatumTupleFields t_datum;
	}			t_choice;

	ItemPointerData t_ctid;		/* current TID of this or newer tuple (or a
								 * speculative insertion token) */

	/* Fields below here must match MinimalTupleData! */

# define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
	uint16		t_infomask2;	/* number of attributes + various flags */

# define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
	uint16		t_infomask;		/* various flag bits, see below */

t_infomask, t_infomask2に設定される情報は次の通りです。

htup_details.h

/*
 * information stored in t_infomask:
 */
# define HEAP_HASNULL			0x0001	/* has null attribute(s) */
# define HEAP_HASVARWIDTH		0x0002	/* has variable-width attribute(s) */
# define HEAP_HASEXTERNAL		0x0004	/* has external stored attribute(s) */
# define HEAP_HASOID_OLD			0x0008	/* has an object-id field */
# define HEAP_XMAX_KEYSHR_LOCK	0x0010	/* xmax is a key-shared locker */
# define HEAP_COMBOCID			0x0020	/* t_cid is a combo cid */
# define HEAP_XMAX_EXCL_LOCK		0x0040	/* xmax is exclusive locker */
# define HEAP_XMAX_LOCK_ONLY		0x0080	/* xmax, if valid, is only a locker */

 /* xmax is a shared locker */
# define HEAP_XMAX_SHR_LOCK	(HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_KEYSHR_LOCK)

# define HEAP_LOCK_MASK	(HEAP_XMAX_SHR_LOCK | HEAP_XMAX_EXCL_LOCK | \
						 HEAP_XMAX_KEYSHR_LOCK)
# define HEAP_XMIN_COMMITTED		0x0100	/* t_xmin committed */
# define HEAP_XMIN_INVALID		0x0200	/* t_xmin invalid/aborted */
# define HEAP_XMIN_FROZEN		(HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID)
# define HEAP_XMAX_COMMITTED		0x0400	/* t_xmax committed */
# define HEAP_XMAX_INVALID		0x0800	/* t_xmax invalid/aborted */
# define HEAP_XMAX_IS_MULTI		0x1000	/* t_xmax is a MultiXactId */
# define HEAP_UPDATED			0x2000	/* this is UPDATEd version of row */
# define HEAP_MOVED_OFF			0x4000	/* moved to another place by pre-9.0
										 * VACUUM FULL; kept for binary
										 * upgrade support */
# define HEAP_MOVED_IN			0x8000	/* moved from another place by pre-9.0
										 * VACUUM FULL; kept for binary
										 * upgrade support */
# define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN)

# define HEAP_XACT_MASK			0xFFF0	/* visibility-related bits */
...
/*
 * information stored in t_infomask2:
 */
# define HEAP_NATTS_MASK			0x07FF	/* 11 bits for number of attributes */
/* bits 0x1800 are available */
# define HEAP_KEYS_UPDATED		0x2000	/* tuple was updated and key cols
										 * modified, or tuple deleted */
# define HEAP_HOT_UPDATED		0x4000	/* tuple was HOT-updated */
# define HEAP_ONLY_TUPLE			0x8000	/* this is heap-only tuple */

# define HEAP2_XACT_MASK			0xE000	/* visibility-related bits */

t_infomask, t_infomask2の情報を確認してみる

pgrowlocks拡張機能にて前述のt_infomask, t_infomask2の情報を確認可能です。
例えば次のようなトランザクションを実行中の時に

postgres=# BEGIN;
BEGIN
postgres=*# UPDATE HOGE SET id = 2;
UPDATE 1

pgrowlocks関数で別のセッションから行ロックの取得状況を確認すると次の通りとなります。

postgres=# SELECT * FROM pgrowlocks('hoge');
 locked_row | locker | multi | xids  |  modes   | pids
------------+--------+-------+-------+----------+------
 (0,1)      |    611 | f     | {611} | {Update} | {36}
(1 row)

pageinspectの様子は次の通りです。

postgres=# SELECT * FROM heap_page_items(get_raw_page('hoge', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |   t_data
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+------------
  1 |   8160 |        1 |     28 |    620 |    621 |        0 | (0,2)  |        8193 |        256 |     24 |        |       | \x01000000
  2 |   8128 |        1 |     28 |    621 |      0 |        0 | (0,2)  |           1 |      10240 |     24 |        |       | \x02000000
(2 rows)

mode=Updateはpgrowlocksのソースコードの次の箇所で判定していることがわかります。
また、判定の際にはt_infomask2にHEAP_KEYS_UPDATEDのフラグが立っているかどうかで確認していることもわかります。

pgrowlocks.c

Datum
pgrowlocks(PG_FUNCTION_ARGS)
{
...
				else
				{
					if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED)
						snprintf(values[Atnum_modes], NCHARS, "{Update}");

同様にSHARE MODEのロックを取得する場合、

postgres=# BEGIN;
BEGIN
postgres=*# SELECT * FROM HOGE FOR SHARE;
 id
----
  1
(1 row)

pgrowlocksでも次のように確認できます。


postgres=# SELECT * FROM pgrowlocks('hoge');
 locked_row | locker | multi | xids  |     modes     |  pids
------------+--------+-------+-------+---------------+--------
 (0,1)      |    617 | f     | {617} | {"For Share"} | {5617}
(1 row)

pgrowlocksのソースより次の部分が該当します。今度はt_infomaskの方で判定していることがわかります。

pgrowlocks.c

Datum
pgrowlocks(PG_FUNCTION_ARGS)
{
...
	if (infomask & HEAP_XMAX_LOCK_ONLY)
				{
					if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
						snprintf(values[Atnum_modes], NCHARS, "{For Share}");

htup_details.h

/*
 * Use these to test whether a particular lock is applied to a tuple
 */
# define HEAP_XMAX_IS_SHR_LOCKED(infomask) \
	(((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_SHR_LOCK)

non-blocking readとblocking read/writeの違いについて

初めに非常に重要な点として、SELECT (without FOR UPDATE/SHARE)ではロック待ちが発生しません。
この振る舞いによりnon-blocking readによる高い読み取りスケーラビリティと、アプリとしての整合性、更新順序を両立しています。
以下、簡単な図解となります。今回のrow-level lockが発生するのは右側のTwo-Phase Lockingを前提とした、blocking readの世界の話となります。

実際に実行計画上もnon-blocking read(SELECT)とblocking read(SELECT FOR UPDATE)は異なり、LockRowsというプランノードで入れ子になっていることがわかります。

// non-blocking read
postgres=# EXPLAIN SELECT * FROM hoge ;
                       QUERY PLAN
---------------------------------------------------------
 Seq Scan on hoge  (cost=0.00..144.99 rows=9999 width=4)
(1 row)

// blocking read
postgres=# EXPLAIN SELECT * FROM hoge  FOR UPDATE;
                           QUERY PLAN
----------------------------------------------------------------
 LockRows  (cost=0.00..244.98 rows=9999 width=10)
   ->  Seq Scan on hoge  (cost=0.00..144.99 rows=9999 width=10)
(2 rows)

タプル単位のロックを取得するtable_tuple_lock関数はExecLockRows(LockRowsノードの実体)から実行されていることがわかります。
一方でExecSeqScan(Seq Scanノードの実体)では実行されていない(row-level lockは発生しない)ことも確認できます。

nodeLockRows.c

/* ----------------------------------------------------------------
 *		ExecLockRows
 * ----------------------------------------------------------------
 */
static TupleTableSlot *			/* return: a tuple or NULL */
ExecLockRows(PlanState *pstate)
...
	/*
	 * Get next tuple from subplan, if any.
	 */
lnext:
	slot = ExecProcNode(outerPlan);
...
		test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot,
								markSlot, estate->es_output_cid,
								lockmode, erm->waitPolicy,
								lockflags,
								&tmfd);

なお、例外としてはNOWAIT(待たずにエラーとする)、SKIP LOCKED(ロックされている行をスルーする)が存在します。
https://www.postgresql.org/docs/current/sql-select.html#SQL-FOR-UPDATE-SHARE

To prevent the operation from waiting for other transactions to commit, use either the NOWAIT or SKIP LOCKED option. With NOWAIT, the statement reports an error, rather than waiting, if a selected row cannot be locked immediately. With SKIP LOCKED, any selected rows that cannot be immediately locked are skipped.

// セッション1
BEGIN;
SELECT * FROM hoge FOR UPDATE;

// セッション2
postgres=*# SELECT * FROM hoge  FOR UPDATE NOWAIT;
2020-12-29 02:00:14.737 JST [28639] ERROR:  could not obtain lock on row in relation "hoge"
2020-12-29 02:00:14.737 JST [28639] STATEMENT:  SELECT * FROM hoge  FOR UPDATE NOWAIT;
ERROR:  could not obtain lock on row in relation "hoge"

postgres=# SELECT * FROM hoge FOR UPDATE SKIP LOCKED;
 id
----
(0 rows)

各row lock modeの意味を確認してみる

公式ドキュメントではrow lockのモードは次の4つが定義されています。

Row-Level Lock Modes
FOR UPDATE
FOR UPDATE causes the rows retrieved by the SELECT statement to be locked as though for update. This prevents them from being locked, modified or deleted by other transactions until the current transaction ends. That is, other transactions that attempt UPDATE, DELETE, SELECT FOR UPDATE, SELECT FOR NO KEY UPDATE, SELECT FOR SHARE or SELECT FOR KEY SHARE of these rows will be blocked until the current transaction ends; conversely, SELECT FOR UPDATE will wait for a concurrent transaction that has run any of those commands on the same row, and will then lock and return the updated row (or no row, if the row was deleted). Within a REPEATABLE READ or SERIALIZABLE transaction, however, an error will be thrown if a row to be locked has changed since the transaction started. For further discussion see Section 13.4.

The FOR UPDATE lock mode is also acquired by any DELETE on a row, and also by an UPDATE that modifies the values of certain columns. Currently, the set of columns considered for the UPDATE case are those that have a unique index on them that can be used in a foreign key (so partial indexes and expressional indexes are not considered), but this may change in the future.

FOR NO KEY UPDATE
Behaves similarly to FOR UPDATE, except that the lock acquired is weaker: this lock will not block SELECT FOR KEY SHARE commands that attempt to acquire a lock on the same rows. This lock mode is also acquired by any UPDATE that does not acquire a FOR UPDATE lock.

FOR SHARE
Behaves similarly to FOR NO KEY UPDATE, except that it acquires a shared lock rather than exclusive lock on each retrieved row. A shared lock blocks other transactions from performing UPDATE, DELETE, SELECT FOR UPDATE or SELECT FOR NO KEY UPDATE on these rows, but it does not prevent them from performing SELECT FOR SHARE or SELECT FOR KEY SHARE.

FOR KEY SHARE
Behaves similarly to FOR SHARE, except that the lock is weaker: SELECT FOR UPDATE is blocked, but not SELECT FOR NO KEY UPDATE. A key-shared lock blocks other transactions from performing DELETE or any UPDATE that changes the key values, but not other UPDATE, and neither does it prevent SELECT FOR NO KEY UPDATE, SELECT FOR SHARE, or SELECT FOR KEY SHARE.

row-level lockの詳細な振る舞いは上記のドキュメントの通りですが、より大雑把にかみ砕いて説明すると以下の通りです。

ロックモード	ブロックするロックモード	取得される条件
FOR UPDATE	全てブロック(non-blocking readは可能)	SELECT FOR UPDATE文、DELETE文、unique keyを更新するUPDATE文
FOR NO KEY UPDATE	SELECT FOR KEY SHARE以外はブロック	SELECT FOR NO KEY UPDATE, unique keyを更新しないUPDATE文
FOR SHARE	SELECT FOR SHARE, FOR KEY SHARE以外はブロック	SELECT FOR SHARE
FOR KEY SHARE	FOR NO KEY UPDATE, FOR SHARE, FOR KEY SHARE以外はブロック(FOR UPDATE以外は許可)	SELECT FOR KEY SHARE

ロックの競合が発生した際の振る舞いについて

前回の記事でも概要には触れていますが、ロックの競合が発生した場合の振る舞いを見ていきます。

// テーブルの準備(事前に実行)
postgres=# CREATE TABLE HOGE(id int);
CREATE TABLE

postgres=# INSERT INTO HOGE VALUES(1);
INSERT 0 1

postgres=# SELECT *, decode_infomask(t_infomask) as infomask_decoded, decode_infomask2(t_infomask2) as infomask2_decoded FROM heap_page_items(get_raw_page('hoge', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |   t_data   |  infomask_decoded
| infomask2_decoded
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+------------+--------------------
+-------------------
  1 |   8160 |        1 |     28 |    697 |      0 |        0 | (0,1)  |           1 |       2048 |     24 |        |       | \x01000000 |  HEAP_XMAX_INVALID
|
(1 row)


// セッション1
postgres=# BEGIN;
BEGIN
postgres=*# SELECT pg_backend_pid(),pg_current_xact_id();
 pg_backend_pid | pg_current_xact_id
----------------+--------------------
           5617 |                683
(1 row)

postgres=*# SELECT * FROM HOGE FOR UPDATE;
 id
----
  1
(1 row)

// セッション2
postgres=# BEGIN;
BEGIN
postgres=*# SELECT pg_backend_pid(),pg_current_xact_id();
 pg_backend_pid | pg_current_xact_id
----------------+--------------------
           9675 |                684
(1 row)

postgres=*# SELECT * FROM hoge FOR UPDATE;

// セッション3
postgres=# SELECT * FROM pgrowlocks('hoge');
 locked_row | locker | multi | xids  |     modes      |  pids
------------+--------+-------+-------+----------------+--------
 (0,1)      |    683 | f     | {683} | {"For Update"} | {5617}
(1 row)

postgres=# SELECT *, decode_infomask(t_infomask) as infomask_decoded, decode_infomask2(t_infomask2) as infomask2_decoded FROM heap_page_items(get_raw_page('ho
ge', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |   t_data   |
   infomask_decoded                       | infomask2_decoded
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+------------+--------------------
------------------------------------------+--------------------
  1 |   8160 |        1 |     28 |    682 |    683 |        0 | (0,1)  |        8193 |        448 |     24 |        |       | \x01000000 |  HEAP_XMAX_LOCK_ONL
Y HEAP_XMAX_EXCL_LOCK HEAP_XMIN_COMMITTED |  HEAP_KEYS_UPDATED
(1 row)

postgres=# SELECT * FROM pg_stat_activity WHERE query <> '' AND pid <> pg_backend_pid();
 datid | datname  | pid  | leader_pid | usesysid | usename  | application_name | client_addr | client_hostname | client_port |         backend_start         |
          xact_start           |          query_start          |         state_change          | wait_event_type |  wait_event   |        state        | backe
nd_xid | backend_xmin |             query              |  backend_type
-------+----------+------+------------+----------+----------+------------------+-------------+-----------------+-------------+-------------------------------+
-------------------------------+-------------------------------+-------------------------------+-----------------+---------------+---------------------+------
-------+--------------+--------------------------------+----------------
 12664 | postgres | 5617 |            |       10 | postgres | psql             |             |                 |          -1 | 2020-12-26 23:33:46.503602+09 |
 2020-12-27 20:58:17.25196+09  | 2020-12-27 20:58:29.862251+09 | 2020-12-27 20:58:29.862969+09 | Client          | ClientRead    | idle in transaction |
   683 |              | SELECT * FROM HOGE FOR UPDATE; | client backend
 12664 | postgres | 9675 |            |       10 | postgres | psql             |             |                 |          -1 | 2020-12-27 20:56:48.020057+09 |
 2020-12-27 20:58:59.033257+09 | 2020-12-27 20:59:05.388101+09 | 2020-12-27 20:59:05.388105+09 | Lock            | transactionid | active              |
   684 |          683 | SELECT * FROM hoge FOR UPDATE; | client backend
(2 rows)

postgres=# SELECT (SELECT relname FROM pg_class c WHERE c.oid = relation), * FROM pg_locks WHERE pid <> pg_backend_pid() ORDER BY pid, 1, 2;
 relname |   locktype    | database | relation | page | tuple | virtualxid | transactionid | classid | objid | objsubid | virtualtransaction | pid  |        m
ode         | granted | fastpath
---------+---------------+----------+----------+------+-------+------------+---------------+---------+-------+----------+--------------------+------+---------
------------+---------+----------
 hoge    | relation      |    12664 |    41107 |      |       |            |               |         |       |          | 3/82               | 5617 | RowShare
Lock        | t       | t
         | transactionid |          |          |      |       |            |           683 |         |       |          | 3/82               | 5617 | Exclusiv
eLock       | t       | f
         | virtualxid    |          |          |      |       | 3/82       |               |         |       |          | 3/82               | 5617 | Exclusiv
eLock       | t       | t
 hoge    | relation      |    12664 |    41107 |      |       |            |               |         |       |          | 4/234              | 9675 | RowShare
Lock        | t       | t
 hoge    | tuple         |    12664 |    41107 |    0 |     1 |            |               |         |       |          | 4/234              | 9675 | AccessEx
clusiveLock | t       | f
         | transactionid |          |          |      |       |            |           683 |         |       |          | 4/234              | 9675 | ShareLoc
k           | f       | f
         | transactionid |          |          |      |       |            |           684 |         |       |          | 4/234              | 9675 | Exclusiv
eLock       | t       | f
         | virtualxid    |          |          |      |       | 4/234      |               |         |       |          | 4/234              | 9675 | Exclusiv
eLock       | t       | t
(8 rows)

上記の例では非常にシンプルなSELECT FOR UPDATE文を二つのセッションで実行し、ロック待ちが発生してからpgrowlocks, heap_page_items, pg_stat_activity, pg_locksの各情報を確認したものです。
decode_infomask, decode_infomask2関数に関しては自分で用意しました。
上記のクエリの結果から以下のことを確認できます。

pgrowlocks
- xid=683, pid=5617のセッションにより、ctid 0,1(0ブロック目の1行目)の行にてFOR UPDATEのrow-level lockが取得されている
decode_infomask, decode_infomask2関数の結果より、次のフラグが立っている
- HEAP_XMAX_LOCK_ONLY HEAP_XMAX_EXCL_LOCK HEAP_XMIN_COMMITTED
- HEAP_KEYS_UPDATED

また、上記の観測結果はpgrowlocks関数の以下の部分とも合致していることがわかります。
条件がHEAP_XMAX_LOCK_ONLYかつHEAP_XMAX_EXCL_LOCKかつHEAP_KEYS_UPDATEDですね。

pgrowlocks.c

Datum
pgrowlocks(PG_FUNCTION_ARGS)
{
...
				if (infomask & HEAP_XMAX_LOCK_ONLY)
				{
					if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
						snprintf(values[Atnum_modes], NCHARS, "{For Share}");
					else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
						snprintf(values[Atnum_modes], NCHARS, "{For Key Share}");
					else if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
					{
						if (tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED)
							snprintf(values[Atnum_modes], NCHARS, "{For Update}");
						else
...

加えて、pg_stat_activity, pg_locksの情報から次のことも確認できます。

セッション2はセッション1のtransaction idの共有ロック取得待ち
- 前回の記事の解析結果より、row-level lockの競合が発生した際の期待される動作

以下、前回の記事で説明したrow-level lockとtable-level lockの関係性の図解です。
row-level lockが発生するとtable-level lock(SQL object lock)の機能を用いて、他のトランザクションのCOMMIT/ROLLBACKを待合せます。
row-level lockではCOMMIT/ROLLBACK時に自動的にリソースを解放して次のwaiterを起こす機能は提供されていないので、必要に応じてtable-level lockの機能を呼び出します。
(公式ドキュメントではtable-level lockという呼称ですが、実際には行のロックの取得待ちにも用いられるため、SQL object lockと呼ぶのが実態に即しています)

次にgdbでセッション1、セッション2の動きを大雑把に追ってみます。

セッション1の動き

gdbで取得したバックトレースの一部は次の通りです。

...
# 10 0x000056007133b5bf in heap_lock_tuple (relation=0x7f6b9819f4e8, tuple=0x56007356c938, cid=0, mode=LockTupleExclusive, wait_policy=LockWaitBlock,
    follow_updates=true, buffer=0x7ffff88a28f4, tmfd=0x7ffff88a2aa0) at heapam.c:4380
# 11 0x00005600713445d1 in heapam_tuple_lock (relation=0x7f6b9819f4e8, tid=0x7ffff88a2a3a, snapshot=0x56007356cf58, slot=0x56007356c8e8, cid=0,
    mode=LockTupleExclusive, wait_policy=LockWaitBlock, flags=3 '\003', tmfd=0x7ffff88a2aa0) at heapam_handler.c:365
# 12 0x000056007158a46a in table_tuple_lock (rel=0x7f6b9819f4e8, tid=0x7ffff88a2a3a, snapshot=0x56007356cf58, slot=0x56007356c8e8, cid=0,
    mode=LockTupleExclusive, wait_policy=LockWaitBlock, flags=3 '\003', tmfd=0x7ffff88a2aa0) at ../../../src/include/access/tableam.h:1336
# 13 0x000056007158a8fa in ExecLockRows (pstate=0x56007356b1d8) at nodeLockRows.c:182
...

上記の重要な処理を大雑把なシーケンス図にしたものが以下のとおりです。
細部は相当省略していますが、大まかにタプルの取得および必要な情報の更新の流れを図式化しています。

この行ロックの取得処理により、以下のようにxmaxおよびinfomask, infomask2の項目が変化します。
バッファ(ページ)の更新期間中は排他ロックが取得されているため、不整合や競合が発生することなく更新可能です。

// before
postgres=# SELECT *, decode_infomask(t_infomask) as infomask_decoded, decode_infomask2(t_infomask2) as infomask2_decoded FROM heap_page_items(get_raw_page('hoge', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |   t_data   |  infomask_decoded
| infomask2_decoded
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+------------+--------------------
+-------------------
  1 |   8160 |        1 |     28 |    697 |      0 |        0 | (0,1)  |           1 |       2048 |     24 |        |       | \x01000000 |  HEAP_XMAX_INVALID
|
(1 row)

// after
postgres=# SELECT *, decode_infomask(t_infomask) as infomask_decoded, decode_infomask2(t_infomask2) as infomask2_decoded FROM heap_page_items(get_raw_page('ho
ge', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |   t_data   |
   infomask_decoded                       | infomask2_decoded
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+------------+--------------------
------------------------------------------+--------------------
  1 |   8160 |        1 |     28 |    682 |    683 |        0 | (0,1)  |        8193 |        448 |     24 |        |       | \x01000000 |  HEAP_XMAX_LOCK_ONL
Y HEAP_XMAX_EXCL_LOCK HEAP_XMIN_COMMITTED |  HEAP_KEYS_UPDATED
(1 row)

セッション2の動き

セッション2は途中までは同じシーケンスですが、すでにセッション1が当該行をロックしているためHeapTupleSatisfiesUpdateによる判定以降の動作が異なります。

HeapTupleSatisfiesUpdateにて、セッション1が当該行のxmaxの値を更新しており、かつxmaxが指し示すトランザクションが実行中であることからTM_BeingModifiedと判定されます。
このように競合を検知した場合には以下の処理を実行します。

一旦バッファのロックを解除します
行のregular lockを取得します
更新中のトランザクションの完了を待ちます

取得されたロックの情報が前述のpg_locksの検索結果のうち、次の行に該当します。

1行目が取得済みの直列化のための行のロック
2行目が他のトランザクションの完了待ち(granted=false)


postgres=# SELECT (SELECT relname FROM pg_class c WHERE c.oid = relation), * FROM pg_locks WHERE pid <> pg_backend_pid() ORDER BY pid, 1, 2;
 relname |   locktype    | database | relation | page | tuple | virtualxid | transactionid | classid | objid | objsubid | virtualtransaction | pid  |        m
ode         | granted | fastpath
---------+---------------+----------+----------+------+-------+------------+---------------+---------+-------+----------+--------------------+------+---------
------------+---------+----------
 ...
 hoge    | tuple         |    12664 |    41107 |    0 |     1 |            |               |         |       |          | 4/234              | 9675 | AccessEx
clusiveLock | t       | f
         | transactionid |          |          |      |       |            |           683 |         |       |          | 4/234              | 9675 | ShareLoc
k           | f       | f
...
(8 rows)

以下、上記の動きを図式化したものとなります。

この振る舞いはREADMEでは次のように記述されています。
やはりテーブルロックのみではロックの取得順序を強制できないことが問題とされていますね。

To provide more reliable semantics about who gets a tuple-level lock first, we use the
standard lock manager, which implements the second level mentioned above.  The
protocol for waiting for a tuple-level lock is really

     LockTuple()
     XactLockTableWait()
     mark tuple as locked by me
     UnlockTuple()

When there are multiple waiters, arbitration of who is to get the lock next
is provided by LockTuple().

MultiXactについて

前述のロックの競合が発生した際の振る舞いで扱いきれていない、「同一の行を複数のトランザクションから共有ロックを取得した場合」の仕組みについて解説します。

MultiXactのモチベーション

今まで見てきた通り、単一のトランザクションが排他ロックを取得する場合にはxmaxの値で識別可能です。
しかしながら共有ロック等の複数トランザクションが単一の行をロックする場合には単一xmaxでは複数のロック情報を格納できないことは明白です。
理屈の上では行単位のロックもすべてregular lockで賄うことは可能でしょうが、メモリを無尽蔵に消費してしまうので多数の行のロックを扱うには現実的ではないです。
そこでPostgreSQLは二段階の行ロックの仕組みを導入しています。
以下、READMEの引用となります。

Locking tuples is not as easy as locking tables or other database objects.
The problem is that transactions might want to lock large numbers of tuples at
any one time, so it's not possible to keep the locks objects in shared memory.
To work around this limitation, we use a two-level mechanism.  The first level
is implemented by storing locking information in the tuple header: a tuple is
marked as locked by setting the current transaction's XID as its XMAX, and
setting additional infomask bits to distinguish this case from the more normal
case of having deleted the tuple.  When multiple transactions concurrently
lock a tuple, a MultiXact is used; see below.  This mechanism can accommodate
arbitrarily large numbers of tuples being locked simultaneously.

一段目が今まで見てきたタプルヘッダ上の情報(xmaxやinfomask, infomask2)を利用したもので、可変長の行ロックを扱うために二段階目のMultiXactという仕組みが提供されています。

実際に共有行ロックを取得してみる

実際に複数のトランザクションから同一行の共有ロックを取得します。

// セッション1
postgres=# SELECT * FROM pg_backend_pid();
 pg_backend_pid
----------------
           5961
(1 row)

postgres=# BEGIN;
BEGIN

postgres=*# SELECT * FROM hoge FOR SHARE;
 id
----
  1
(1 row)

// セッション2
postgres=# SELECT * FROM pg_backend_pid();
 pg_backend_pid
----------------
           5941
(1 row)

postgres=# BEGIN;
BEGIN
postgres=*# SELECT * FROM hoge FOR SHARE;
 id
----
  1
(1 row)

// セッション3
postgres=# SELECT *, decode_infomask(t_infomask) as infomask_decoded, decode_infomask2(t_infomask2) as infomask2_decoded FROM heap_page_items(get_raw_page('hoge', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |   t_data   |
            infomask_decoded                                | infomask2_decoded
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+------------+--------------------
------------------------------------------------------------+-------------------
  1 |   8160 |        1 |     28 |    720 |      6 |        0 | (0,1)  |           1 |       4560 |     24 |        |       | \x01000000 |  HEAP_XMAX_LOCK_ONL
Y HEAP_XMAX_SHR_LOCK HEAP_XMIN_COMMITTED HEAP_XMAX_IS_MULTI |
(1 row)

postgres=# SELECT * FROM pgrowlocks('hoge');
 locked_row | locker | multi |   xids    |     modes     |    pids
------------+--------+-------+-----------+---------------+-------------
 (0,1)      |      6 | t     | {724,725} | {Share,Share} | {5941,5961}
(1 row)

postgres=# SELECT (SELECT relname FROM pg_class c WHERE c.oid = relation), * FROM pg_locks WHERE pid <> pg_backend_pid() ORDER BY pid, 1, 2;
   relname   |   locktype    | database | relation | page | tuple | virtualxid | transactionid | classid | objid | objsubid | virtualtransaction | pid  |
mode      | granted | fastpath
-------------+---------------+----------+----------+------+-------+------------+---------------+---------+-------+----------+--------------------+------+-----
----------+---------+----------
 hoge        | relation      |    12664 |    41107 |      |       |            |               |         |       |          | 3/4                | 5941 | RowS
hareLock  | t       | t
             | transactionid |          |          |      |       |            |           724 |         |       |          | 3/4                | 5941 | Excl
usiveLock | t       | f
             | virtualxid    |          |          |      |       | 3/4        |               |         |       |          | 3/4                | 5941 | Excl
usiveLock | t       | t
 hoge        | relation      |    12664 |    41107 |      |       |            |               |         |       |          | 4/5                | 5961 | RowS
hareLock  | t       | t
             | transactionid |          |          |      |       |            |           725 |         |       |          | 4/5                | 5961 | Excl
usiveLock | t       | f
             | virtualxid    |          |          |      |       | 4/5        |               |         |       |          | 4/5                | 5961 | Excl
usiveLock | t       | t
(8 rows)

上記の検証結果から以下のことがわかります。

heap_page_items
- xmaxの値が明らかにトランザクションIDとは異なる値である。
- HEAP_XMAX_SHR_LOCKフラグにより共有ロックを取得済み。
- HEAP_XMAX_IS_MULTIフラグによりMultiXactを利用中。
pgrowlocks
- ctid(0,1)の行をpid{5941,5961}のxid{724,725}のトランザクションで共有ロック(Share)を取得済み。
- multi=tより、MultiXactが利用済み。
pg_locks
- pid{5941,5961}の各トランザクションがそれぞれhogeテーブルのRowShareLockを取得済み。
- トランザクション開始時に必ず取得されるtransactionidのExclusiveLock以外の排他ロックは未取得。

セッション2の動きを追ってみる

セッション1の動きについては取得するロックが異なる以外はほぼ同等なので、ここではセッション2の動きのみを見ていきます。

以前の処理シーケンスとそっくりですが、以下の点が明確に異なることがわかります。

トランザクションの完了待ちを行わない
MultiXactと呼ばれるデータ構造を作成し、xmaxにMultiXactIdを指定

MultiXactIdについては公式ドキュメントでも言及されています。
トランザクションIDと同じ32bitの数値データであり、トランザクションIDとは独立して管理されます。
また、データはpg_multixactディレクトリに保存されます。

Multixact IDs are used to support row locking by multiple transactions. Since there is only limited space in a tuple header to store lock information, that information is encoded as a “multiple transaction ID”, or multixact ID for short, whenever there is more than one transaction concurrently locking a row. Information about which transaction IDs are included in any particular multixact ID is stored separately in the pg_multixact subdirectory, and only the multixact ID appears in the xmax field in the tuple header. Like transaction IDs, multixact IDs are implemented as a 32-bit counter and corresponding storage, all of which requires careful aging management, storage cleanup, and wraparound handling. There is a separate storage area which holds the list of members in each multixact, which also uses a 32-bit counter and which must also be managed.

ここで共有ロックを取得した状態でのxmaxの値を再度確認します。
t_xmax=6と明らかにトランザクションIDとは異なる数値が指定されていることがわかります。
この謎の数値がMultiXactIdとなります。

// セッション3
postgres=# SELECT *, decode_infomask(t_infomask) as infomask_decoded, decode_infomask2(t_infomask2) as infomask2_decoded FROM heap_page_items(get_raw_page('hoge', 0));
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |   t_data   |
            infomask_decoded                                | infomask2_decoded
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+------------+--------------------
------------------------------------------------------------+-------------------
  1 |   8160 |        1 |     28 |    720 |      6 |        0 | (0,1)  |           1 |       4560 |     24 |        |       | \x01000000 |  HEAP_XMAX_LOCK_ONL
Y HEAP_XMAX_SHR_LOCK HEAP_XMIN_COMMITTED HEAP_XMAX_IS_MULTI |
(1 row)

MultiXactを図式化したものは以下の通りです。
MultiXactはpg_multixactディレクトリに保存され、一定量のエントリーがキャッシュされます。
保存先がストレージであり、メモリ上に保持するのは一定量のキャッシュのみであるため、行ロックをストレージの容量が許す範囲で制限なしで保持可能となります。
※かなり端折っています

最後に

極々簡単ではありますが、PostgreSQLのrow-level lockの動きを見ていきました。
次回はさらに実装に踏み込んでいく予定です。
最後に以下は記事上の重要なキーワードのまとめです。

行ロックの情報の保持方法
- xmax, infomask, infomask2, MultiXact
non-blocking readとblocking readの違い
row lock modeの意味
- FOR UPDATE, FOR NO KEY UPDATE, FOR SHARE, FOR KEY SHARE
pgrowlocks, pageinspect拡張を利用した行ロックの取得状況のチェック
row-level lockとregular lockの関係性について

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up