2
2

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 5 years have passed since last update.

ZEAM開発ログ v.0.4.2 型多相かつ型安全なNIFのコードを分岐予測の観点で最適化する

Last updated at Posted at 2018-09-16

はじめに

ZACKY こと山崎進です。

前回「ZEAM開発ログ v.0.4.1 型多相かつ型安全なNIFの LLVM IR コードを読み解く」で,現状のコードでは分岐予測の最適化が不十分であることがわかりましたので,今回は分岐予測の最適化をしてみたいと思います。

「ZEAM開発ログ 目次」はこちら

分岐予測の最適化のために

こんなドキュメントがありました。

LLVM Branch Weight Metadata

__builtin_expectを使えばいいみたいですね。

この記事にしたがって最適化してみます。

最適化したC言語のコード

/*
 * NIF that adds its two integer arguments: argv[0] + argv[1].
 *
 * Every failure test is wrapped in __builtin_expect(cond, 0), which tells
 * the compiler that the condition is expected to be false, i.e. that the
 * error exits are the unlikely paths.
 *
 * Returns:
 *   - enif_make_int64(env, a + b) when both arguments decode and the
 *     range test passes;
 *   - arithmetic_error when enif_get_int64 returns 0 for either argument;
 *   - error_atom when a > LONG_MAX - b.
 *
 * argc is not inspected.
 *
 * NOTE(review): the range test only guards overflow in the positive
 * direction. For negative b, the expression LONG_MAX - b itself overflows
 * a signed long (undefined behaviour in C), and a sum below LONG_MIN is
 * not detected at all.
 * NOTE(review): a failed argument decode yields arithmetic_error while the
 * range test yields error_atom -- confirm this mapping is intended.
 */
static
ERL_NIF_TERM asm_1_nif_ii(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
	long a, b;	/* decoded values of argv[0] and argv[1] */
	if(__builtin_expect((enif_get_int64(env, argv[0], &a) == 0), 0)) {	/* unlikely: argv[0] did not decode */
		goto error;
	}
	if(__builtin_expect((enif_get_int64(env, argv[1], &b) == 0), 0)) {	/* unlikely: argv[1] did not decode */
		goto error;
	}
	if(__builtin_expect((a > LONG_MAX - b), 0)) {	/* unlikely: a + b would exceed LONG_MAX */
		return error_atom;
	}
	long result =  a + b;
	return enif_make_int64(env, result);
error:	/* shared exit for both decode failures */
	return arithmetic_error;
}

最適化した LLVM のコード

生成されたLLVMコード

; LLVM IR emitted for asm_1_nif_ii, including debug intrinsics.
; Parameters are unnamed: %0 is the environment pointer, %1 the i32 argument
; count, %2 the pointer to the i64 argument array.
; The enif_get_int64 / enif_make_int64 calls of the C source show up here as
; calls to @enif_get_long / @enif_make_long.
; All three conditional branches carry !prof !171 (see the metadata after the
; function body).
define internal i64 @asm_1_nif_ii(%struct.enif_environment_t*, i32, i64* nocapture readonly) #0 !dbg !152 {
  ; %4 and %5 are the stack slots passed to @enif_get_long as out-pointers.
  %4 = alloca i64, align 8
  %5 = alloca i64, align 8
  call void @llvm.dbg.value(metadata %struct.enif_environment_t* %0, metadata !154, metadata !DIExpression()), !dbg !161
  call void @llvm.dbg.value(metadata i32 %1, metadata !155, metadata !DIExpression()), !dbg !162
  call void @llvm.dbg.value(metadata i64* %2, metadata !156, metadata !DIExpression()), !dbg !163
  %6 = bitcast i64* %4 to i8*, !dbg !164
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %6) #6, !dbg !164
  %7 = bitcast i64* %5 to i8*, !dbg !164
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %7) #6, !dbg !164
  ; Decode the first array element into %4.
  %8 = load i64, i64* %2, align 8, !dbg !165, !tbaa !107
  call void @llvm.dbg.value(metadata i64* %4, metadata !157, metadata !DIExpression()), !dbg !167
  %9 = call i32 @enif_get_long(%struct.enif_environment_t* %0, i64 %8, i64* nonnull %4) #6, !dbg !168
  %10 = icmp eq i32 %9, 0, !dbg !169
  ; True target %26 is the error exit; false target %11 continues.
  br i1 %10, label %26, label %11, !dbg !170, !prof !171

; Decode the second array element into %5.
; <label>:11:                                     ; preds = %3
  %12 = getelementptr inbounds i64, i64* %2, i64 1, !dbg !172
  %13 = load i64, i64* %12, align 8, !dbg !172, !tbaa !107
  call void @llvm.dbg.value(metadata i64* %5, metadata !159, metadata !DIExpression()), !dbg !174
  %14 = call i32 @enif_get_long(%struct.enif_environment_t* %0, i64 %13, i64* nonnull %5) #6, !dbg !175
  %15 = icmp eq i32 %14, 0, !dbg !176
  br i1 %15, label %26, label %16, !dbg !177, !prof !171

; Range test: %17 (first value) > 9223372036854775807 - %18 (second value).
; <label>:16:                                     ; preds = %11
  %17 = load i64, i64* %4, align 8, !dbg !178, !tbaa !107
  call void @llvm.dbg.value(metadata i64 %17, metadata !157, metadata !DIExpression()), !dbg !167
  %18 = load i64, i64* %5, align 8, !dbg !180, !tbaa !107
  call void @llvm.dbg.value(metadata i64 %18, metadata !159, metadata !DIExpression()), !dbg !174
  %19 = sub nsw i64 9223372036854775807, %18, !dbg !181
  %20 = icmp sgt i64 %17, %19, !dbg !182
  ; True target %21 returns @error_atom; false target %23 performs the add.
  br i1 %20, label %21, label %23, !dbg !183, !prof !171

; Range test failed: result is the value loaded from @error_atom.
; <label>:21:                                     ; preds = %16
  %22 = load i64, i64* @error_atom, align 8, !dbg !184, !tbaa !107
  br label %28, !dbg !186

; Success path: add the two values and wrap the sum with @enif_make_long.
; <label>:23:                                     ; preds = %16
  %24 = add nsw i64 %18, %17, !dbg !187
  call void @llvm.dbg.value(metadata i64 %24, metadata !160, metadata !DIExpression()), !dbg !188
  %25 = call i64 @enif_make_long(%struct.enif_environment_t* %0, i64 %24) #6, !dbg !189
  br label %28, !dbg !190

; Shared exit for both decode failures: result is loaded from @arithmetic_error.
; <label>:26:                                     ; preds = %11, %3
  %27 = load i64, i64* @arithmetic_error, align 8, !dbg !191, !tbaa !107
  br label %28, !dbg !192

; Common return block: the phi selects the result of whichever path ran.
; <label>:28:                                     ; preds = %26, %23, %21
  %29 = phi i64 [ %27, %26 ], [ %22, %21 ], [ %25, %23 ]
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %7) #6, !dbg !193
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %6) #6, !dbg !193
  ret i64 %29, !dbg !193
}
; Branch weights attached to every conditional branch above: the first weight
; (1) belongs to the branch's first (true) target, the second (2000) to its
; second (false) target, so each error target is marked as rarely taken.
!171 = !{!"branch_weights", i32 1, i32 2000}

デバッグ情報を削除したLLVMコード

; LLVM IR for asm_1_nif_ii with the llvm.dbg.value and llvm.lifetime calls
; removed (the !dbg / !tbaa attachments on the remaining instructions are
; still present).
; Parameters are unnamed: %0 is the environment pointer, %1 the i32 argument
; count, %2 the pointer to the i64 argument array.
; All three conditional branches carry !prof !171 (see the metadata after the
; function body).
define internal i64 @asm_1_nif_ii(%struct.enif_environment_t*, i32, i64* nocapture readonly) #0 !dbg !152 {
  ; %4 and %5 are the stack slots passed to @enif_get_long as out-pointers.
  %4 = alloca i64, align 8
  %5 = alloca i64, align 8
  ; %6 and %7 are not referenced again in this listing.
  %6 = bitcast i64* %4 to i8*, !dbg !164
  %7 = bitcast i64* %5 to i8*, !dbg !164
  ; Decode the first array element into %4.
  %8 = load i64, i64* %2, align 8, !dbg !165, !tbaa !107
  %9 = call i32 @enif_get_long(%struct.enif_environment_t* %0, i64 %8, i64* nonnull %4) #6, !dbg !168
  %10 = icmp eq i32 %9, 0, !dbg !169
  ; True target %26 is the error exit; false target %11 continues.
  br i1 %10, label %26, label %11, !dbg !170, !prof !171

; Decode the second array element into %5.
; <label>:11:                                     ; preds = %3
  %12 = getelementptr inbounds i64, i64* %2, i64 1, !dbg !172
  %13 = load i64, i64* %12, align 8, !dbg !172, !tbaa !107
  %14 = call i32 @enif_get_long(%struct.enif_environment_t* %0, i64 %13, i64* nonnull %5) #6, !dbg !175
  %15 = icmp eq i32 %14, 0, !dbg !176
  br i1 %15, label %26, label %16, !dbg !177, !prof !171

; Range test: %17 (first value) > 9223372036854775807 - %18 (second value).
; <label>:16:                                     ; preds = %11
  %17 = load i64, i64* %4, align 8, !dbg !178, !tbaa !107
  %18 = load i64, i64* %5, align 8, !dbg !180, !tbaa !107
  %19 = sub nsw i64 9223372036854775807, %18, !dbg !181
  %20 = icmp sgt i64 %17, %19, !dbg !182
  ; True target %21 returns @error_atom; false target %23 performs the add.
  br i1 %20, label %21, label %23, !dbg !183, !prof !171

; Range test failed: result is the value loaded from @error_atom.
; <label>:21:                                     ; preds = %16
  %22 = load i64, i64* @error_atom, align 8, !dbg !184, !tbaa !107
  br label %28, !dbg !186

; Success path: add the two values and wrap the sum with @enif_make_long.
; <label>:23:                                     ; preds = %16
  %24 = add nsw i64 %18, %17, !dbg !187
  %25 = call i64 @enif_make_long(%struct.enif_environment_t* %0, i64 %24) #6, !dbg !189
  br label %28, !dbg !190

; Shared exit for both decode failures: result is loaded from @arithmetic_error.
; <label>:26:                                     ; preds = %11, %3
  %27 = load i64, i64* @arithmetic_error, align 8, !dbg !191, !tbaa !107
  br label %28, !dbg !192

; Common return block: the phi selects the result of whichever path ran.
; <label>:28:                                     ; preds = %26, %23, %21
  %29 = phi i64 [ %27, %26 ], [ %22, %21 ], [ %25, %23 ]
  ret i64 %29, !dbg !193
}
; Branch weights attached to every conditional branch above: the first weight
; (1) belongs to the branch's first (true) target, the second (2000) to its
; second (false) target, so each error target is marked as rarely taken.
!171 = !{!"branch_weights", i32 1, i32 2000}

最適化したx86_64コード

生成された x86_64 コード

	##-----------------------------------------------------------------
	## _asm_1_nif_ii -- compiler output (AT&T syntax) with debug directives.
	## In:    %rdi = env, %esi = argc, %rdx = argv (see DEBUG_VALUE notes)
	## Out:   %rax = result term
	## Regs:  %r14 keeps env and %rbx keeps argv across the calls; both are
	##        saved in the prologue and restored in the epilogue.
	## Stack: -32(%rbp) holds a, -24(%rbp) holds b (out-slots for
	##        _enif_get_long).
	## Layout: the success path is straight-line fall-through up to retq;
	##        the two error blocks LBB4_5 and LBB4_3 are placed after retq.
	## The .loc directives refer to lines of native/lib.c.
	##-----------------------------------------------------------------
	.p2align	4, 0x90         ## -- Begin function asm_1_nif_ii
_asm_1_nif_ii:                          ## @asm_1_nif_ii
Lfunc_begin4:
	.loc	3 16 0                  ## native/lib.c:16:0
	.cfi_startproc
## %bb.0:
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r14
	pushq	%rbx
	subq	$16, %rsp
	.cfi_offset %rbx, -32
	.cfi_offset %r14, -24
	##DEBUG_VALUE: asm_1_nif_ii:env <- %rdi
	##DEBUG_VALUE: asm_1_nif_ii:argc <- %esi
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rdx
	movq	%rdx, %rbx
	movq	%rdi, %r14
Ltmp18:
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rbx
	##DEBUG_VALUE: asm_1_nif_ii:env <- %r14
	## First decode: _enif_get_long(env, argv[0], &a); %rdi still holds env.
	.loc	3 18 43 prologue_end    ## native/lib.c:18:43
	movq	(%rbx), %rsi
Ltmp19:
	.loc	3 0 43 is_stmt 0        ## native/lib.c:0:43
	leaq	-32(%rbp), %rdx
Ltmp20:
	##DEBUG_VALUE: asm_1_nif_ii:a <- [DW_OP_constu 32, DW_OP_minus] [%rbp+0]
	.loc	3 18 23                 ## native/lib.c:18:23
	callq	_enif_get_long
	.loc	3 18 56                 ## native/lib.c:18:56
	testl	%eax, %eax
Ltmp21:
	.loc	3 18 5                  ## native/lib.c:18:5
	## Return value 0 -> jump out to the error block placed after retq.
	je	LBB4_5
Ltmp22:
## %bb.1:
	##DEBUG_VALUE: asm_1_nif_ii:a <- [DW_OP_constu 32, DW_OP_minus] [%rbp+0]
	##DEBUG_VALUE: asm_1_nif_ii:env <- %r14
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rbx
	## Second decode: _enif_get_long(env, argv[1], &b).
	.loc	3 21 43 is_stmt 1       ## native/lib.c:21:43
	movq	8(%rbx), %rsi
	leaq	-24(%rbp), %rdx
Ltmp23:
	##DEBUG_VALUE: asm_1_nif_ii:b <- [DW_OP_constu 24, DW_OP_minus] [%rbp+0]
	.loc	3 21 23 is_stmt 0       ## native/lib.c:21:23
	movq	%r14, %rdi
	callq	_enif_get_long
	.loc	3 21 56                 ## native/lib.c:21:56
	testl	%eax, %eax
Ltmp24:
	.loc	3 21 5                  ## native/lib.c:21:5
	je	LBB4_5
Ltmp25:
## %bb.2:
	##DEBUG_VALUE: asm_1_nif_ii:b <- [DW_OP_constu 24, DW_OP_minus] [%rbp+0]
	##DEBUG_VALUE: asm_1_nif_ii:a <- [DW_OP_constu 32, DW_OP_minus] [%rbp+0]
	##DEBUG_VALUE: asm_1_nif_ii:env <- %r14
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rbx
	## Range test: signed compare of a against 0x7FFFFFFFFFFFFFFF - b.
	.loc	3 24 23 is_stmt 1       ## native/lib.c:24:23
	movq	-32(%rbp), %rax
Ltmp26:
	##DEBUG_VALUE: asm_1_nif_ii:a <- %rax
	.loc	3 24 38 is_stmt 0       ## native/lib.c:24:38
	movq	-24(%rbp), %rsi
Ltmp27:
	##DEBUG_VALUE: asm_1_nif_ii:b <- %rsi
	.loc	3 24 36                 ## native/lib.c:24:36
	movabsq	$9223372036854775807, %rcx ## imm = 0x7FFFFFFFFFFFFFFF
	subq	%rsi, %rcx
	.loc	3 24 25                 ## native/lib.c:24:25
	cmpq	%rcx, %rax
Ltmp28:
	.loc	3 24 5                  ## native/lib.c:24:5
	jg	LBB4_3
Ltmp29:
## %bb.4:
	##DEBUG_VALUE: asm_1_nif_ii:b <- %rsi
	##DEBUG_VALUE: asm_1_nif_ii:a <- %rax
	##DEBUG_VALUE: asm_1_nif_ii:env <- %r14
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rbx
	## Success path: %rsi = a + b, then _enif_make_long(env, %rsi).
	.loc	3 27 19 is_stmt 1       ## native/lib.c:27:19
	addq	%rax, %rsi
Ltmp30:
	##DEBUG_VALUE: asm_1_nif_ii:result <- %rsi
	.loc	3 28 9                  ## native/lib.c:28:9
	movq	%r14, %rdi
	callq	_enif_make_long
Ltmp31:
	## Shared epilogue; both error blocks jump back here with %rax set.
LBB4_6:
	##DEBUG_VALUE: asm_1_nif_ii:env <- %r14
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rbx
	.loc	3 31 1                  ## native/lib.c:31:1
	addq	$16, %rsp
	popq	%rbx
Ltmp32:
	popq	%r14
Ltmp33:
	popq	%rbp
	retq
	## Error block for a failed decode: return the value of _arithmetic_error.
LBB4_5:
Ltmp34:
	##DEBUG_VALUE: asm_1_nif_ii:a <- [DW_OP_constu 32, DW_OP_minus] [%rbp+0]
	##DEBUG_VALUE: asm_1_nif_ii:env <- %r14
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rbx
	.loc	3 30 9                  ## native/lib.c:30:9
	movq	_arithmetic_error(%rip), %rax
	jmp	LBB4_6
Ltmp35:
	## Error block for a failed range test: return the value of _error_atom.
LBB4_3:
	##DEBUG_VALUE: asm_1_nif_ii:b <- %rsi
	##DEBUG_VALUE: asm_1_nif_ii:a <- %rax
	##DEBUG_VALUE: asm_1_nif_ii:env <- %r14
	##DEBUG_VALUE: asm_1_nif_ii:argv <- %rbx
	.loc	3 25 10                 ## native/lib.c:25:10
	movq	_error_atom(%rip), %rax
Ltmp36:
	.loc	3 0 10 is_stmt 0        ## native/lib.c:0:10
	jmp	LBB4_6
Ltmp37:
Lfunc_end4:
	.cfi_endproc

デバッグ情報を取り除いた x86_64 コード

	##-----------------------------------------------------------------
	## _asm_1_nif_ii -- same function with the debug directives removed.
	## Syntax: AT&T (source operand first, destination last).
	## In:    %rdi = first argument, %rdx = third argument; the second
	##        argument (%esi) is never read.
	## Out:   %rax = return value
	## Regs:  %r14 = copy of the first argument, %rbx = copy of the third
	##        argument (an array of 8-byte elements); both are saved in
	##        the prologue and restored in the epilogue.
	## Stack: -32(%rbp) and -24(%rbp) are the two out-slots filled by
	##        _enif_get_long. Three pushes plus subq $16 keep %rsp
	##        16-byte aligned at every callq, assuming the usual SysV
	##        entry alignment.
	## Layout: the success path falls through every conditional jump up
	##        to retq; the error blocks LBB4_5 / LBB4_3 come after retq.
	##-----------------------------------------------------------------
	.p2align	4, 0x90         ## -- Begin function asm_1_nif_ii
_asm_1_nif_ii:                          ## @asm_1_nif_ii
Lfunc_begin4:
	.cfi_startproc
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r14
	pushq	%rbx
	subq	$16, %rsp
	movq	%rdx, %rbx
	movq	%rdi, %r14
	## First decode: element 0 of the array into -32(%rbp).
	movq	(%rbx), %rsi
	leaq	-32(%rbp), %rdx
	callq	_enif_get_long
	testl	%eax, %eax
	## Return value 0 -> leave the hot path for the error block.
	je	LBB4_5
	## Second decode: element 1 of the array into -24(%rbp).
	movq	8(%rbx), %rsi
	leaq	-24(%rbp), %rdx
	movq	%r14, %rdi
	callq	_enif_get_long
	testl	%eax, %eax
	je	LBB4_5
	## Range test: signed compare of the first value against
	## 0x7FFFFFFFFFFFFFFF minus the second value.
	movq	-32(%rbp), %rax
	movq	-24(%rbp), %rsi
	movabsq	$9223372036854775807, %rcx ## imm = 0x7FFFFFFFFFFFFFFF
	subq	%rsi, %rcx
	cmpq	%rcx, %rax
	jg	LBB4_3
	## Success path: %rsi = sum, passed with %r14 to _enif_make_long.
	addq	%rax, %rsi
	movq	%r14, %rdi
	callq	_enif_make_long
	## Shared epilogue; both error blocks jump back here with %rax set.
LBB4_6:
	addq	$16, %rsp
	popq	%rbx
	popq	%r14
	popq	%rbp
	retq
	## Error block for a failed decode: return the value of _arithmetic_error.
LBB4_5:
	movq	_arithmetic_error(%rip), %rax
	jmp	LBB4_6
	## Error block for a failed range test: return the value of _error_atom.
LBB4_3:
	movq	_error_atom(%rip), %rax
	jmp	LBB4_6
Lfunc_end4:
	.cfi_endproc

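狙い通りになりましたね! エラー処理のブロック(LBB4_5 と LBB4_3)が retq の後ろに配置され,正常系は条件分岐を一度も成立させずにフォールスルーで enif_make_long の呼び出しと retq まで進むコードになっています。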

次回は「ZEAM開発ログ v.0.4.3 型多相かつ型安全なNIFでオーバーフローを検出する」です。お楽しみに!

:stars::stars::stars: お知らせ:Elixirもくもく会(リモート参加OK、入門トラック有)を9月28日に開催します :stars::stars::stars:

「fukuoka.ex#14:Elixir/Phoenixもくもく会~入門もあるよ」を2018年9月28日金曜日に開催します

前回は,ゲリラ的に募った「Zoomによるリモート参加」を,今回から正式に受け付けるようになりましたので,福岡以外の首都圏や地方からでも参加できます(申し込みいただいたら、追ってZoom URLをconnpassメールでお送りします)

また,これまではElixir/Phoenix経験者を対象とした,もくもく会オンリーでしたが,今回から,入門者トラックも併設し,fukuoka.exアドバイザーズ/キャストに質問できるようにアップグレードしました

私,山崎も参加します! この記事の延長線上のものを作ろうと思っています。

お申込みはコチラから
https://fukuokaex.connpass.com/event/100659/
image.png

2
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
2
2

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?