最強コンパイラを使えばマージソートは遅くない？

Posted at 2025-04-20

はじめに

現在，筆者はソートアルゴリズムについて勉強中であり，これまで下記の記事で取り上げてきた。

しかしながら，GNU C Libarary は（当たり前のことながら）GNU C コンパイラでビルドされることが前提であり，単に Microsoft の Visual C++ では性能が出なかっただけなのかもしれない。とはいえ，今から GNU C コンパイラをインストールするのは気が進まないというか，既に筆者のPCには LLVM の clang-cl と intel C++ コンパイラがインストールされているのだった¹²。ということで中立的かつ最強コンパイラの呼び声も高い LLVM の clang-cl と intel C++ コンパイラで再評価を試みた。

結論を先に言うと，マージソートも clang-cl または intel C++ コンパイラを使えばクイックソートに肉薄するレベルまで高速化できる。

試験条件

要素数 200,000,000 個のint 型の配列の昇順ソートを行う。マージソートの場合，同サイズの配列がもう一つ必要になることから，32bit プログラムではほぼ上限値である。
4×200,000,000 bytes = 0x2FAF0800 bytes
データの順序は以下の四種類とする。
- ゼロ
- 昇順：0 → 199,999,999
- 降順：199,999,999 → 0
- 乱数：0～199,999,999 の範囲の一様乱数，乱数発生器は XorShift とした³
Cコンパイラは下記の三種類とし，かつターゲットを 32bit (x86)と64bit (x64)の二種類の組み合わせで合計六種類とする。
- Microsoft Visual C++ 2022
- LLVM clang 17.0.3
- intel C++ compiler：oneAPI 2024.2, Compiler Release 2024.2.1
評価マシンのスペックは下記の通り
Windows10 2022H2，Core i7-13700 5.1GHZ，32GB

ソースコード

ソースコードは前回の記事と全く同じであるが，念のため再掲する。

ベンチ用メイン

bench.c

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#define	MACROOF(arg)	#arg
#define	STRINGOF(macro)	MACROOF(macro)
typedef	int		(*COMPARE)(int, int);
extern	void	SORTFUNC(int *a, int len, COMPARE compare);
uint32_t	xorshift( void ) {
static	uint64_t	x = 1;
	x ^= x << 13;
	x ^= x >>  7;
	x ^= x << 17;
	return (uint32_t)x;
}
#ifdef	COUNT
uint64_t	move_count = 0;
uint64_t	comp_count = 0;
#endif
static	int		compare(int p, int q) {
#ifdef	COUNT
	comp_count++;
#endif
	return p - q;
}
int	main(int argc, char *argv[]) {
	if(argc < 3) {
		fprintf(stderr, "Usage: bench(.exe) [data] [size] (debug)\n");
		fprintf(stderr, " data: zero, ascend, descend, random\n");
		fprintf(stderr, " func: " STRINGOF(SORTFUNC) "\n");
		return -1;
	}
	int	size = atoi(argv[2]);
	int	*table = malloc(sizeof(int) * size);
	if(table == (int*)NULL) {
		fprintf(stderr, "cannot allocate size: %d\n", size);
		return -1;
	}
	switch(argv[1][0]) {
	  case 'z': for(int i = 0; i < size; i++) table[i] = 0;					break;
	  case 'a':	for(int i = 0; i < size; i++) table[i] = i;					break;
	  case 'd':	for(int i = 0; i < size; i++) table[i] = size - i - 1;		break;
	  case 'r':	for(int i = 0; i < size; i++) table[i] = xorshift() % size;	break;
	  default:
		fprintf(stderr, "illegal data: %s\n", argv[1]);
		return -1;
	}
#ifdef	COUNT
	SORTFUNC(table, size, compare);
	fprintf(stdout, "comp_count: %lld\n", comp_count);
	fprintf(stdout, "move_count: %lld\n", move_count);
#else
	clock_t	t1 = clock();
	SORTFUNC(table, size, compare);
	clock_t	t2 = clock();
	fprintf(stdout, "time: %.3f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);
#endif
	if(argc >= 4 && argv[3][0] == 'd')
		for(int i = 0; i < size; i++) fprintf(stdout, "%d\n", table[i]);
	return 0;
}

uCRT 版 qsort の意訳版

qsort_ucrt.c

#ifdef	COUNT
#include <stdint.h>
extern	uint64_t	move_count;
#endif
typedef	int		(*COMPARE)(int, int);
#define	STACK_SIZE	30
static	void	swap(int *p, int *q) {
#ifdef	COUNT
	move_count += 2;
#endif
	int	temp = *p;
	*p = *q;
	*q = temp;
}
void	qsort_ucrt(int *a, int len, COMPARE compare) {
	if(len < 2) return;
	int	top_stack[STACK_SIZE];
	int	end_stack[STACK_SIZE];
	int	stack = 0;
	int	top = 0;
	int	end = len - 1;
LOOP:
	int	n = end - top + 1;
	if(n <= 8) {
		for(int e = end; e > top; e--) {
			int	max = top;
			for(int i = top + 1; i <= e; i++)
				if(compare(a[i], a[max]) > 0) max = i;
			swap(&a[max], &a[e]);
		}
	} else {
		int	mid = top + n / 2;
		int	t = top;
		int	e = end;
		if(compare(a[top], a[mid]) > 0) swap(&a[top], &a[mid]);
		if(compare(a[top], a[end]) > 0) swap(&a[top], &a[end]);
		if(compare(a[mid], a[end]) > 0) swap(&a[mid], &a[end]);
		for(;;) {
			if(t <  mid) while(++t <  mid && compare(a[t], a[mid]) <= 0);
			if(t >= mid) while(++t <= end && compare(a[t], a[mid]) <= 0);
			while(--e > mid && compare(a[e], a[mid]) > 0);
			if(t > e) break;
			swap(&a[t], &a[e]);
			if(e == mid) mid = t;
		}
		e++;
		if(e >  mid) while(--e > mid && compare(a[e], a[mid]) == 0);
		if(e <= mid) while(--e > top && compare(a[e], a[mid]) == 0);
		if(e - top >= end - t) {
			if(top < e) {
				top_stack[stack] = top;
				end_stack[stack] = e;
				stack++;
			}
			if(t < end) {
				top = t;
				goto LOOP;
			}
		} else {
			if(t < end) {
				top_stack[stack] = t;
				end_stack[stack] = end;
				stack++;
			}
			if(top < e) {
				end = e;
				goto LOOP;
			}
		}
	}
	if(--stack >= 0) {
		top = top_stack[stack];
		end = end_stack[stack];
		goto LOOP;
	}
}

GNU C ライブラリ版 qsort の意訳版

qsort_glibc.c

#ifdef	COUNT
#include <stdint.h>
extern	uint64_t	move_count;
#endif
#include <stdlib.h>
typedef	int		(*COMPARE)(int, int);
#define	TEMP_SIZE	256
static	void	sort(int *buf, int *a, int len, COMPARE compare) {
	if(len < 2) return;
	int	n1 = len / 2;
	int	n2 = len - n1;
	int	*p1 = a;
	int	*p2 = &a[n1];
	sort(buf, p1, n1, compare);
	sort(buf, p2, n2, compare);
	int	*q = buf;
	for(;;) {
#ifdef	COUNT
		move_count++;
#endif
		if(compare(*p1, *p2) <= 0) {
			*q++ = *p1++;
			if(--n1 == 0) break;
		} else {
			*q++ = *p2++;
			if(--n2 == 0) break;
		}
	}
#ifdef	COUNT
	move_count += n1 + len - n2;
#endif
	while(n1 > 0) {
		*q++ = *p1++;
		n1--;
	}
	for(int i = 0; i < len - n2; i++)
		a[i] = buf[i];
}
void	qsort_glibc(int *a, int len, COMPARE compare) {
	if(len < 2) return;
	if(len <= TEMP_SIZE) {
		int	tmp[TEMP_SIZE];
		sort(tmp, a, len, compare);
	} else {
		int	*buf = (int*)malloc(sizeof(int) * len);
		if(buf == (int*)NULL) return;
		sort(buf, a, len, compare);
		free(buf);
	}
}

メイクファイル

Microsoft Visual C++ 用メイクファイルを以下に示す。32bit（x86）版と64bit（x64）版共用である。オプション /GL を指定してプログラム全体の最適化を有効にした。

Makefile.vc

OUTDIR=$(VSCMD_ARG_TGT_ARCH)vc
CC=cl
CFLAGS=/c /O2 /W4 /arch:AVX2 /GL /GS-
LFLAGS=/GL
LIBS=
!include "MAKEFILE.INC"

LLVM clang-cl 用メイクファイルを以下に示す。32bit（x86）版と64bit（x64）版共用である。オプション /TC は C ソースファイルとみなす。Visual C++ では拡張子 .c のファイルは C ソースファイルとみなすので指定不要だが，clang-cl ではわざわざ指定する必要がある。オプション -flto はリンク時最適化であり，リンカも専用のものを指定する必要がある。

Makefile.clang

OUTDIR=$(VSCMD_ARG_TGT_ARCH)clang
CC=clang-cl
CFLAGS=/c /O2 /W4 /arch:AVX2 /TC -flto /GS-
LFLAGS=-flto -fuse-ld=lld
LIBS=
!include "MAKEFILE.inc"

intel C++ コンパイラ用メイクファイルを以下に示す。32bit（x86）版と64bit（x64）版共用である。基本的に細かい注意点については clang-cl と同じ。

Makefile.intel

OUTDIR=$(VS_TARGET_ARCH)intel
CC=icx-cl
CFLAGS=/c /O2 /W4 /arch:AVX2 /TC -flto /GS-
LFLAGS=-flto -fuse-ld=lld
LIBS=
!include "MAKEFILE.inc"

共通部分を以下に示す。

Makefile.inc

OUTDIRS=$(OUTDIR) \
        $(OUTDIR)\ucrt \
        $(OUTDIR)\glibc

target: $(OUTDIRS) \
        $(OUTDIR)\ucrt\bench.exe \
        $(OUTDIR)\glibc\bench.exe

$(OUTDIRS):
	@if not exist $@ mkdir $@

$(OUTDIR)\ucrt\bench.exe:  $(OUTDIR)\ucrt\bench.obj  $(OUTDIR)\ucrt\qsort_ucrt.obj
$(OUTDIR)\glibc\bench.exe: $(OUTDIR)\glibc\bench.obj $(OUTDIR)\glibc\qsort_glibc.obj

$(OUTDIR)\ucrt\bench.exe \
$(OUTDIR)\glibc\bench.exe:
	$(CC) $** /Fe$* $(LFLAGS) $(LIBS)

.c{$(OUTDIR)\ucrt}.obj::
	$(CC) $(CFLAGS) $< /Fo$(OUTDIR)\ucrt\ /DSORTFUNC=qsort_ucrt

.c{$(OUTDIR)\glibc}.obj::
	$(CC) $(CFLAGS) $< /Fo$(OUTDIR)\glibc\ /DSORTFUNC=qsort_glibc

clean:
	if exist $(OUTDIR) rd /s /q $(OUTDIR)

上記のメイクファイルを実行すると，x86vc, x64vc, x86clang, x64clang, x86intel, x64intel と６個のフォルダが作られ，各フォルダの下にそれぞれの環境でビルドされた ucrt\bench.exe と glibc\bench.exe が作られる。

バイナリサイズの比較

intel C++ コンパイラを使うとバイナリサイズが一気に肥大化するので驚く。おそらくループアンローリングや関数のインライン展開を多用するのだろう。

表１　バイナリサイズの比較（単位はバイト）
コンパイラ		qsort_ucrt	qsort_glibc
Visual C++	32bit	118,272	117,760

	64bit	142,848	142,848
LLVM clang-cl	32bit	118,784	118,784

	64bit	148,480	148,480
intel icx-cl	32bit	166,400	169,472

	64bit	208,384	226,304

ここで qsort_ucrt と qsort_glibc のファイルサイズに注目する。コンパクトな 32bit 版のサイズに着目すると，

Visual C++ では qsort_ucrt < qsort_glibc
LLVM clang-cl では qsort_ucrt = qsort_glibc
intel icx-cl では qsort_ucrt > qsort_glibc

となっており，強力な最適化を施すことによって，とりわけ qsort_glibc のほうが著しくサイズが増えていることが分かる。

試験結果

試験結果を以下に示す。

表２　実行時間の比較（単位は秒）
コンパイラ	順序	qsort_ucrt	qsort_glibc
Visual C++ 32bit版	ゼロ	0.090	2.660

	昇順	1.433	2.656

	降順	1.475	3.914

	乱数	15.505	17.996
Visual C++ 64bit版	ゼロ	0.093	2.470

	昇順	1.582	2.473

	降順	1.617	3.769

	乱数	15.500	21.129
LLVM clang-cl 32bit版	ゼロ	0.098	2.066

	昇順	1.627	2.060

	降順	1.688	3.642

	乱数	15.636	16.487
LLVM clang-cl 64bit版	ゼロ	0.089	1.958

	昇順	1.460	1.997

	降順	1.523	2.942

	乱数	15.710	16.022
intel icx-cl 32bit版	ゼロ	0.070	2.488

	昇順	1.592	2.498

	降順	1.691	3.333

	乱数	17.120	17.809
intel icx-cl 64bit版	ゼロ	0.064	1.811

	昇順	1.366	1.797

	降順	1.491	2.507

	乱数	16.584	15.969

こういうのはグラフにした方が分かり易い。

■qsort_ucrt，■qsort_glibc

結論

三種類のコンパイラ×ターゲット32bit(x86) or 64bit(x64) の合計六種類の実行ファイルを作成した。

ゼロ・昇順・降順のような特殊なデータ順序については，qsort_glibc（マージソート）の全敗であるのは仕方ないだろう。だがしかし，intel C++ コンパイラを使えばその差が圧縮される。Visual C++ と intel C++ コンパイラの結果を見比べると，qsort_ucrt（クイックソート）のほうはあまり変わらないが，qsort_glibc（マージソート）のほうは大きく改善されている。バイナリサイズも qsort_glibc（マージソート）のほうが増えていることから，おそらくループアンローリング等の最適化技術を多用しており，そのせいでバイナリサイズも増えていると思われる。

乱数データについては，Visual C++ だと 16～36% の差があるが，LLVM clang や intel C++ コンパイラではその差が 5% 以下に縮まり，一部逆転もしている。

qsort_glibc（マージソート）は LLVM clang や intel C++ コンパイラなどの優れたCコンパイラを用いることで性能が伸びているが，qsort_ucrt（クイックソート）のほうは Visual C++ とあまり変わらないというか，むしろ Visual C++ が最速である。もともとマイクロソフト謹製の uCRT で用いられているものなので，Visual C++ で性能が伸びるように作られているのだろう。逆説的に言えば，uCRT の作りは Visual C++ に特化し過ぎていて，先進的なCコンパイラにとって最適化の障害になっているようにも思える。glibc の作りはシンプルであるがゆえにCコンパイラにとって見通しが良く，最適化し易いのかもしれない。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up