More than 5 years have passed since last update.

「ゼロから作る Deep Learning」第3章を CUDA でやってみる。

Last updated at 2019-09-25Posted at 2019-09-25

はじめに

「ゼロから作る Deep Learning」、名著だと思います。
ただ、Python で書かれているので、肝心の行列演算のところが NumPy を使っていてブラックボックスです。なんで、CUDA で行列演算のところも書いて、マイナスから作ってみたいと思います。

全ソースと重みデータは以下のレポジトリにおいてあります。
https://github.com/Satachito/cuda_exp

注意点

モダンなコードにするために Unified Memory を使っています。
直感的にわかりやすくするために意識的にエラー処理を省いています。
CUDA でプログラムする局面では double と float、時には int の全てに対応する必要がある場合が多いです。そのためにテンプレートを使っています。
sum とか max とかの要素間に関係があるものは CUDA 化せずに、C++ で実現してあります。
スピードを目的とせず、チューニングしない最もベーシックな実装にしてあります。

環境

以下の環境で検証してあります。

Ubuntu 18.04LTS
CUDA 10.1
NVIDIA GeForce GTX 780 Ti（古っ！）

準備

ベクトルと行列を構造体で定義します。

以下、Unified Memory を使いますが、それについては以下の別記事を参照してもらえると、理解の一助になると思います。
https://qiita.com/Satachito/items/610f8a3403f38b399b60

NumPy はベクトルも行列も同じ array という形になっていますが、ここではベクトルは Array、行列は Matrix と別の型にします。

NN3.cu

using namespace std;

template < typename F > struct
vArray {

	F*		_;
	size_t	n;

	vArray( F* _, size_t n )
	:	_( _ )
	,	n( n ) {
	}
	F&
	operator[]( size_t I ) const {
		return _[ I ];
	}
};

template < typename F > struct
Array	: vArray< F > {
	~
	Array() {
		cudaFree( vArray< F >::_ );
	}

	static	F*
	Malloc( size_t N ) {
		F*	_;
		cudaMallocManaged( &_, N * sizeof( F ) );
		return _;
	}
	Array( size_t n )
	:	vArray< F >( Malloc( n ), n ) {
	}
};

template < typename F > struct
vMatrix {

	F*		_;
	size_t	h;
	size_t	w;
	size_t	v;

	vMatrix( F* _, size_t h, size_t w, size_t v )
	:	_( _ )
	,	h( h )
	,	w( w )
	,	v( v ) {
	}

	F&
	operator()( size_t Y, size_t X ) const {
		return _[ Y * v + X ];
	}
	vArray< F >
	operator[]( size_t I ) const {
		return vArray< F >(
			_ + I * v
		,	w
		);
	}
};
# include <iostream>
template	< typename F >	ostream&
operator <<( ostream& S, const vMatrix< F >& P ) {
	for ( size_t y = 0; y < P.h; y++ ) {
		for ( size_t x = 0; x < P.w; x++ ) S << "	" << P( y, x );
		S << endl;
	}
	return S;
}

template < typename F > struct
Matrix	: vMatrix< F > {
	~
	Matrix() {
		cudaFree( vMatrix< F >::_ );
	}

	static	F*
	Malloc( size_t N ) {
		F*	_;
		cudaMallocManaged( &_, N * sizeof( F ) );
		return _;
	}
	Matrix( size_t h, size_t w )
	:	vMatrix< F >( Malloc( h * w ), h, w, w ) {
	}

	Matrix( const vMatrix< F >& _ )
	:	vMatrix< F >( Malloc( _.h * _.w ), _.h, _.w, _.w ) {
		for ( size_t y = 0; y < _.h; y++ ) for ( size_t x = 0; x < _.w; x++ ) (*this)( y, x ) = _( y, x );
	}

	Matrix( const Matrix< F >& _ )
	:	Matrix< F >( (vMatrix< F >)_ ) {
	}
};

vArray< F > vMatrix::operator[]( size_t I )
これは行列の I で指定される行をベクトルで取り出しています。3.6.1 で使います。

ユーティリティ

次に、いくつかのユーティリティを定義します。

NN3.cu

# define	UNITS( p, q )	( ( p + q - 1 ) / q )

# define	B_S		256

inline	dim3
grid1D( size_t N ) {
	return dim3( UNITS( N, B_S ) );
}
inline	dim3
thread1D() {
	return dim3( B_S );
}

# define	B_S_H	32
# define	B_S_W	32

inline	dim3
grid2D( size_t H, size_t W ) {
	return dim3( UNITS( W, B_S_W ), UNITS( H, B_S_H ) );
}
inline	dim3
thread2D() {
	return dim3( B_S_W, B_S_H );
}

# include	<vector>

template	< typename F, int Y, int X >	Matrix< F >
MakeMatrix( initializer_list< F > args ) {
	vector< F >	argsV = args;
	Matrix< F > _( Y, X );
	for ( size_t y = 0; y < Y; y++ ) {
		for ( size_t x = 0; x < X; x++ ) {
			_( y, x ) = argsV[ y * X + x ];
		}
	}
	return _;
}

3.2.4 シグモイド関数

原著の 3.2.4、シグモイド関数の CUDA による実装の例です。グラフは省いてます。

NN3.cu


////////////////////////////////////////////////////////////////////////////////	3.2.4
template	< typename F >	__global__	void
SIGMOID( vMatrix< F > V, vMatrix< F >  P ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	V._[ y * V.v + x ] = 1 / ( 1 + exp( - P._[ y * P.v + x ] ) );
}
template	< typename F >	Matrix< F >
sigmoid( const vMatrix< F >& P ) {
	Matrix< F >	v( P.h, P.w );
	SIGMOID<<< grid2D( v.h, v.w ), thread2D() >>>( v, P );
	cudaDeviceSynchronize();
	return v;
}


template	< typename F >	void
_3_2_4() {
	cout << "3.2.4 sigmoid" << endl;
	cout << sigmoid( MakeMatrix< F, 2, 5 >( { -1, -0.5, 0, 0.5, 1, -1, -0.5, 0, 0.5, 1 } ) );
}

実行結果（ソースの最後で実行しています。）

3.2.4 sigmoid
	0.268941	0.377541	0.5	0.622459	0.731059
	0.268941	0.377541	0.5	0.622459	0.731059

3.2.7 ReLU 関数

原著の 3.2.7、ReLU 関数の CUDA による実装の例です。グラフは省いてます。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	3.2.7
template	< typename F >	__global__	void
RELU( vMatrix< F > V, vMatrix< F >  P ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	V._[ y * V.v + x ] = max( F( 0 ), P._[ y * P.v + x ] );
}
template	< typename F >	Matrix< F >
ReLU( const vMatrix< F >& P ) {
	Matrix< F >	v( P.h, P.w );
	RELU<<< grid2D( v.h, v.w ), thread2D() >>>( v, P );
	cudaDeviceSynchronize();
	return v;
}

template	< typename F >	void
_3_2_7() {
	cout << "3.2.7 ReLU" << endl;
	cout << ReLU( MakeMatrix< F, 2, 5 >( { -1, -0.5, 0, 0.5, 1, -1, -0.5, 0, 0.5, 1 } ) );
}

実行結果（ソースの最後で実行しています。）

3.2.7 ReLU
	0	0	0	0.5	1
	0	0	0	0.5	1

3.3.2 行列の積

原著の 3.3.2、行列の積の CUDA による実装の例です。

NN3.cu


////////////////////////////////////////////////////////////////////////////////	3.3.2
template	< typename F >	__global__	void
DOT( vMatrix< F > V, vMatrix< F > L, vMatrix< F > R, size_t WH ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	auto lp = L._ + y * L.v;
	auto rp = R._ + x;
	F w = 0;
	for ( size_t _ = 0; _ < WH; _++ ) w += lp[ _ ] * rp[ _ * R.v ];
	V._[ y * V.v + x ] = w;
}
template	< typename F >	Matrix< F >
operator *( const vMatrix< F >& L, const vMatrix< F >& R ) {
	Matrix< F >	v( L.h, R.w );
	DOT<<< grid2D( v.h, v.w ), thread2D() >>>( v, L, R, L.w );
	cudaDeviceSynchronize();
	return v;
}

template	< typename F >	void
_3_3_2() {
	cout << "3.3.2 dot operation" << endl;
	auto l = MakeMatrix< F, 2, 3 >( { 1, 2, 3, 4, 5, 6 } );
	auto r = MakeMatrix< F, 3, 2 >( { 1, 2, 3, 4, 5, 6 } );
	cout << l * r;
}

実行結果（ソースの最後で実行しています。）

3.3.2 dot operation
	22	28
	49	64

3.4.3 3層ニューラルネットワーク

原著の 3.4.3、3層ニューラルネットワークの CUDA による実装の例です。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	3.4.3
# include	<map>

template	< typename F >	map< string, Matrix< F > >
init_network_3_4_3() {
	map< string, Matrix< F > >	_;
	_.emplace( "W1", MakeMatrix< F, 2, 3 >( { 0.1, 0.3, 0.5, 0.2, 0.4, 0.6 } ) );
	_.emplace( "b1", MakeMatrix< F, 1, 3 >( { 0.1, 0.2, 0.3 } ) );
	_.emplace( "W2", MakeMatrix< F, 3, 2 >( { 0.1, 0.4, 0.2, 0.5, 0.3, 0.6 } ) );
	_.emplace( "b2", MakeMatrix< F, 1, 2 >( { 0.1, 0.2 } ) );
	_.emplace( "W3", MakeMatrix< F, 2, 2 >( { 0.1, 0.3, 0.2, 0.4 } ) );
	_.emplace( "b3", MakeMatrix< F, 1, 2 >( { 0.1, 0.2 } ) );
	return _;
}

template	< typename F >	Matrix< F >
identify_function( const Matrix< F >& _ ) {
	return _;
}

template	< typename F >	__global__	void
ADD( vMatrix< F > V, vMatrix< F > L, vMatrix< F > R ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	V._[ y * V.v + x ] = L._[ y * L.v + x ] + R._[ y * R.v + x ];
}
template	< typename F >	Matrix< F >
operator +( const vMatrix< F >& L, const vMatrix< F >& R ) {
	Matrix< F >	v( L.h, R.w );
	ADD<<< grid2D( v.h, v.w ), thread2D() >>>( v, L, R );
	cudaDeviceSynchronize();
	return v;
}

template	< typename F >	Matrix< F >
forward( map< string, Matrix< F > >& network, const vMatrix< F >& x ) {
	auto W1 = network.at( "W1" );
	auto W2 = network.at( "W2" );
	auto W3 = network.at( "W3" );
	auto b1 = network.at( "b1" );
	auto b2 = network.at( "b2" );
	auto b3 = network.at( "b3" );

	auto a1 = x * W1 + b1;
	auto z1 = sigmoid( a1 );
	auto a2 = z1 * W2 + b2;
	auto z2 = sigmoid( a2 );
	auto a3 = z2 * W3 + b3;
	auto y = identify_function( a3 );
	return y;
}

template	< typename F >	void
_3_4_3() {
	cout << "3.4.3 neural" << endl;
	auto network = init_network_3_4_3< F >();
	auto x = MakeMatrix< F, 1, 2 >( { 1.0, 0.5 } );
	auto y = forward( network, x );
	cout << y;
}

実行結果（ソースの最後で実行しています。）

3.4.3 neural
	0.316827	0.696279

3.5.1 ソフトマックス関数

原著の 3.5.1、ソフトマックス関数の CUDA による実装の例です。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	3.5.1
template	< typename F >	__global__	void
EXP( vMatrix< F > V, vMatrix< F >  P ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	V._[ y * V.v + x ] = exp( P._[ y * P.v + x ] );
}
template	< typename F >	Matrix< F >
exp( const vMatrix< F >& P ) {
	Matrix< F >	v( P.h, P.w );
	EXP<<< grid2D( v.h, v.w ), thread2D() >>>( v, P );
	cudaDeviceSynchronize();
	return v;
}

template	< typename F >	F
sum( const vMatrix< F >& P ) {
	F	_ = 0;
	for ( size_t y = 0; y < P.h; y++ ) for ( size_t x = 0; x < P.w; x++ ) _ += P( y, x );
	return _;
}

template	< typename F >	__global__	void
DIV_INP( vMatrix< F > V, F P ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	V._[ y * V.v + x ] /= P;
}
template	< typename F >	void
operator /=( const vMatrix< F >& L, F R ) {
	DIV_INP<<< grid2D( L.h, L.w ), thread2D() >>>( L, R );
	cudaDeviceSynchronize();
}

template	< typename F >	Matrix< F >
softmax_primitive( const vMatrix< F >& p ) {
	auto v = exp( p );
	v /= sum( v );
	return v;
}
template	< typename F >	void
_3_5_1() {
	cout << "3.5.1 softmax_primitive" << endl;
	cout << softmax_primitive( MakeMatrix< F, 1, 3 >( { 0.3, 2.9, 4.0 } ) );
}

実行結果（ソースの最後で実行しています。）

3.5.1 softmax_primitive
	0.0182113	0.245192	0.736597

3.5.2 ソフトマックス関数の実装上の注意

原著の 3.5.2 の CUDA による実装の例です。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	3.5.2
template	< typename F >	F
max( const vMatrix< F >& P ) {
	F	_ = P( 0, 0 );
	for ( size_t y = 0; y < P.h; y++ ) for ( size_t x = 0; x < P.w; x++ ) if ( P( y, x ) > _ ) _ = P( y, x );
	return _;
}

template	< typename F >	__global__	void
SUB_C( vMatrix< F > V, vMatrix< F > L, F R ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	V._[ y * V.v + x ] = L._[ y * L.v + x ] - R;
}
template	< typename F >	Matrix< F >
operator -( const vMatrix< F >& L, F R ) {
	Matrix< F >	v( L.h, L.w );
	SUB_C<<< grid2D( v.h, v.w ), thread2D() >>>( v, L, R );
	cudaDeviceSynchronize();
	return v;
}

template	< typename F >	Matrix< F >
softmax( const vMatrix< F >& p ) {
	auto v = exp( p - max( p ) );
	v /= sum( v );
	return v;
}
template	< typename F >	void
_3_5_2() {
	cout << "3.5.2 softmax" << endl;
	cout << softmax( MakeMatrix< F, 1, 3 >( { 1010, 1000, 990 } ) );
}

実行結果（ソースの最後で実行しています。）

3.5.2 softmax
	0.999955	4.53979e-05	2.06106e-09

3.5.3 ソフトマックス関数の特徴

原著の 3.5.3、ソフトマックス関数の特徴の CUDA による実装の例です。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	3.5.3
template	< typename F >	void
_3_5_3() {
	cout << "3.5.3 sum( softmax )" << endl;
	cout << sum( softmax( MakeMatrix< F, 1, 3 >( { 0.3, 2.9, 4.0 } ) ) ) << endl;
}

実行結果（ソースの最後で実行しています。）

3.5.3 sum( softmax )
1

3.6.1-2、MNIST データセット

原著の 3.6.1-2、MNIST データセットの CUDA による実装の例です。

原著では重みのデータとしてsample_weight.pklというデータが提供されていますが、これは Python じゃないと読みにくいので、バイナリに変換してレポジトリに入れておきました。sample_weight.binという名前です。
MNIST のデータはhttp://yann.lecun.com/exdb/mnist/からダウンロードしてください。
データが double なのでテンプレート化していません。
NumPy には行列とベクトルの間の加算があるので、template < typename F > Matrix< F > operator +( const vMatrix< F >& L, const vArray< F >& R )で同じ機能を作っています。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	3.6.1
# include	<fstream>

template	< typename F >	map< string, Matrix< F > >
get_data() {
	map< string, Matrix< F > > v;
	{	ifstream ifs( "../train-images.idx3-ubyte" );
		if ( ! ifs.is_open() ) throw "../train-images.idx3-ubyte";
		ifs.ignore( 16 );
		Matrix< F > w( 60000, 28 * 28 );
		for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ( (unsigned char)ifs.get() ) / 255.0;
		v.emplace( "x_train", w );
	}
	{	ifstream ifs( "../train-labels.idx1-ubyte" );
		if ( ! ifs.is_open() ) throw "../train-labels.idx1-ubyte";
		ifs.ignore( 8 );
		Matrix< F > w( 1, 60000 );
		for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ifs.get();
		v.emplace( "t_train", w );
	}
	{	ifstream ifs( "../t10k-images.idx3-ubyte" );
		if ( ! ifs.is_open() ) throw "../t10k-images.idx3-ubyte";
		ifs.ignore( 16 );
		Matrix< F > w( 10000, 28 * 28 );
		for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ( (unsigned char)ifs.get() ) / 255.0;
		v.emplace( "x_test", w );
	}
	{	ifstream ifs( "../t10k-labels.idx1-ubyte" );
		if ( ! ifs.is_open() ) throw "../t10k-labels.idx1-ubyte";
		ifs.ignore( 8 );
		Matrix< F > w( 1, 10000 );
		for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ifs.get();
		v.emplace( "t_test", w );
	}
	return v;
}

map< string, Matrix< double > >
init_network() {
	map< string, Matrix< double > > v;
	ifstream	ifs( "../sample_weight.bin" );
	if ( ! ifs.is_open() ) throw "../sample_weight.bin";

	{	Matrix< double > w( 784, 50 );
		ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
		v.emplace( "W1", w );
	}
	{	Matrix< double > w( 50, 100 );
		ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
		v.emplace( "W2", w );
	}
	{	Matrix< double > w( 100, 10 );
		ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
		v.emplace( "W3", w );
	}

	{	Matrix< double > w( 1, 50 );
		ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
		v.emplace( "b1", w );
	}
	{	Matrix< double > w( 1, 100 );
		ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
		v.emplace( "b2", w );
	}
	{	Matrix< double > w( 1, 10 );
		ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
		v.emplace( "b3", w );
	}

	return v;
}
template	< typename F >	__global__	void
ADD( vMatrix< F > V, vMatrix< F > L, vArray< F > R ) {
	auto y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;	if ( y >= V.h ) return;
	auto x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;	if ( x >= V.w ) return;
	V._[ y * V.v + x ] = L._[ y * L.v + x ] + R._[ x ];
}
template	< typename F >	Matrix< F >
operator +( const vMatrix< F >& L, const vArray< F >& R ) {
	Matrix< F >	v( L.h, L.w );
	ADD<<< grid2D( v.h, v.w ), thread2D() >>>( v, L, R );
	cudaDeviceSynchronize();
	return v;
}
template	< typename F >	Matrix< F >
predict( map< string, Matrix< F > >& network, const vMatrix< F >& x ) {
	Matrix< F >& W1 = network.at( "W1" );
	Matrix< F >& W2 = network.at( "W2" );
	Matrix< F >& W3 = network.at( "W3" );
	auto b1 = network.at( "b1" )[ 0 ];
	auto b2 = network.at( "b2" )[ 0 ];
	auto b3 = network.at( "b3" )[ 0 ];

	auto a1 = x * W1 + b1;
	auto z1 = sigmoid( a1 );
	auto a2 = z1 * W2 + b2;
	auto z2 = sigmoid( a2 );
	auto a3 = z2 * W3 + b3;
	auto y = softmax( a3 );
	return y;
}

template	< typename F >	F
argmax( const vArray< F >& P ) {
	size_t	_ = 0;
	for ( size_t i = 1; i < P.n; i++ ) if ( P[ i ] > P[ _ ] ) _ = i;
	return F( _ );
}

template	< typename F >	Array< F >
argmax( const vMatrix< F >& P ) {
	Array< F >	_( P.h );
	for ( size_t y = 0; y < P.h; y++ ) _[ y ] = argmax( P[ y ] );
	return _;
}

template	< typename F >	vArray< F >
Part( const vArray< F >& _, size_t O, size_t N ) {
	return vArray< F >(
		_._ + O
	,	N
	);
}
template	< typename F >	vMatrix< F >
Part( const vMatrix< F >& _, size_t Y, size_t X, size_t H, size_t W ) {
	return vMatrix< F >(
		_._ + Y * _.v + X
	,	H
	,	W
	,	_.v
	);
}

void
_3_6_1() {
	cout << "3.6.1 MNIST" << endl;
	auto w = get_data< double >();
	auto x_test = w.at( "x_test" );
	auto t_test = w.at( "t_test" )[ 0 ];
	auto network = init_network();
	auto accuracy_cnt = 0;
	for ( size_t i = 0; i < x_test.h; i++ ) {
		auto y = predict( network, Part( x_test, i, 0, 1, x_test.w ) );
		auto p = argmax( y[ 0 ] );
		if ( p == t_test[ i ] ) accuracy_cnt++;
	}
	cout << "accuracy_cnt: " << ( ( double)accuracy_cnt / (double)x_test.h ) << endl;
}

実行結果（ソースの最後で実行しています。）

3.6.1 MNIST
accuracy_cnt: 0.9352

3.6.3 バッチ処理

原著の 3.6.3、バッチ処理の CUDA による実装の例です。

原著では正答数のカウントにnp.sum( p == t[ i: i + batch_size ] )としていますが、ここでは CountEquals という関数で一気にカウントしています。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	3.6.3
template < typename F > size_t
CountEquals( const vArray< F >& L, const vArray< F >& R ) {
	size_t _ = 0;
	for ( size_t i = 0; i < L.n; i++ ) if ( L[ i ] == R[ i ] ) _++;
	return _;
}

void
_3_6_3() {
	cout << "3.6.3 MNIST BATCH" << endl;
	auto w = get_data< double >();
	auto x_test = w.at( "x_test" );
	auto t_test = w.at( "t_test" )[ 0 ];
	auto network = init_network();
	auto accuracy_cnt = 0;
	for ( size_t i = 0; i < x_test.h; i += 100 ) {
		auto y = predict( network, Part( x_test, i, 0, 100, x_test.w ) );
		auto p = argmax( y );
		accuracy_cnt += CountEquals( p, Part( t_test, i, 100 ) );
	}
	cout << "accuracy_cnt: " << ( ( double)accuracy_cnt / (double)x_test.h ) << endl;
}

実行結果（ソースの最後で実行しています。）

3.6.3 MNIST BATCH
accuracy_cnt: 0.9352

実行部

実行部です。

NN3.cu

////////////////////////////////////////////////////////////////////////////////	Main
template	< typename F >	void
Main() {
	_3_2_4< F >();
	_3_2_7< F >();
	_3_3_2< F >();
	_3_4_3< F >();
	_3_5_1< F >();
	_3_5_2< F >();
	_3_5_3< F >();
	_3_6_1();
	_3_6_3();
}
int
main( int argc, char* argv[] ) {
	Main< double >();
}

最後に

原著の形に合わせるために map とか使ったりしたせいで、コピーコンストラクタMatrix( const vMatrix< F >& _ )が走ってしまって、時間的にもメモリ的にもよくないので、実際に CUDA でスクラッチから作る場合はメモリのコピーがおきないように考慮すべきでしょう。

書いてから気がついたんですが、vMatrix::operator() を__host__ __device__をつけて以下のように定義してやれば

__host__ __device__ F&
operator()( size_t Y, size_t X ) const {
    return _[ Y * v + X ];
}

カーネルの中でも使えます。例えば

V._[ y * V.v + x ] = 1 / ( 1 + exp( - P._[ y * P.v + x ] ) );

は以下のように書けました。

V( y, x ) = 1 / ( 1 + exp( - P( y, x ) ) );

レポジトリの方は変更しておきます。

おまけ（C++バージョン）

CUDA を使わずに C++ だけでやるバージョンも作ってみました。

NN3CPU.cpp

using namespace std;
# include <iostream>
# include <iomanip>
# include <string.h>
# include <math.h>


template < typename F > struct
vArray {

    F*      _;
    size_t  n;

    vArray( F* _, size_t n )
    :   _( _ )
    ,   n( n ) {
    }
    F&
    operator[]( size_t I ) const {
        return _[ I ];
    }
};

template < typename F > struct
Array   : vArray< F > {
    ~
    Array() {
        delete[] vArray< F >::_;
    }
    Array( size_t n )
    :   vArray< F >( new F[ n ], n ) {
    }
};

template < typename F > struct
vMatrix {

    F*      _;
    size_t  h;
    size_t  w;
    size_t  v;

    vMatrix( F* _, size_t h, size_t w, size_t v )
    :   _( _ )
    ,   h( h )
    ,   w( w )
    ,   v( v ) {
    }

    F&
    operator()( size_t Y, size_t X ) const {
        return _[ Y * v + X ];
    }
    vArray< F >
    operator[]( size_t I ) const {
        return vArray< F >(
            _ + I * v
        ,   w
        );
    }
};
# include <iostream>
template    < typename F >  ostream&
operator <<( ostream& S, const vMatrix< F >& P ) {
    for ( size_t y = 0; y < P.h; y++ ) {
        for ( size_t x = 0; x < P.w; x++ ) S << "   " << P( y, x );
        S << endl;
    }
    return S;
}

template < typename F > struct
Matrix  : vMatrix< F > {
    ~
    Matrix() {
        delete[] vMatrix< F >::_;
    }

    Matrix( size_t h, size_t w )
    :   vMatrix< F >( new F[ h * w ], h, w, w ) {
    }

    Matrix( const vMatrix< F >& _ )
    :   vMatrix< F >( new F[ _.h * _.w ], _.h, _.w, _.w ) {
        for ( size_t y = 0; y < _.h; y++ ) for ( size_t x = 0; x < _.w; x++ ) (*this)( y, x ) = _( y, x );
    }

    Matrix( const Matrix< F >& _ )
    :   Matrix< F >( (vMatrix< F >)_ ) {
    }
};

# include    <vector>

template    < typename F, int Y, int X >    Matrix< F >
MakeMatrix( initializer_list< F > args ) {
    vector< F > argsV = args;
    Matrix< F > _( Y, X );
    for ( size_t y = 0; y < Y; y++ ) {
        for ( size_t x = 0; x < X; x++ ) {
            _( y, x ) = argsV[ y * X + x ];
        }
    }
    return _;
}


////////////////////////////////////////////////////////////////////////////////    3.2.4
template    < typename F >  Matrix< F >
sigmoid( const vMatrix< F >& P ) {
    Matrix< F > v( P.h, P.w );
	for ( size_t y = 0; y < v.h; y++ ) {
		for ( size_t x = 0; x < v.w; x++ ) {
			v( y, x ) = 1 / ( 1 + exp( - P( y, x ) ) );
		}
	}
    return v;
}

template    < typename F >  void
_3_2_4() {
    cout << "3.2.4 sigmoid" << endl;
    cout << sigmoid( MakeMatrix< F, 2, 5 >( { -1, -0.5, 0, 0.5, 1, -1, -0.5, 0, 0.5, 1 } ) );
}

////////////////////////////////////////////////////////////////////////////////    3.2.7
template    < typename F >  Matrix< F >
ReLU( const vMatrix< F >& P ) {
    Matrix< F > v( P.h, P.w );
	for ( size_t y = 0; y < v.h; y++ ) {
		for ( size_t x = 0; x < v.w; x++ ) {
			v( y, x ) = max( F( 0 ), P( y, x ) );
		}
	}
    return v;
}

template    < typename F >  void
_3_2_7() {
    cout << "3.2.7 ReLU" << endl;
    cout << ReLU( MakeMatrix< F, 2, 5 >( { -1, -0.5, 0, 0.5, 1, -1, -0.5, 0, 0.5, 1 } ) );
}

////////////////////////////////////////////////////////////////////////////////    3.3.2

template	< typename T >	Matrix< T >
operator *( const vMatrix< T >& L, const vMatrix< T >& R ) {
	Matrix< T >	v( L.h, R.w );
	for ( size_t y = 0; y < v.h; y++ ) {
		for ( size_t x = 0; x < v.w; x++ ) {
			v( y, x ) = 0;
			for ( size_t _ = 0; _ < L.w; _++ ) v( y, x ) += L( y, _ ) * R( _, x );
		}
	}
	return v;
}

template    < typename F >  void
_3_3_2() {
    cout << "3.3.2 dot operation" << endl;
    auto l = MakeMatrix< F, 2, 3 >( { 1, 2, 3, 4, 5, 6 } );
    auto r = MakeMatrix< F, 3, 2 >( { 1, 2, 3, 4, 5, 6 } );
    cout << l * r;
}

////////////////////////////////////////////////////////////////////////////////    3.4.3
# include    <map>

template    < typename F >  map< string, Matrix< F > >
init_network_3_4_3() {
    map< string, Matrix< F > >  _;
    _.emplace( "W1", MakeMatrix< F, 2, 3 >( { 0.1, 0.3, 0.5, 0.2, 0.4, 0.6 } ) );
    _.emplace( "b1", MakeMatrix< F, 1, 3 >( { 0.1, 0.2, 0.3 } ) );
    _.emplace( "W2", MakeMatrix< F, 3, 2 >( { 0.1, 0.4, 0.2, 0.5, 0.3, 0.6 } ) );
    _.emplace( "b2", MakeMatrix< F, 1, 2 >( { 0.1, 0.2 } ) );
    _.emplace( "W3", MakeMatrix< F, 2, 2 >( { 0.1, 0.3, 0.2, 0.4 } ) );
    _.emplace( "b3", MakeMatrix< F, 1, 2 >( { 0.1, 0.2 } ) );
    return _;
}

template    < typename F >  Matrix< F >
identify_function( const Matrix< F >& _ ) {
    return _;
}

template    < typename F >  Matrix< F >
operator +( const vMatrix< F >& L, const vMatrix< F >& R ) {
	Matrix< F >	v( L.h, L.w );
	for ( size_t y = 0; y < v.h; y++ ) {
		for ( size_t x = 0; x < v.w; x++ ) v( y, x ) = L( y, x ) + R( y, x );
	}
    return v;
}

template    < typename F >  Matrix< F >
forward( map< string, Matrix< F > >& network, const vMatrix< F >& x ) {
    auto W1 = network.at( "W1" );
    auto W2 = network.at( "W2" );
    auto W3 = network.at( "W3" );
    auto b1 = network.at( "b1" );
    auto b2 = network.at( "b2" );
    auto b3 = network.at( "b3" );

    auto a1 = x * W1 + b1;
    auto z1 = sigmoid( a1 );
    auto a2 = z1 * W2 + b2;
    auto z2 = sigmoid( a2 );
    auto a3 = z2 * W3 + b3;
    auto y = identify_function( a3 );
    return y;
}

template    < typename F >  void
_3_4_3() {
    cout << "3.4.3 neural" << endl;
    auto network = init_network_3_4_3< F >();
    auto x = MakeMatrix< F, 1, 2 >( { 1.0, 0.5 } );
    auto y = forward( network, x );
    cout << y;
}
////////////////////////////////////////////////////////////////////////////////    3.5.1
template    < typename F >  Matrix< F >
exp( const vMatrix< F >& P ) {
    Matrix< F > v( P.h, P.w );
	for ( size_t y = 0; y < v.h; y++ ) {
		for ( size_t x = 0; x < v.w; x++ ) v( y, x ) = exp( P( y, x ) );
	}
    return v;
}

template    < typename F >  F
sum( const vMatrix< F >& P ) {
    F   _ = 0;
    for ( size_t y = 0; y < P.h; y++ ) for ( size_t x = 0; x < P.w; x++ ) _ += P( y, x );
    return _;
}

template	< typename T >	void
operator /=( const Matrix< T >& L, T R ) {
	for ( size_t y = 0; y < L.h; y++ ) {
		for ( size_t x = 0; x < L.w; x++ ) L( y, x ) /= R;
	}
}

template    < typename F >  Matrix< F >
softmax_primitive( const vMatrix< F >& p ) {
    auto v = exp( p );
    v /= sum( v );
    return v;
}
template    < typename F >  void
_3_5_1() {
    cout << "3.5.1 softmax_primitive" << endl;
    cout << softmax_primitive( MakeMatrix< F, 1, 3 >( { 0.3, 2.9, 4.0 } ) );
}

////////////////////////////////////////////////////////////////////////////////    3.5.2
template    < typename F >  F
max( const vMatrix< F >& P ) {
    F   _ = P( 0, 0 );
    for ( size_t y = 0; y < P.h; y++ ) for ( size_t x = 0; x < P.w; x++ ) if ( P( y, x ) > _ ) _ = P( y, x );
    return _;
}

template    < typename F >  Matrix< F >
operator -( const vMatrix< F >& L, F R ) {
    Matrix< F > v( L.h, L.w );
	for ( size_t y = 0; y < v.h; y++ ) {
		for ( size_t x = 0; x < v.w; x++ ) v( y, x ) = L( y, x ) - R;
	}
    return v;
}

template    < typename F >  Matrix< F >
softmax( const vMatrix< F >& p ) {
    auto v = exp( p - max( p ) );
    v /= sum( v );
    return v;
}
template    < typename F >  void
_3_5_2() {
    cout << "3.5.2 softmax" << endl;
    cout << softmax( MakeMatrix< F, 1, 3 >( { 1010, 1000, 990 } ) );
}

////////////////////////////////////////////////////////////////////////////////    3.5.3
template    < typename F >  void
_3_5_3() {
    cout << "3.5.3 sum( softmax )" << endl;
    cout << sum( softmax( MakeMatrix< F, 1, 3 >( { 0.3, 2.9, 4.0 } ) ) ) << endl;
}



////////////////////////////////////////////////////////////////////////////////    3.6.1
# include    <fstream>

template    < typename F >  map< string, Matrix< F > >
get_data() {
    map< string, Matrix< F > > v;
    {   ifstream ifs( "../NN3/train-images.idx3-ubyte" );
        if ( ! ifs.is_open() ) throw "../NN3/train-images.idx3-ubyte";
        ifs.ignore( 16 );
        Matrix< F > w( 60000, 28 * 28 );
        for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ( (unsigned char)ifs.get() ) / 255.0;
        v.emplace( "x_train", w );
    }
    {   ifstream ifs( "../NN3/train-labels.idx1-ubyte" );
        if ( ! ifs.is_open() ) throw "../NN3/train-labels.idx1-ubyte";
        ifs.ignore( 8 );
        Matrix< F > w( 1, 60000 );
        for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ifs.get();
        v.emplace( "t_train", w );
    }
    {   ifstream ifs( "../NN3/t10k-images.idx3-ubyte" );
        if ( ! ifs.is_open() ) throw "../NN3/t10k-images.idx3-ubyte";
        ifs.ignore( 16 );
        Matrix< F > w( 10000, 28 * 28 );
        for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ( (unsigned char)ifs.get() ) / 255.0;
        v.emplace( "x_test", w );
    }
    {   ifstream ifs( "../NN3/t10k-labels.idx1-ubyte" );
        if ( ! ifs.is_open() ) throw "../NN3/t10k-labels.idx1-ubyte";
        ifs.ignore( 8 );
        Matrix< F > w( 1, 10000 );
        for ( size_t _ = 0; _ < w.h * w.w; _++ ) w._[ _ ] = ifs.get();
        v.emplace( "t_test", w );
    }
    return v;
}

map< string, Matrix< double > >
init_network() {
    map< string, Matrix< double > > v;
    ifstream    ifs( "../NN3/sample_weight.bin" );
    if ( ! ifs.is_open() ) throw "../NN3/sample_weight.bin";

    {   Matrix< double > w( 784, 50 );
        ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
        v.emplace( "W1", w );
    }
    {   Matrix< double > w( 50, 100 );
        ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
        v.emplace( "W2", w );
    }
    {   Matrix< double > w( 100, 10 );
        ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
        v.emplace( "W3", w );
    }

    {   Matrix< double > w( 1, 50 );
        ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
        v.emplace( "b1", w );
    }
    {   Matrix< double > w( 1, 100 );
        ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
        v.emplace( "b2", w );
    }
    {   Matrix< double > w( 1, 10 );
        ifs.read( (char*)w._, w.h * w.w * sizeof( double ) );
        v.emplace( "b3", w );
    }

    return v;
}
template    < typename F >  Matrix< F >
operator +( const vMatrix< F >& L, const vArray< F >& R ) {
    Matrix< F > v( L.h, L.w );
	for ( size_t y = 0; y < v.h; y++ ) {
		for ( size_t x = 0; x < v.w; x++ ) v( y, x ) = L( y, x ) + R[ x ];
	}
    return v;
}
template    < typename F >  Matrix< F >
predict( map< string, Matrix< F > >& network, const vMatrix< F >& x ) {
    Matrix< F >& W1 = network.at( "W1" );
    Matrix< F >& W2 = network.at( "W2" );
    Matrix< F >& W3 = network.at( "W3" );
    auto b1 = network.at( "b1" )[ 0 ];
    auto b2 = network.at( "b2" )[ 0 ];
    auto b3 = network.at( "b3" )[ 0 ];

    auto a1 = x * W1 + b1;
    auto z1 = sigmoid( a1 );
    auto a2 = z1 * W2 + b2;
    auto z2 = sigmoid( a2 );
    auto a3 = z2 * W3 + b3;
    auto y = softmax( a3 );
    return y;
}

template    < typename F >  F
argmax( const vArray< F >& P ) {
    size_t  _ = 0;
    for ( size_t i = 1; i < P.n; i++ ) if ( P[ i ] > P[ _ ] ) _ = i;
    return F( _ );
}

template    < typename F >  Array< F >
argmax( const vMatrix< F >& P ) {
    Array< F >  _( P.h );
    for ( size_t y = 0; y < P.h; y++ ) _[ y ] = argmax( P[ y ] );
    return _;
}

template    < typename F >  vArray< F >
Part( const vArray< F >& _, size_t O, size_t N ) {
    return vArray< F >(
        _._ + O
    ,   N
    );
}
template    < typename F >  vMatrix< F >
Part( const vMatrix< F >& _, size_t Y, size_t X, size_t H, size_t W ) {
    return vMatrix< F >(
        _._ + Y * _.v + X
    ,   H
    ,   W
    ,   _.v
    );
}

void
_3_6_1() {
    cout << "3.6.1 MNIST" << endl;
    auto w = get_data< double >();
    auto x_test = w.at( "x_test" );
    auto t_test = w.at( "t_test" )[ 0 ];
    auto network = init_network();
    auto accuracy_cnt = 0;
    for ( size_t i = 0; i < x_test.h; i++ ) {
        auto y = predict( network, Part( x_test, i, 0, 1, x_test.w ) );
        auto p = argmax( y[ 0 ] );
        if ( p == t_test[ i ] ) accuracy_cnt++;
    }
    cout << "accuracy_cnt: " << ( ( double)accuracy_cnt / (double)x_test.h ) << endl;
}

////////////////////////////////////////////////////////////////////////////////    3.6.3
template < typename F > size_t
CountEquals( const vArray< F >& L, const vArray< F >& R ) {
    size_t _ = 0;
    for ( size_t i = 0; i < L.n; i++ ) if ( L[ i ] == R[ i ] ) _++;
    return _;
}

void
_3_6_3() {
    cout << "3.6.3 MNIST BATCH" << endl;
    auto w = get_data< double >();
    auto x_test = w.at( "x_test" );
    auto t_test = w.at( "t_test" )[ 0 ];
    auto network = init_network();
    auto accuracy_cnt = 0;
    for ( size_t i = 0; i < x_test.h; i += 100 ) {
        auto y = predict( network, Part( x_test, i, 0, 100, x_test.w ) );
        auto p = argmax( y );
        accuracy_cnt += CountEquals( p, Part( t_test, i, 100 ) );
    }
    cout << "accuracy_cnt: " << ( ( double)accuracy_cnt / (double)x_test.h ) << endl;
}

////////////////////////////////////////////////////////////////////////////////    Main
template    < typename F >  void
Main() {
    _3_2_4< F >();
    _3_2_7< F >();
    _3_3_2< F >();
    _3_4_3< F >();
    _3_5_1< F >();
    _3_5_2< F >();
    _3_5_3< F >();
    _3_6_1();
    _3_6_3();
}
int
main( int argc, char* argv[] ) {
    Main< double >();
}

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up