Deep Learning from scratch 2にBahdanau Attention #Python3

　RNN (GRU) を使った Attention 付きの seq2seq モデルのニューラルネットワークについてです。実行確認済のソースコード有。

　この記事に掲載するソースは、斎藤康毅さんの書いた「ゼロから作る Deep Learning 2 自然言語処理編」の第8章で提示されたソースにクラスを付け加えて、Bahdanau attention を使えるようにするものです。斎藤さんも著書の中で、Attention にはいろいろな方法があると書かれており、これはそのバリエーションの一つです。Bahdanau attention については、tensorflow のページ（https://tensorflow.classcat.com/2019/04/07/tf20-alpha-tutorials-sequences-nmt-with-attention/）で使われているものを勉強して実装しました。

　今回のプログラムを、一括してダウンロードする場合は

　付け加えるクラスは

AttentionSeq2seq2 クラス
AttentionEncoder2 クラス
AttentionDecoder2 クラス
BAttention クラス
Score クラス

の5つです。学習の経過は図1に示す通りです。

　最初に、斎藤さんの第8章のソース ch08/attention_seq2seq.py の最下部に次のソースをコピペしてください。内容は、AttentionSeq2seq2 クラス、AttentionEncoder2 クラス、AttentionDecoder2 クラスの追加です。

# Encoder
class AttentionEncoder2:
    """Encoder made of a time-embedding layer followed by a non-stateful TimeGRU.

    ``forward`` returns whatever the GRU layer returns for the whole input
    sequence, so the decoder can attend over every encoder time step.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create randomly initialised float32 weights and build both layers.

        Args:
            vocab_size: number of rows of the embedding matrix.
            wordvec_size: embedding width (input width of the GRU).
            hidden_size: GRU hidden width; the GRU weights have 3 * hidden_size
                columns.
        """
        randn = np.random.randn

        # Encoder parameters (drawn in the order embed_W, gru_Wx, gru_Wh).
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        gru_Wx = (randn(wordvec_size, 3 * hidden_size) / np.sqrt(wordvec_size)).astype('f')
        gru_Wh = (randn(hidden_size, 3 * hidden_size) / np.sqrt(hidden_size)).astype('f')
        gru_b = np.zeros(3 * hidden_size).astype('f')

        # Layers.
        self.embed = TimeEmbedding(embed_W)
        self.gru = TimeGRU(gru_Wx, gru_Wh, gru_b, stateful=False)

        # Flat parameter / gradient lists: [embed_W, gru_Wx, gru_Wh, gru_b].
        self.params = self.embed.params + self.gru.params
        self.grads = self.embed.grads + self.gru.grads
        self.hs = None

    def forward(self, xs):
        """Forward pass: embed ``xs`` and run the result through the GRU."""
        return self.gru.forward(self.embed.forward(xs))

    def backward(self, dhs):
        """Backward pass.

        Args:
            dhs: gradient with respect to the value returned by ``forward``.

        Returns:
            Whatever the embedding layer's ``backward`` returns.
        """
        dxs = self.embed.backward(self.gru.backward(dhs))

        # Mirror the sub-layer gradients into this layer's own gradient slots.
        sources = (
            self.embed.grads[0],
            self.gru.grads[0],
            self.gru.grads[1],
            self.gru.grads[2],
        )
        for slot, src in enumerate(sources):
            self.grads[slot][...] = src

        return dxs

# Decoder
class AttentionDecoder2:
    """One decoding step: embedding -> Bahdanau attention -> GRU -> affine.

    The weight arrays are supplied by the caller, so several instances can be
    built on top of the same parameters.
    """

    def __init__(
        self,
        vocab_size,
        wordvec_size,
        hidden_size,
        embed_W,
        bAaffine1_W,
        bAaffine1_b,
        bAaffine2_W,
        bAaffine2_b,
        bAaffine3_W,
        bAaffine3_b,
        gru_Wx,
        gru_Wh,
        gru_b,
        affine_W,
        affine_b,
    ):
        """Build the four sub-layers from the given weight arrays."""
        self.V = vocab_size
        self.D = wordvec_size
        self.H = hidden_size
        self.T = None

        # Layers. BAttention is the Bahdanau attention class from attention_layer.py.
        self.embed = TimeEmbedding(embed_W)
        self.battention = BAttention(
            bAaffine1_W, bAaffine1_b,
            bAaffine2_W, bAaffine2_b,
            bAaffine3_W, bAaffine3_b,
        )
        self.gru = TimeGRU(gru_Wx, gru_Wh, gru_b, stateful=False)
        self.affine = Affine(affine_W, affine_b)

        # Flat parameter / gradient lists in the order embed, attention, gru, affine.
        self.params = []
        self.grads = []
        for sub in (self.embed, self.battention, self.gru, self.affine):
            self.params += sub.params
            self.grads += sub.grads

    def forward(self, xs, dec_hs, enc_hs):
        """Forward pass, modelled on the Decoder of the TensorFlow tutorial at
        https://tensorflow.classcat.com/2019/04/07/tf20-alpha-tutorials-sequences-nmt-with-attention/

        Args:
            xs: token ids for this step, passed to the embedding layer.
            dec_hs: 2-D attention query (N, H).
            enc_hs: 3-D encoder states (N, T, H).

        Returns:
            (score, dec_hs): the affine output and the GRU output with its
            length-1 time axis squeezed away.
        """
        _batch, _hidden = dec_hs.shape  # unpacking enforces a 2-D query
        self.N, self.T, self.H = enc_hs.shape

        # The GRU always starts from a None state; dec_hs only feeds the attention.
        self.gru.set_state(None)
        embedded = self.embed.forward(xs)
        context = np.expand_dims(self.battention.forward(dec_hs, enc_hs), axis=1)
        gru_out = self.gru.forward(np.concatenate([context, embedded], axis=-1))

        next_dec_hs = np.squeeze(gru_out, axis=1)
        flat = np.reshape(gru_out, (-1, gru_out.shape[2]))
        score = self.affine.forward(flat)

        return score, next_dec_hs

    def backward(self, dscore, ddec_hs):
        """Backward pass mirroring ``forward``.

        Args:
            dscore: 2-D gradient with respect to the returned score.
            ddec_hs: gradient with respect to the returned decoder state.

        Returns:
            (denc_hs, ddec_hs, dxs): gradients with respect to the encoder
            states, the attention query, and whatever the embedding layer's
            ``backward`` returns.
        """
        _batch, _vocab = dscore.shape  # unpacking enforces a 2-D gradient

        d_flat = self.affine.backward(dscore)
        # The GRU output branched into the score path and the returned state: sum both.
        from_state = np.expand_dims(ddec_hs, axis=1)
        d_gru_out = np.expand_dims(d_flat, axis=1) + from_state
        d_gru_in = self.gru.backward(d_gru_out)

        # Undo the concatenation: the first H columns are the context vector.
        d_context = d_gru_in[:, 0, 0:self.H]
        d_embedded = d_gru_in[:, :, self.H:]
        ddec_hs, denc_hs = self.battention.backward(d_context)

        dxs = self.embed.backward(d_embedded)

        # Mirror the sub-layer gradients into this layer's own gradient slots.
        sources = [self.embed.grads[0]]
        sources += [self.battention.grads[k] for k in range(6)]
        sources += [self.gru.grads[k] for k in range(3)]
        sources += [self.affine.grads[k] for k in range(2)]
        for slot, src in enumerate(sources):
            self.grads[slot][...] = src

        return denc_hs, ddec_hs, dxs

# seq2seq model using Bahdanau attention
class AttentionSeq2seq2(BaseModel):
    """Encoder/decoder model that unrolls one AttentionDecoder2 per target step.

    All decoder instances are built from the same weight arrays (``self.args2``),
    so their per-step gradients are summed in ``backward``.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the decoder weights, the encoder, and the flat param/grad lists.

        Args:
            vocab_size: vocabulary size V.
            wordvec_size: embedding width D.
            hidden_size: hidden width H.
        """
        rn = np.random.randn
        self.H = hidden_size
        self.V = vocab_size
        V, D, H = vocab_size, wordvec_size, hidden_size

        # Initial decoder parameter values (float32).
        self.embed_W = (rn(V, D) / 100).astype('f')
        self.bAaffine1_W = (rn(H, H) / np.sqrt(H)).astype('f')
        self.bAaffine1_b = np.zeros(H).astype('f')
        self.bAaffine2_W = (rn(H, H) / np.sqrt(H)).astype('f')
        self.bAaffine2_b = np.zeros(H).astype('f')
        self.bAaffine3_W = (rn(H, 1) / np.sqrt(H)).astype('f')
        self.bAaffine3_b = np.zeros(1).astype('f')
        # The decoder GRU consumes [context, embedding], hence H + D input rows.
        self.gru_Wx = (rn(H + D, 3 * H) / np.sqrt(H + D)).astype('f')
        self.gru_Wh = (rn(H, 3 * H) / np.sqrt(H)).astype('f')
        self.gru_b = np.zeros(3 * H).astype('f')
        self.affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        self.affine_b = np.zeros(V).astype('f')

        decoder_params = [
            self.embed_W,
            self.bAaffine1_W, self.bAaffine1_b,
            self.bAaffine2_W, self.bAaffine2_b,
            self.bAaffine3_W, self.bAaffine3_b,
            self.gru_Wx, self.gru_Wh, self.gru_b,
            self.affine_W, self.affine_b,
        ]

        # Constructor arguments reused for every per-step decoder instance.
        self.args2 = (vocab_size, wordvec_size, hidden_size, *decoder_params)

        # Encoder instance (draws its own weights after the decoder's).
        self.encoder = AttentionEncoder2(vocab_size, wordvec_size, hidden_size)

        # Flat parameter list: encoder parameters first, then decoder parameters.
        self.params = self.encoder.params + decoder_params

        # Gradient buffers in the same order.
        self.d_grads = [np.zeros_like(p) for p in decoder_params]
        self.grads = self.encoder.grads + self.d_grads

    def forward(self, xs, ts):
        """Teacher-forced forward pass returning the loss averaged over T - 1 steps.

        As in the train_step of the TensorFlow tutorial
        (https://tensorflow.classcat.com/2019/04/07/tf20-alpha-tutorials-sequences-nmt-with-attention/),
        the loop runs over ``range(1, T)``: step ``t`` is trained to predict
        ``ts[:, t]`` and is fed the previous target token.

        Args:
            xs: 2-D array of source token ids.
            ts: 2-D array (N, T) of target token ids. The start token is read
                from ``ts[0, 0]`` (assumes every row starts with that same
                token; verify against the dataset).

        Returns:
            Mean of the per-step losses.

        Raises:
            ValueError: if ``ts`` has fewer than two time steps.
        """
        N, T = ts.shape
        _, Txs = xs.shape
        if T < 2:
            raise ValueError("ts must contain at least two time steps")
        self.N = N
        self.T = T
        self.Txs = Txs
        start_id = ts[0, 0]

        h = self.encoder.forward(xs)  # encoder states for every source step

        self.layers = []   # one AttentionDecoder2 per decoding step
        self.layers2 = []  # one SoftmaxWithLoss per decoding step

        # Attention query for the first step. The TensorFlow tutorial uses the
        # final encoder state; summing all encoder states gave better results
        # according to the original author.
        dec_hs = np.sum(h, axis=1)
        dec_input = np.full((N, 1), start_id, dtype=np.int64)
        total_loss = 0

        for t in range(1, T):
            layer = AttentionDecoder2(*self.args2)
            layer2 = SoftmaxWithLoss()
            predictions, dec_hs = layer.forward(dec_input, dec_hs, h)
            total_loss += layer2.forward(predictions, ts[:, t])
            self.layers.append(layer)
            self.layers2.append(layer2)
            # Allocate a fresh input array for the next step instead of
            # overwriting the previous one: a layer may keep a reference to its
            # input for its backward pass, and an in-place write would change
            # what every earlier step sees.
            dec_input = np.array(ts[:, t:t + 1], dtype=np.int64)

        return total_loss / (T - 1)

    def backward(self, dout=1):
        """Back-propagate through every decoding step and then the encoder.

        Args:
            dout: upstream gradient of the value returned by ``forward``.

        Returns:
            Whatever the encoder's ``backward`` returns.
        """
        # forward() averages the per-step losses, so each step receives dout / (T - 1).
        step_dout = dout / (self.T - 1)

        d_grads = [0] * len(self.d_grads)  # summed decoder parameter gradients
        ddec_hs = np.zeros((self.N, self.H), dtype=np.float64)
        dh_sum = np.zeros((self.N, self.Txs, self.H), dtype=np.float64)

        # Walk the forward loop in reverse order.
        for layer, layer2 in zip(reversed(self.layers), reversed(self.layers2)):
            dpredictions = layer2.backward(step_dout)
            dh, ddec_hs, _ = layer.backward(dpredictions, ddec_hs)
            dh_sum += dh  # h fans out to every step: sum the branches
            for i, grad in enumerate(layer.grads):
                d_grads[i] += grad  # the weights are shared by every step

        # The first query was np.sum(h, axis=1), so its gradient flows back
        # unchanged to every encoder time step.
        dh_sum += ddec_hs[:, np.newaxis, :]

        dxs = self.encoder.backward(dh_sum)

        # Store encoder gradients, then the summed decoder gradients.
        n_enc = len(self.encoder.grads)
        for i, enc_grad in enumerate(self.encoder.grads):
            self.grads[i][...] = enc_grad
        for i, d_grad in enumerate(d_grads):
            self.grads[n_enc + i][...] = d_grad

        return dxs

    def generate(self, xs, start_id, sample_size):
        """Greedy decoding for a single sequence (batch size 1).

        Args:
            xs: 2-D array of source token ids holding one sequence.
            start_id: id fed to the decoder at the first step.
            sample_size: number of tokens to generate.

        Returns:
            List of the ``sample_size`` generated token ids.
        """
        h = self.encoder.forward(xs)
        sampled = []

        self.layers = []

        dec_hs = np.sum(h, axis=1)
        dec_input = np.full((1, 1), start_id, dtype=np.int64)

        for _ in range(sample_size):
            layer = AttentionDecoder2(*self.args2)
            predictions, dec_hs = layer.forward(dec_input, dec_hs, h)
            self.layers.append(layer)
            sample_id = np.argmax(predictions)
            sampled.append(sample_id)
            # Unlike forward(), the predicted id is fed back as the next input.
            dec_input = np.full((1, 1), sample_id, dtype=np.int64)

        return sampled

また、attention_seq2seq.py の上部に

from common.base_model import BaseModel
from ch08.attention_layer import BAttention

を加えてください。
　次に、ch08/attention_layer.py の最下部に次のソースをコピペしてください。内容は、BAttentionクラスとScore クラスの追加です。


# Score computation of class BahdanauAttention in the TensorFlow tutorial at
# https://tensorflow.classcat.com/2019/04/07/tf20-alpha-tutorials-sequences-nmt-with-attention/
class Score:
    """Additive attention score: affine3(tanh(affine1(values) + affine2(query)))."""

    def __init__(self, bAaffine1_W, bAaffine1_b, bAaffine2_W, bAaffine2_b, bAaffine3_W, bAaffine3_b):
        """Build the three TimeAffine layers from the given weights and biases."""
        self.affine1 = TimeAffine(bAaffine1_W, bAaffine1_b)
        self.affine2 = TimeAffine(bAaffine2_W, bAaffine2_b)
        self.affine3 = TimeAffine(bAaffine3_W, bAaffine3_b)

        # Flat parameter / gradient lists in the order affine1, affine2, affine3.
        self.params = []
        self.grads = []
        for sub in (self.affine1, self.affine2, self.affine3):
            self.params += sub.params
            self.grads += sub.grads

    def forward(self, query, values):
        """Forward pass.

        Args:
            query: 2-D array (N, H).
            values: 3-D array (N, T, H).

        Returns:
            Output of affine3, one score per (sample, time step).
        """
        _n, steps, _h = values.shape  # unpacking enforces a 3-D values array
        _n, _h = query.shape          # unpacking enforces a 2-D query
        self.T = steps
        self.values = values

        # Give the query a length-1 time axis so it broadcasts over the T steps.
        query_t = np.expand_dims(query, axis=1)

        from_values = self.affine1.forward(values)
        from_query = self.affine2.forward(query_t)

        # Kept for the tanh derivative in backward().
        self.score3 = np.tanh(from_values + from_query)

        return self.affine3.forward(self.score3)

    def backward(self, dscore):
        """Backward pass mirroring ``forward``.

        Args:
            dscore: gradient with respect to the value returned by ``forward``.

        Returns:
            (dquery, dvalues): gradients with respect to ``query`` and ``values``.
        """
        d_tanh_out = self.affine3.backward(dscore)
        d_pre = d_tanh_out * (1 - self.score3 ** 2)  # derivative of tanh

        # The query term was broadcast along the time axis: sum over it.
        d_query_t = np.expand_dims(np.sum(d_pre, axis=1), axis=1)
        dquery = np.squeeze(self.affine2.backward(d_query_t), axis=1)

        dvalues = self.affine1.backward(d_pre)

        # Mirror the sub-layer gradients into this layer's own gradient slots.
        sources = [
            sub.grads[k]
            for sub in (self.affine1, self.affine2, self.affine3)
            for k in (0, 1)
        ]
        for slot, src in enumerate(sources):
            self.grads[slot][...] = src

        return dquery, dvalues

# Counterpart of class BahdanauAttention in the TensorFlow tutorial at
# https://tensorflow.classcat.com/2019/04/07/tf20-alpha-tutorials-sequences-nmt-with-attention/
class BAttention:
    """Bahdanau attention: softmax over the scores, then a weighted sum of values."""

    def __init__(self, bAaffine1_W, bAaffine1_b, bAaffine2_W, bAaffine2_b, bAaffine3_W, bAaffine3_b):
        """Build the Score sub-layer and the softmax layer."""
        self.score = Score(bAaffine1_W, bAaffine1_b, bAaffine2_W, bAaffine2_b, bAaffine3_W, bAaffine3_b)

        # This layer owns no parameters beyond those of the Score sub-layer.
        self.params = list(self.score.params)
        self.grads = list(self.score.grads)

        self.T = None
        self.Softmax_layer = Softmax()

    def forward(self, query, values):
        """Forward pass.

        Args:
            query: 2-D array (N, H).
            values: 3-D array (N, T, H).

        Returns:
            Context vector: ``values`` weighted by the attention weights and
            summed over axis 1.
        """
        _n, _h = query.shape          # unpacking enforces a 2-D query
        _n, steps, _h = values.shape  # unpacking enforces a 3-D values array
        self.T = steps
        self.values = values

        weights = self.Softmax_layer.forward(self.score.forward(query, values))
        # Expected to be (batch_size, T, 1) so that it broadcasts over the
        # hidden axis of values -- depends on the Score output shape.
        self.attention_weights = weights

        return np.sum(weights * values, axis=1)

    def backward(self, dcon):
        """Backward pass mirroring ``forward``.

        Args:
            dcon: 2-D gradient (N, H) with respect to the context vector.

        Returns:
            (dquery, dvalues): gradients with respect to ``query`` and ``values``.
        """
        _n, _h = dcon.shape  # unpacking enforces a 2-D gradient

        # The sum over the time axis hands the same gradient to every step.
        dcon_t = dcon[:, np.newaxis, :]

        # The weights were broadcast along the hidden axis: sum over it.
        d_weights = np.sum(dcon_t * self.values, axis=2, keepdims=True)

        # Direct path from the weighted sum into values.
        d_values_direct = dcon_t * self.attention_weights

        d_score = self.Softmax_layer.backward(d_weights)
        dquery, dvalues = self.score.backward(d_score)

        # values feeds both the score and the weighted sum: add both branches.
        dvalues += d_values_direct

        # Mirror the Score gradients into this layer's own gradient slots.
        for slot in range(6):
            self.grads[slot][...] = self.score.grads[slot]

        return dquery, dvalues

加えて、attention_layer.py の上部に

from common.time_layers import TimeAffine

を加えてください。
　次に、ch08/train.py のバックアップを取った上で、次の修正を加えてください。ch08/train.py の上部に

from attention_seq2seq import AttentionSeq2seq2

を加え、model 変数のコンストラクタで

model = AttentionSeq2seq(vocab_size, wordvec_size, hidden_size)
↓
model = AttentionSeq2seq2(vocab_size, wordvec_size, hidden_size)

AttentionSeq2seq2 のインスタンスを作成するようにしてください。また、入力の反転は使いませんので、train.py のつぎの部分

# 入力文を反転
x_train, x_test = x_train[:, ::-1], x_test[:, ::-1]
↓
# 入力文を反転
#x_train, x_test = x_train[:, ::-1], x_test[:, ::-1]

をコメントアウトし、

        correct_num += eval_seq2seq(model, question, correct,
                                    id_to_char, verbose, is_reverse=True)
　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　↓
        correct_num += eval_seq2seq(model, question, correct,
                                    id_to_char, verbose, is_reverse=False)

のように、is_reverse=True を False にしてください。また、オプティマイザーAdam の学習率を lr=1e-2 と train.py で与えてください。

optimizer = Adam()
↓
optimizer = Adam(lr=1e-2)

　最後に、class BAttention の self.Softmax_layer で使う softmax 関数ですが、3階のテンソルに対応するようにcommon/functions.py の softmax 関数を

def softmax(x):
    """Numerically stable softmax for 1-D, 2-D and 3-D arrays.

    A 1-D array is normalised over its only axis; 2-D and 3-D arrays are
    normalised over axis 1 (for a 3-D array that is the middle axis). An array
    of any other rank is returned unchanged.
    """
    if x.ndim not in (1, 2, 3):
        return x

    axis = 0 if x.ndim == 1 else 1
    # Subtract the maximum before exponentiating to avoid overflow.
    shifted = x - x.max(axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=axis, keepdims=True)

に置き換えてください。
　加えて、ch07/peeky_seq2seq.py の5行目を

from seq2seq import Seq2seq, Encoder
↓
from ch07.seq2seq import Seq2seq, Encoder

とします。
　これで、準備ができました。ch08 ディレクトリで `python train.py` を実行すれば学習が始まるはずです。
　ここで作成した AttentionSeq2seq2 は、GRU ですが、LSTM 版も作ってあります。また、

のページの日本語-英語のデータを LSTM 版に 10 epochs およそ三日間学習させて、良好な機械翻訳結果を得ています。