Edited at

# 【将棋AI】「将棋AIで学ぶディープラーニング」を読む♪～方策・価値マルチタスク学習

### 解説したいこと

（１）方策ネットワークと価値ネットワークのマルチタスク学習

（２）マルチタスク学習する～コード説明

（３）収束状況を確認する～正規表現利用のログのグラフ出力

### （１）方策ネットワークと価値ネットワークのマルチタスク学習

policy_value.py

```﻿from chainer import Chain
import chainer.functions as F
from pydlshogi.common import *

ch = 192
fcl = 256
class PolicyValueNetwork(Chain):
def __init__(self):
super(PolicyValueNetwork, self).__init__()
with self.init_scope():
self.l1=L.Convolution2D(in_channels = 104, out_channels = ch, ksize = 3, pad = 1)
self.l2=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l3=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l4=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l5=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l6=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l7=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l8=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l9=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l10=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l11=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
self.l12=L.Convolution2D(in_channels = ch, out_channels = ch, ksize = 3, pad = 1)
# policy network
self.l13=L.Convolution2D(in_channels = ch, out_channels = MOVE_DIRECTION_LABEL_NUM, ksize = 1, nobias = True)
self.l13_bias=L.Bias(shape=(9*9*MOVE_DIRECTION_LABEL_NUM))
# value network
self.l13_v=L.Convolution2D(in_channels = ch, out_channels = MOVE_DIRECTION_LABEL_NUM, ksize = 1)
self.l14_v=L.Linear(9*9*MOVE_DIRECTION_LABEL_NUM, fcl)
self.l15_v=L.Linear(fcl, 1)

def __call__(self, x):
h1 = F.relu(self.l1(x))
h2 = F.relu(self.l2(h1))
h3 = F.relu(self.l3(h2))
h4 = F.relu(self.l4(h3))
h5 = F.relu(self.l5(h4))
h6 = F.relu(self.l6(h5))
h7 = F.relu(self.l7(h6))
h8 = F.relu(self.l8(h7))
h9 = F.relu(self.l9(h8))
h10 = F.relu(self.l10(h9))
h11 = F.relu(self.l11(h10))
h12 = F.relu(self.l12(h11))
# policy network
h13 = self.l13(h12)
policy = self.l13_bias(F.reshape(h13, (-1, 9*9*MOVE_DIRECTION_LABEL_NUM)))
# value network
h13_v = F.relu(self.l13_v(h12))
h14_v = F.relu(self.l14_v(h13_v))
value = self.l15_v(h14_v)
return policy, value
```

これは、前回までで利用したpolicy.pyとvalue.pyを連結した構造になっています。

```y1, y2 = model(x)
```

というように使います。

### （２）マルチタスク学習する～コード説明

```python
import numpy as np
import chainer
from chainer import cuda, Variable
from chainer import optimizers, serializers
import chainer.functions as F

from pydlshogi.common import *
from pydlshogi.network.policy_value import PolicyValueNetwork
from pydlshogi.features import *

import argparse
import random
import pickle
import os
import re
import logging
```

importするものも以前と同一です。

``` from pydlshogi.network.policy_value import PolicyValueNetwork```

だけ、上記policy_value.pyのマルチタスク学習のためのnetworkになっています。

```parser = argparse.ArgumentParser()
parser.add_argument('--batchsize', '-b', type=int, default=32, help='Number of positions in each mini-batch')
parser.add_argument('--test_batchsize', type=int, default=512, help='Number of positions in each test mini-batch')
parser.add_argument('--epoch', '-e', type=int, default=10, help='Number of epoch times')
parser.add_argument('--model', type=str, default='model/model_policy_value', help='model file name')
parser.add_argument('--state', type=str, default='model/state_policy_value', help='state file name')
parser.add_argument('--initmodel', '-m', default='', help='Initialize the model from given file')
parser.add_argument('--resume', '-r', default='', help='Resume the optimization from snapshot')
parser.add_argument('--eval_interval', '-i', type=int, default=1000, help='eval interval')
args = parser.parse_args()
```

パラメータ類も前回と同じです。

```logging.basicConfig(format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%Y/%m/%d %H:%M:%S', filename=args.log, level=logging.DEBUG)
```

また、ログ出力もほぼ同様です。

```model = PolicyValueNetwork()
model.to_gpu()

optimizer = optimizers.SGD(lr=args.lr)
optimizer.setup(model)
```

model定義して、optimizerをセットします。

```# Init/Resume
# NOTE(review): both branch bodies are elided in this article; the full script
# presumably restores the model / optimizer state here with
# serializers.load_npz(args.initmodel, model) and serializers.load_npz(args.resume, optimizer)
# -- confirm against the complete train_policy_value.py.
if args.initmodel:
if args.resume:
```

```logging.info('read kifu start')
# 保存済みのpickleファイルがある場合、pickleファイルを読み込む
# train date
train_pickle_filename = re.sub(r'\..*?\$', '', args.kifulist_train) + '.pickle'
if os.path.exists(train_pickle_filename):
with open(train_pickle_filename, 'rb') as f:
else:

# test data
test_pickle_filename = re.sub(r'\..*?\$', '', args.kifulist_test) + '.pickle'
if os.path.exists(test_pickle_filename):
with open(test_pickle_filename, 'rb') as f:
else:
```

```# 保存済みのpickleがない場合、pickleファイルを保存する
if not os.path.exists(train_pickle_filename):
with open(train_pickle_filename, 'wb') as f:
pickle.dump(positions_train, f, pickle.HIGHEST_PROTOCOL)
logging.info('save train pickle')
if not os.path.exists(test_pickle_filename):
with open(test_pickle_filename, 'wb') as f:
pickle.dump(positions_test, f, pickle.HIGHEST_PROTOCOL)
logging.info('save test pickle')
```

```logging.info('train position num = {}'.format(len(positions_train)))
logging.info('test position num = {}'.format(len(positions_test)))
```

```
2018/08/24 20:02:13 INFO    train position num = 1892246
2018/08/24 20:02:13 INFO    test position num = 208704
```

```# mini batch
def mini_batch(positions, i, batchsize):
mini_batch_data = []
mini_batch_move = []
mini_batch_win = []
for b in range(batchsize):
features, move, win = make_features(positions[i + b])
mini_batch_data.append(features)
mini_batch_move.append(move)
mini_batch_win.append(win)

return (Variable(cuda.to_gpu(np.array(mini_batch_data, dtype=np.float32))),
Variable(cuda.to_gpu(np.array(mini_batch_move, dtype=np.int32))),
Variable(cuda.to_gpu(np.array(mini_batch_win, dtype=np.int32).reshape((-1, 1)))))

def mini_batch_for_test(positions, batchsize):
mini_batch_data = []
mini_batch_move = []
mini_batch_win = []
for b in range(batchsize):
features, move, win = make_features(random.choice(positions))
mini_batch_data.append(features)
mini_batch_move.append(move)
mini_batch_win.append(win)

return (Variable(cuda.to_gpu(np.array(mini_batch_data, dtype=np.float32))),
Variable(cuda.to_gpu(np.array(mini_batch_move, dtype=np.int32))),
Variable(cuda.to_gpu(np.array(mini_batch_win, dtype=np.int32).reshape((-1, 1)))))
```

```# train
logging.info('start training')
itr = 0
sum_loss = 0
for e in range(args.epoch):
positions_train_shuffled = random.sample(positions_train, len(positions_train))

itr_epoch = 0
sum_loss_epoch = 0
for i in range(0, len(positions_train_shuffled) - args.batchsize, args.batchsize):
x, t1, t2 = mini_batch(positions_train_shuffled, i, args.batchsize)
y1, y2 = model(x)

loss = F.softmax_cross_entropy(y1, t1) + F.sigmoid_cross_entropy(y2, t2)
loss.backward()
optimizer.update()

itr += 1
sum_loss += loss.data
itr_epoch += 1
sum_loss_epoch += loss.data

# print train loss and test accuracy
if optimizer.t % args.eval_interval == 0:
x, t1, t2 = mini_batch_for_test(positions_test, args.test_batchsize)
y1, y2 = model(x)
logging.info('epoch = {}, iteration = {}, loss = {}, accuracy_pol = {},accuracy_val = {}'.format(
optimizer.epoch + 1, optimizer.t, sum_loss / itr,
F.accuracy(y1, t1).data, F.binary_accuracy(y2, t2).data))
itr = 0
sum_loss = 0

# validate test data
logging.info('validate test data')
itr_test = 0
sum_test_accuracy1 = 0
sum_test_accuracy2 = 0
for i in range(0, len(positions_test) - args.batchsize, args.batchsize):
x, t1, t2 = mini_batch(positions_test, i, args.batchsize)
y1, y2 = model(x)
itr_test += 1
sum_test_accuracy1 += F.accuracy(y1, t1).data
sum_test_accuracy2 += F.binary_accuracy(y2, t2).data
logging.info('epoch = {}, iteration = {}, train loss avr = {}, test accuracy_pol = {},test accuracy_val = {}'.format(
optimizer.epoch + 1, optimizer.t, sum_loss_epoch / itr_epoch,
sum_test_accuracy1 / itr_test, sum_test_accuracy2 / itr_test))

```

lossは以下のとおり、softmax_cross_entropy(policy用)とsigmoid_cross_entropy(value用)の和になっています。

```loss = F.softmax_cross_entropy(y1, t1) + F.sigmoid_cross_entropy(y2, t2)
```

```    logging.info('save the model')
serializers.save_npz(args.model+'{}'.format(optimizer.epoch + 1), model)
logging.info('save the optimizer')
serializers.save_npz(args.state+'{}'.format(optimizer.epoch + 1), optimizer)
optimizer.new_epoch()
```

### （３）収束状況を確認する～正規表現利用のログのグラフ出力

```>python train_policy_value.py kifulist3000_train.txt kifulist3000_test.txt --eval_interval 1000
2018/08/29 23:22:44     INFO    read kifu start
2018/08/29 23:23:04     INFO    load train pickle
2018/08/29 23:23:07     INFO    load test pickle
2018/08/29 23:23:07     INFO    read kifu end
2018/08/29 23:23:07     INFO    train position num = 1892246
2018/08/29 23:23:07     INFO    test position num = 208704
2018/08/29 23:23:07     INFO    start training
2018/08/29 23:23:39     INFO    epoch = 1, iteration = 1000, loss = 7.463757, accuracy_pol = 0.01171875,accuracy_val = 0.49609375
2018/08/29 23:24:09     INFO    epoch = 1, iteration = 2000, loss = 7.0174394, accuracy_pol = 0.015625,accuracy_val = 0.5214844
2018/08/29 23:24:38     INFO    epoch = 1, iteration = 3000, loss = 6.834741, accuracy_pol = 0.044921875,accuracy_val = 0.5390625
```

accuracy_pol = 0.044921875,accuracy_val = 0.5390625 を出力しているので、どちらも同じグラフに描画したいということで、以下のようなコードにしました。

※この正規表現部分で苦労しました

```﻿import argparse
import re
import matplotlib.pyplot as plt
parser = argparse.ArgumentParser()
args = parser.parse_args()
ptn = re.compile(r'iteration = ([0-9]+), loss = ([0-9.]+), accuracy_pol = ([0-9.]+),accuracy_val = ([0-9.]+)')

iteration_list = []
loss_list = []
accuracy_pol_list = []
accuracy_val_list = []
for line in open(args.log, 'r'):
m = ptn.search(line)
print("line",line,m)
if m:
iteration_list.append(int(m.group(1)))
loss_list.append(float(m.group(2)))
accuracy_pol_list.append(float(m.group(3)))
accuracy_val_list.append(float(m.group(4)))

fig, ax1 = plt.subplots()
p1, = ax1.plot(iteration_list, loss_list, 'r', label='loss')
ax1.set_xlabel('iterations')

ax2=ax1.twinx()
p2, = ax2.plot(iteration_list, accuracy_pol_list, 'g', label='accuracy_pol')
p3, = ax2.plot(iteration_list, accuracy_val_list, 'b', label='accuracy_val')
ax1.legend(handles=[p1, p2,p3])
plt.show()
```

ここで苦労したのは、`ptn = re.compile(r'iteration = ([0-9]+), loss = ([0-9.]+), accuracy_pol = ([0-9.]+),accuracy_val = ([0-9.]+)')`

ですね。

ということで、以下のようなグラフが得られました。

これを見ると収束状況はpolicyとvalueで異なることが分かります。

### まとめ

・マルチタスク学習について説明した

・学習の収束性を見た

・これを利用して対戦したい