The Puyopuyo Programming AI learning system was developed to support self-regulated learning by junior and senior high school students.
How can you build a nice AI on a Chromebook in just ten minutes?
There are three secrets:
1. JupyterLite lets the web client run the Python code right in the browser.
2. The ultra-lightweight machine learning framework DeZero is used.
3. Puyopuyo's stage and actions are compact (see the sketch below).
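How compact? A rough sketch (the (column, rotation) tuple format is an assumption based on the fallback action (2, 1) used later in this tutorial, not something the original spells out):

# Hypothetical sketch: on a 6-column stage, a puyo pair has at most
# 6 * 4 = 24 candidate placements, so the agent only ever has to
# compare a couple dozen moves.
actions = [(col, rot) for col in range(6) for rot in range(4)]
print(len(actions))  # 24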
Step 12: Learned Agent
Let's build a smart agent.
We will build an agent that uses the model trained by machine learning.
First, let's build the basic parts.
Library
import os
import numpy as np
import random
import dezero_emb as dezero
from puyopuyo import *
dezero_emb is the machine learning framework (the DeZero build mentioned above).
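As a quick orientation, here is a minimal sketch of the DeZero-style automatic differentiation that dezero_emb provides (assuming it mirrors the original DeZero API for Variable and backward):

# Build a tiny computation graph and backpropagate through it.
x = dezero.Variable(np.array(3.0))
y = x ** 2
y.backward()
print(x.grad)  # gradient of x**2 at x = 3, i.e. 6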
model_filename = "trained_models/puyopuyo.npz"
Deep Q-Network
class DQNet(dezero.Models.Model):
    def __init__(self):
        super().__init__()
        # Two 128-unit hidden layers; the output layer emits a single
        # Q-value for the state it is given.
        self.l1 = dezero.L.Linear(128)
        self.l2 = dezero.L.Linear(128)
        self.l3 = dezero.L.Linear(1)

    def forward(self, x):
        x = dezero.F.relu(self.l1(x))
        x = dezero.F.relu(self.l2(x))
        x = self.l3(x)
        return x
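The network maps each encoded state to one scalar Q-value. A quick shape check (a sketch only; the 48-dimensional dummy state assumes CFG.Height = 12, matching the 4 * CFG.Height integers that boardtostate produces below):

net = DQNet()
dummy = np.random.randn(4, 48).astype(np.float32)  # batch of 4 dummy states
print(net(dummy).shape)  # (4, 1): one Q-value per state

DeZero's Linear layers infer their input size on the first call, so DQNet never has to hard-code the state dimension.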
DQN agent
class DQNAgent:
    def __init__(self):
        self.action_size = 2
        self.qnet = DQNet()

    def __call__(self, board, puyo):
        # Enumerate the legal placements, simulate each one, score the
        # surviving boards with the Q-network, and pick greedily.
        action_list = utils.create_action_list(board)
        next_boards = []
        next_reward = []
        valid_actions = []
        action = (2, 1)  # fallback when no placement survives
        for cand in action_list:
            next_board, reward, done = utils.next_board(board, puyo, cand)
            if not done:
                next_boards.append(next_board)
                next_reward.append(reward)
                valid_actions.append(cand)  # keep actions aligned with boards
        if next_boards:
            next_boards = np.stack(next_boards)
            predictions = self.eval2(next_boards)
            next_reward = np.array(next_reward)[:, np.newaxis]
            # Greedy choice: immediate reward plus predicted value.
            predictions += dezero.Variable(next_reward)
            index = predictions.data.argmax()
            action = valid_actions[index]
        return action
    def boardtostate(self, board):
        # Encode the board as integers: each of the four colors gets a
        # binary plane, and every plane row is read as a CFG.Width-bit
        # number, giving 4 * CFG.Height integers in total.
        cont_b = 2 ** np.arange(CFG.Width, dtype=np.int32)
        planes = []
        for color in range(1, 5):
            planes.append((board == color).astype(np.int32))
        board_list = np.concatenate(planes)
        state = board_list.dot(cont_b)
        return state
    def eval(self, board):
        # Q-value for a single board.
        state = self.boardtostate(board)
        return self.qnet(state)

    def eval2(self, boards):
        # Q-values for a batch of boards.
        states = []
        for i in range(boards.shape[0]):
            states.append(self.boardtostate(boards[i]))
        states = np.stack(states)
        return self.qnet(states)

    def load_model(self, filename):
        self.qnet.load_weights(filename)
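To see what boardtostate produces, here is a worked example of its row encoding (assuming CFG.Width = 6): each row of a binary color plane is read as a base-2 number.

cont_b = 2 ** np.arange(6, dtype=np.int32)          # [1, 2, 4, 8, 16, 32]
row = np.array([0, 1, 0, 1, 0, 0], dtype=np.int32)  # puyos in columns 1 and 3
print(row.dot(cont_b))  # 10 = 2 + 8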
Test
np.random.seed(seed=123)
env = EnvPuyopuyo()
agent = DQNAgent()
agent.load_model(model_filename)

for _ in range(5):
    board, puyo = env.reset()
    done = False
    final_score = 0
    final_pieces = 0
    while not done:
        action = agent(board, puyo)
        board, puyo, reward, done = env.step(action)
        final_score += reward
        final_pieces += 2  # each step drops one pair of puyos
    print("Score: {}, pieces {}".format(final_score, final_pieces))
Score: 685590, pieces 3442
Score: 51000, pieces 356
Score: 597730, pieces 2988
Score: 59470, pieces 496
Score: 60920, pieces 432
Not bad.