I tried to solve a maze with a Q-network and a target network, each with 2 input nodes, 3 hidden layers of 3 nodes each, and 4 output nodes.
Deeplearning.py

import numpy as np

class Deepnetwork:
    def __init__(self,input_layer,hidden_layer,output_layer,learningrate):
        self.count_output=output_layer
        #weights between the input layer and the first hidden layer, drawn from a normal distribution (mean 0, std dev hidden_layer[0]**-0.5)
        self.weight=[np.random.normal(0.0,pow(hidden_layer[0],-0.5),(hidden_layer[0],input_layer))]
        for i in range(len(hidden_layer)-1):
            self.weight.append(np.random.normal(0.0,pow(hidden_layer[i+1],-0.5),(hidden_layer[i+1],hidden_layer[i])))#same initialization as above
        self.weight.append(np.random.normal(0.0,pow(self.count_output,-0.5),(self.count_output,hidden_layer[-1])))#same initialization as above
        self.lr=learningrate
        self.count_weight=len(self.weight)-1
    def caluculation(self,inputer):#forward propagation
        self.state=[inputer]
        for i in range(self.count_weight+1):
            self.save=np.dot(self.weight[i],self.state[i])
            self.state.append(np.where(self.save>0,self.save,0.01*self.save))#Leaky ReLU activation
        return self.state[-1]
    def learning(self,error):#backpropagation
        error[np.isnan(error)]=0.0#replace NaN with 0
        self.error=[error]
        for i in range(self.count_weight):
            self.the_error=np.dot(self.weight[self.count_weight-i].T,self.error[i])
            self.the_error[np.isnan(self.the_error)]=0.0#replace NaN with 0
            self.error.append(self.the_error)
        #learning rate x error x derivative of the Leaky ReLU x input from the previous layer
        for i in range(self.count_weight+1):
            self.weight[self.count_weight-i]+=self.lr*np.dot((self.error[i]*np.where(np.dot(self.weight[self.count_weight-i],self.state[self.count_weight-i])>0,1.0,0.01)).reshape(-1,1),np.array(self.state[self.count_weight-i]).reshape(1,-1))
    def get_weight(self):
        return self.weight
    def change_weight(self,weight):
        self.weight=weight
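As a quick check that the forward and backward passes run with these shapes, the network can be exercised on dummy data (a minimal sketch; the input vector and the error vector below are made-up values, not part of the training code):

import numpy as np
from Deeplearning import Deepnetwork

net=Deepnetwork(2,(3,3,3),4,0.3)                #2 inputs, three hidden layers of 3 nodes, 4 outputs, learning rate 0.3
q_values=net.caluculation(np.array([2.0,5.0]))  #forward pass: one output per action
print(q_values.shape)                           #(4,)
error=np.zeros(4)                               #hypothetical TD-error vector, non-zero only for the chosen action
error[1]=0.05
net.learning(error)                             #one backpropagation step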
Qlearning.py

import random
import numpy as np
from Deeplearning import Deepnetwork

class Qlearning:
    def __init__(self,lr,dr,gamma,c_state,c_action):
        self.lr=lr#learning rate
        self.dr=dr#discount rate
        self.gamma=gamma#ε of the ε-greedy policy
        self.c_action=c_action#number of possible actions
        self.q_network=Deepnetwork(c_state,(3,3,3),c_action,lr)#2 input nodes, 3 hidden layers of 3 nodes each
        self.t_network=Deepnetwork(c_state,(3,3,3),c_action,lr)#same as above
        self.memory=[]
    def act(self,state):
        self.state=state
        #extra code for checking the learning progress
        print(state)
        #ε-greedy policy
        if self.gamma>np.random.uniform():
            self.action=np.random.randint(0,self.c_action)
        else:
            self.action=np.argmax(self.t_network.caluculation(state))
        #extra code for checking the learning progress: print(self.t_network.caluculation(state))
        return self.action
    def note(self,next_state,reward,is_no_next):
        #if reward>=1.0:
        #    self.reward=1.0
        #elif reward<=-1.0:
        #    self.reward=-1.0
        #else:
        #    self.reward=0.0
        self.reward=reward
        #store the state, action, reward, next state, and whether the episode ended (no next action)
        self.near_memory=[self.state,self.action,self.reward,next_state,is_no_next]
        self.memory.append(self.near_memory)
        self.c_memory=len(self.memory)
    def learn(self,batch,batchsize,hard,changer):
        if batch==True:
            if batchsize>self.c_memory:
                self.c_sample=self.c_memory
            else:
                self.c_sample=batchsize
            self.sample=random.sample(self.memory,self.c_sample)
            self.c_the_action=np.repeat(0.0,self.c_action)
            self.sum=[]
            for i in range(self.c_sample):
                self.the_action=self.sample[i][1]
                self.error=np.repeat(0.0,self.c_action)
                self.error[self.the_action]=self.sample[i][2]+np.where(self.sample[i][4]==True,0.0,self.dr*max(self.t_network.caluculation(self.sample[i][3])))-self.q_network.caluculation(self.sample[i][0])[self.the_action]
                self.c_the_action[self.the_action]+=1.0
                self.sum.append(self.error)
            self.sum=sum(self.sum)
            for i in range(self.c_action):
                if not self.c_the_action[i]==0.0:
                    self.sum[i]/=self.c_the_action[i]
            self.q_network.learning(self.sum)
        else:
            self.the_action=self.near_memory[1]
            self.error=np.repeat(0.0,self.c_action)
            self.error[self.the_action]=self.near_memory[2]+np.where(self.near_memory[4]==True,0.0,self.dr*max(self.t_network.caluculation(self.near_memory[3])))-self.q_network.caluculation(self.near_memory[0])[self.the_action]
            self.q_network.learning(self.error)
        if hard==True:
            #copy the weights so the Q-network and the target network do not share the same arrays
            self.t_network.change_weight([w.copy() for w in self.q_network.get_weight()])
        else:
            self.q_weight=self.q_network.get_weight()
            self.t_weight=self.t_network.get_weight()
            for i in range(len(self.q_weight)):
                self.t_weight[i]+=changer*(self.q_weight[i]-self.t_weight[i])#soft update: move the target weights a fraction changer toward the Q-network weights
            self.t_network.change_weight(self.t_weight)
    def change_gamma(self,gamma):
        self.gamma=gamma
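The single-sample branch of learn() builds the usual TD error, reward + dr*max(Q_target(next_state)) - Q(state, action), dropping the bootstrap term when is_no_next is True, and hands it to the Q-network's backpropagation. A minimal sketch of one act/note/learn cycle (the state, next state and reward values here are made up for illustration, and Qlearning.py is assumed to be importable):

from Qlearning import Qlearning

agent=Qlearning(0.3,0.3,0.3,2,4)  #lr, discount rate, ε, 2 state inputs, 4 actions
action=agent.act((2,5))           #ε-greedy action for a hypothetical state
agent.note((2,4),0.1,False)       #hypothetical transition: next state, reward, episode not finished
agent.learn(False,10,True,0.5)    #single-sample update followed by a hard target-network update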
Meiro.py

class meiro:
    def __init__(self,world,spawn):
        self.world=world
        self.actions={
            "UP": 0,
            "DOWN": 1,
            "LEFT": 2,
            "RIGHT": 3
        }
        self.filed_type={
            "N": 0,  #normal
            "G": 1,  #goal
            "W": 2,  #wall
            "T": 3,  #trap
        }
        self.spawn=spawn
        self.position=spawn
        self.save_position=[spawn]
        self.count_goal=[]
        self.episode_reward=[]
        self.rewards=[]
        self.episode_step=0
        self.steps=[]
        self.episode=0
    def show_state(self):
        return self.position
    def action(self,act):
        #convert the position to list indices
        self.xp=self.position[0]-1
        self.yp=self.position[1]-1
        if act==self.actions["UP"]:
            self.yp-=1
        if act==self.actions["DOWN"]:
            self.yp+=1
        if act==self.actions["LEFT"]:
            self.xp-=1
        if act==self.actions["RIGHT"]:
            self.xp+=1
        self.episode_step+=1
        if self.is_it_ok()==True:
            self.next_position=(self.xp+1,self.yp+1)#convert back to the original form
            if self.world[self.yp][self.xp]==self.filed_type["N"]:
                if self.next_position in self.save_position:
                    self.reward=-0.1
                else:
                    self.reward=0.1
                    self.save_position.append(self.next_position)
                self.position=self.next_position
                self.episode_reward.append(self.reward)
                self.is_no_next=False
            if self.world[self.yp][self.xp]==self.filed_type["G"]:
                self.reward=0.2
                self.position=self.spawn
                self.episode_reward.append(self.reward)
                self.rewards.append(sum(self.episode_reward))
                self.episode_reward=[]
                self.count_goal.append(1)
                self.is_no_next=True
                self.steps.append(self.episode_step)
                self.episode_step=0
                self.episode+=1
                self.save_position=[self.spawn]
            if self.world[self.yp][self.xp]==self.filed_type["T"]:
                self.reward=-0.2
                self.position=self.spawn
                self.episode_reward.append(self.reward)
                self.rewards.append(sum(self.episode_reward))
                self.episode_reward=[]
                self.count_goal.append(0)
                self.is_no_next=True
                self.steps.append(self.episode_step)
                self.episode_step=0
                self.episode+=1
                self.save_position=[self.spawn]
        else:
            self.reward=-0.1
            self.next_position=self.position
            self.episode_reward.append(self.reward)
            self.is_no_next=False
        return (self.next_position,self.reward,self.is_no_next)
    def is_it_ok(self):
        if self.yp>=len(self.world) or self.yp<0:
            return False
        elif self.xp>=len(self.world[self.yp]) or self.xp<0:
            return False
        elif self.world[self.yp][self.xp]==self.filed_type["W"]:
            return False
        return True
    def result(self):
        return (self.rewards,self.count_goal,self.steps,self.episode)
    def reset(self):
        self.rewards=[]
        self.count_goal=[]
        self.steps=[]
        self.episode_step=0
        self.position=self.spawn
        self.episode_reward=[]
        self.episode=0
        self.save_position=[self.spawn]
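For reference, action() can be stepped by hand to see the (next_position, reward, is_no_next) tuple it returns (a minimal sketch using the same world layout and spawn point as Main.py, and assuming Meiro.py is importable):

from Meiro import meiro

world=[[3, 2, 0, 1],
       [0, 0, 0, 2],
       [2, 0, 2, 0],
       [2, 0, 2, 0],
       [2, 0, 2, 0]]
env=meiro(world,(2,5))
print(env.show_state())  #(2, 5): the spawn position, 1-indexed as (x, y)
print(env.action(0))     #"UP": ((2, 4), 0.1, False), a new normal cell
print(env.action(2))     #"LEFT": ((2, 4), -0.1, False), blocked by a wall so the position is unchanged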
Main.py

import numpy as np
import random
import matplotlib.pyplot as plt
from scipy.special import expit
from Qlearning import Qlearning
from Meiro import meiro
a=Qlearning(0.3,0.3,0.3,2,4)
world=[[3, 2, 0, 1],
       [0, 0, 0, 2],
       [2, 0, 2, 0],
       [2, 0, 2, 0],
       [2, 0, 2, 0]]
world2=[[0,1],
        [0],
        [0],
        [0],
        [0]]
b=meiro(world,(2,5))
for i in range(60000):
    state=b.show_state()
    action=a.act(state)
    data=b.action(action)
    a.note(data[0],data[1],data[2])
    a.learn(False,10,True,0.5)
episode=b.result()[3]
b.reset()
a.change_gamma(0.0)
for i in range(50):
    state=b.show_state()
    action=a.act(state)
    b.action(action)
result=b.result()[2]
plt.plot(range(1,len(result)+1),result)
plt.xlabel("episode")
plt.ylabel("step")
plt.show()
print(result)
result2=b.result()[1]
print(episode)