DQN code that doesn't work well

Posted at 2023-12-21

I tried to solve a maze with a Q-network and a target network, each with 2 input nodes, 3 hidden layers of 3 nodes each, and 4 output nodes.
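
For context, the update the classes below are meant to implement is the standard DQN target y = r + γ·max_a Q_target(s′, a), falling back to y = r when the episode ends, with the Q-network trained on the error y − Q(s, a). (In the code below the discount rate is called dr, while gamma is the ε of the ε-greedy policy.) A minimal sketch of that target, assuming NumPy and a hypothetical helper name:

import numpy as np

def dqn_target(reward, next_q_values, episode_ended, discount):
    # y = r at the end of an episode, otherwise y = r + discount * max_a Q_target(s', a)
    if episode_ended:
        return reward
    return reward + discount * np.max(next_q_values)

# the error fed back for the chosen action would then be
#   dqn_target(...) - q_values[action]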

Deeplearning.py
class Deepnetwork:
    def __init__(self,input_layer,hidden_layer,output_layer,learningrate):
        self.count_output=output_layer
        self.weight=[np.random.normal(0.0,pow(hidden_layer[0],-0.5),(hidden_layer[0],input_layer))]# input-to-hidden weights, drawn from a normal distribution with mean 0 and standard deviation hidden_layer[0]**-0.5
        for i in range(len(hidden_layer)-1):
            self.weight.append(np.random.normal(0.0,pow(hidden_layer[i+1],-0.5),(hidden_layer[i+1],hidden_layer[i])))# hidden-to-hidden weights, initialised the same way
        self.weight.append(np.random.normal(0.0,pow(self.count_output,-0.5),(self.count_output,hidden_layer[-1])))# hidden-to-output weights, initialised the same way
        self.lr=learningrate
        self.count_weight=len(self.weight)-1
    def caluculation(self,inputer):# forward pass with leaky ReLU activations
      self.state=[inputer]
      for i in range(self.count_weight+1):
        self.save=np.dot(self.weight[i],self.state[i])
        self.state.append(np.where(self.save>0,self.save,0.01*self.save))
      return self.state[-1]

    def learning(self,error):# backpropagation
        error[np.isnan(error)]=0.0# replace any NaN in the error with 0
        self.error=[error]
        for i in range(self.count_weight):
          self.the_error=np.dot(self.weight[self.count_weight-i].T,self.error[i])
          self.the_error[np.isnan(self.the_error)]=0.0# replace any NaN with 0
          self.error.append(self.the_error)
        # weight update: learning rate × error × derivative of the leaky ReLU × output of the previous layer
        for i in range(self.count_weight+1):
          self.weight[self.count_weight-i]+=self.lr*np.dot((self.error[i]*np.where(np.dot(self.weight[self.count_weight-i],self.state[self.count_weight-i])>0,1.0,0.01)).reshape(-1,1),np.array(self.state[self.count_weight-i]).reshape(1,-1))

    def get_weight(self):
      return self.weight
    def change_weight(self,weight):
      self.weight=weight
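
Deepnetwork can be sanity-checked on its own before it is wired into the agent. A small sketch with made-up inputs, using the same 2-(3,3,3)-4 shape as below (illustration only, not part of the original code):

import numpy as np

net=Deepnetwork(2,(3,3,3),4,0.3)          # 2 inputs, three hidden layers of 3 nodes, 4 outputs
q=net.caluculation(np.array([2.0,5.0]))   # forward pass: returns the 4 Q-values
error=np.zeros(4)
error[1]=0.1                              # pretend action 1 had a TD error of 0.1
net.learning(error)                       # one gradient step on all weight matrices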
Qlearning.py
class Qlearning:
    def __init__(self,lr,dr,gamma,c_state,c_action):
        self.lr=lr# learning rate
        self.dr=dr# discount rate
        self.gamma=gamma# ε for the ε-greedy policy
        self.c_action=c_action# number of possible actions
        self.q_network=Deepnetwork(c_state,(3,3,3),c_action,lr)# Q-network: c_state input nodes, three hidden layers with 3 nodes each, c_action output nodes
        self.t_network=Deepnetwork(c_state,(3,3,3),c_action,lr)# target network with the same architecture
        self.memory=[]
    def act(self,state):
        self.state=state
# extra debug output to check the learning progress
        print(state)

# ε-greedy policy
        if self.gamma>np.random.uniform():
            self.action=np.random.randint(0,self.c_action)
        else:
          self.action=np.argmax(self.t_network.caluculation(state))

# extra debug output to check the learning progress: print(self.t_network.caluculation(state))
        return self.action

    def note(self,next_state,reward,is_no_next):

      #if reward>=1.0:
         #self.reward=1.0
      #elif reward<=-1.0:
         #self.reward=-1.0
      #else:
         #self.reward=0.0
      self.reward=reward
# store the state, action, reward, next state, and whether the episode ended (no next action)
      self.near_memory=[self.state,self.action,self.reward,next_state,is_no_next]
      self.memory.append(self.near_memory)
      self.c_memory=len(self.memory)

    def learn(self,batch,batchsize,hard,changer):
      if batch==True:
        if batchsize>self.c_memory:
          self.c_sample=self.c_memory
        else:
          self.c_sample=batchsize
        self.sample=random.sample(self.memory,self.c_sample)
        self.c_the_action=np.repeat(0.0,self.c_action)
        self.sum=[]
        for i in range(self.c_sample):
          self.the_action=self.sample[i][1]
          self.error=np.repeat(0.0,self.c_action)
          self.error[self.the_action]=self.sample[i][2]+np.where(self.sample[i][4]==True,0.0,self.dr*max(self.t_network.caluculation(self.sample[i][3])))-self.q_network.caluculation(self.sample[i][0])[self.the_action]
          self.c_the_action[self.the_action]+=1.0
          self.sum.append(self.error)
        self.sum=sum(self.sum)
        for i in range(self.c_action):
          if not self.c_the_action[i]==0.0:
            self.sum[i]/=self.c_the_action[i]
        self.q_network.learning(self.sum)
      else:
        self.the_action=self.near_memory[1]
        self.error=np.repeat(0.0,self.c_action)
        self.error[self.the_action]=self.near_memory[2]+np.where(self.near_memory[4]==True,0.0,self.dr*max(self.t_network.caluculation(self.near_memory[3])))-self.q_network.caluculation(self.near_memory[0])[self.the_action]
        self.q_network.learning(self.error)
      if hard==True:
          # hard update: copy the Q-network weights so the target network does not share the same arrays
          self.t_network.change_weight([w.copy() for w in self.q_network.get_weight()])
      else:
           # soft (Polyak) update: theta_target <- theta_target + changer*(theta_q - theta_target)
           self.q_weight=self.q_network.get_weight()
           self.t_weight=self.t_network.get_weight()
           for i in range(len(self.q_weight)):
              self.t_weight[i]+=changer*(self.q_weight[i]-self.t_weight[i])
           self.t_network.change_weight(self.t_weight)
    def change_gamma(self,gamma):
      self.gamma=gamma
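
For reference, the soft update that learn's else branch is meant to perform is the Polyak average θ_target ← θ_target + τ(θ_q − θ_target). A standalone sketch of that update over a list of weight matrices (a hypothetical helper; tau plays the role of changer):

def soft_update(q_weights,t_weights,tau):
    # blend each layer's weights: theta_target <- tau*theta_q + (1-tau)*theta_target
    return [tau*qw+(1.0-tau)*tw for qw,tw in zip(q_weights,t_weights)]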
Meiro.py
class meiro:
    def __init__(self,world,spawn):
        self.world=world
        self.actions={
            "UP": 0,
            "DOWN": 1,
            "LEFT": 2,
            "RIGHT": 3
        }
        self.filed_type={
            "N": 0,  # normal floor
            "G": 1,  # goal
            "W": 2,  # wall
            "T": 3,  # trap
        }
        self.spawn=spawn
        self.position=spawn
        self.save_position=[spawn]
        self.count_goal=[]
        self.episode_reward=[]
        self.rewards=[]
        self.episode_step=0
        self.steps=[]
        self.episode=0
    def show_state(self):
        return self.position

    def action(self,act):
        # convert the 1-based position into 0-based list indices
        self.xp=self.position[0]-1
        self.yp=self.position[1]-1
        if act==self.actions["UP"]:
           self.yp-=1
        if act==self.actions["DOWN"]:
           self.yp+=1
        if act==self.actions["LEFT"]:
           self.xp-=1
        if act==self.actions["RIGHT"]:
           self.xp+=1
        self.episode_step+=1
        if self.is_it_ok()==True:
            self.next_position=(self.xp+1,self.yp+1)# convert back to the 1-based form
            if self.world[self.yp][self.xp]==self.filed_type["N"]:
                if self.next_position in self.save_position:
                    self.reward=-0.1
                else:
                    self.reward=0.1
                    self.save_position.append(self.next_position)
                self.position=self.next_position
                self.episode_reward.append(self.reward)
                self.is_no_next=False

            if self.world[self.yp][self.xp]==self.filed_type["G"]:
                self.reward=0.2
                self.position=self.spawn
                self.episode_reward.append(self.reward)
                self.rewards.append(sum(self.episode_reward))
                self.episode_reward=[]
                self.count_goal.append(1)
                self.is_no_next=True
                self.steps.append(self.episode_step)
                self.episode_step=0
                self.episode+=1
                self.save_position=[self.spawn]
            if self.world[self.yp][self.xp]==self.filed_type["T"]:
                self.reward=-0.2
                self.position=self.spawn
                self.episode_reward.append(self.reward)
                self.rewards.append(sum(self.episode_reward))
                self.episode_reward=[]
                self.count_goal.append(0)
                self.is_no_next=True
                self.steps.append(self.episode_step)
                self.episode_step=0
                self.episode+=1
                self.save_position=[self.spawn]

        else:
            self.reward=-0.1
            self.next_position=self.position
            self.episode_reward.append(self.reward)
            self.is_no_next=False
        return (self.next_position,self.reward,self.is_no_next)

    def is_it_ok(self):
        if self.yp>=len(self.world) or self.yp<0:
            return False
        elif self.xp>=len(self.world[self.yp]) or self.xp<0:
            return False
        elif self.world[self.yp][self.xp]==self.filed_type["W"]:
            return False
        return True

    def result(self):
        return (self.rewards,self.count_goal,self.steps,self.episode)
    def reset(self):
      self.rewards=[]
      self.count_goal=[]
      self.steps=[]
      self.episode_step=0
      self.position=self.spawn
      self.episode_reward=[]
      self.episode=0
      self.save_position=[self.spawn]
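
The environment uses 1-based (x, y) positions while world is indexed row-major as world[y-1][x-1], so spawn (2, 5) is the second column of the bottom row. A quick check of a single step, using the world defined in Main.py below (illustration only):

world=[[3, 2, 0, 1],
       [0, 0, 0, 2],
       [2, 0, 2, 0],
       [2, 0, 2, 0],
       [2, 0, 2, 0]]
env=meiro(world,(2,5))
print(env.action(0))   # action 0 = "UP"; expected output: ((2, 4), 0.1, False)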
Main.py
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy.special import expit

a=Qlearning(0.3,0.3,0.3,2,4)
world=[[3, 2, 0, 1],
       [0, 0, 0, 2],
       [2, 0, 2, 0],
       [2, 0, 2, 0],
       [2, 0, 2, 0]]
world2=[[0,1],
        [0],
        [0],
        [0],
        [0]]
b=meiro(world,(2,5))
for i in range(60000):
    state=b.show_state()
    action=a.act(state)
    data=b.action(action)
    a.note(data[0],data[1],data[2])
    a.learn(False,10,True,0.5)
episode=b.result()[3]
b.reset()
a.change_gamma(0.0)
for i in range(50):
  state=b.show_state()
  action=a.act(state)
  b.action(action)
result=b.result()[2]
plt.plot(range(1,len(result)+1),result)
plt.xlabel("episode")
plt.ylabel("step")
plt.show()
print(result)
result2=b.result()[1]
print(episode)
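
result2 is count_goal from the evaluation run: 1 for each episode that ended at the goal, 0 for each that ended in a trap. A quick way to summarise it (illustration only):

print(sum(result2),"goals out of",len(result2),"finished episodes")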