DQN code that doesn't work well

Posted at 2023-12-21

I tried to solve a maze with a Q-network and a target network, each with 2 input nodes, 3 hidden layers of 3 nodes each, and 4 output nodes.
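
For context, the update the classes below are meant to implement is the standard DQN target y = r + γ·max_a Q_target(s′, a), falling back to y = r when the episode ends, with the Q-network trained on the error y − Q(s, a). (In the code below the discount rate is called dr, while gamma is the ε of the ε-greedy policy.) A minimal sketch of that target, assuming NumPy and a hypothetical helper name:

import numpy as np

def dqn_target(reward, next_q_values, episode_ended, discount):
    # y = r at the end of an episode, otherwise y = r + discount * max_a Q_target(s', a)
    if episode_ended:
        return reward
    return reward + discount * np.max(next_q_values)

# the error fed back for the chosen action would then be
#   dqn_target(...) - q_values[action]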

Deeplearning.py
class Deepnetwork:
    def __init__(self,input_layer,hidden_layer,output_layer,learningrate):
        self.count_output=output_layer
        self.weight=[np.random.normal(0.0,pow(hidden_layer[0],-0.5),(hidden_layer[0],input_layer))]# input-to-hidden weights, drawn from a normal distribution with mean 0 and standard deviation hidden_layer[0]**-0.5
        for i in range(len(hidden_layer)-1):
            self.weight.append(np.random.normal(0.0,pow(hidden_layer[i+1],-0.5),(hidden_layer[i+1],hidden_layer[i])))# hidden-to-hidden weights, initialised the same way
        self.weight.append(np.random.normal(0.0,pow(self.count_output,-0.5),(self.count_output,hidden_layer[-1])))# hidden-to-output weights, initialised the same way
        self.lr=learningrate
        self.count_weight=len(self.weight)-1
    def caluculation(self,inputer):# forward pass with leaky ReLU activations
      self.state=[inputer]
      for i in range(self.count_weight+1):
        self.save=np.dot(self.weight[i],self.state[i])
        self.state.append(np.where(self.save>0,self.save,0.01*self.save))
      return self.state[-1]

    def learning(self,error):# backpropagation
        error[np.isnan(error)]=0.0# replace any NaN in the error with 0
        self.error=[error]
        for i in range(self.count_weight):
          self.the_error=np.dot(self.weight[self.count_weight-i].T,self.error[i])
          self.the_error[np.isnan(self.the_error)]=0.0# replace any NaN with 0
          self.error.append(self.the_error)
        # weight update: learning rate × error × derivative of the leaky ReLU × output of the previous layer
        for i in range(self.count_weight+1):
          self.weight[self.count_weight-i]+=self.lr*np.dot((self.error[i]*np.where(np.dot(self.weight[self.count_weight-i],self.state[self.count_weight-i])>0,1.0,0.01)).reshape(-1,1),np.array(self.state[self.count_weight-i]).reshape(1,-1))

    def get_weight(self):
      return self.weight
    def change_weight(self,weight):
      self.weight=weight
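
Deepnetwork can be sanity-checked on its own before it is wired into the agent. A small sketch with made-up inputs, using the same 2-(3,3,3)-4 shape as below (illustration only, not part of the original code):

import numpy as np

net=Deepnetwork(2,(3,3,3),4,0.3)          # 2 inputs, three hidden layers of 3 nodes, 4 outputs
q=net.caluculation(np.array([2.0,5.0]))   # forward pass: returns the 4 Q-values
error=np.zeros(4)
error[1]=0.1                              # pretend action 1 had a TD error of 0.1
net.learning(error)                       # one gradient step on all weight matrices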
Qlearning.py
class Qlearning:
    def __init__(self,lr,dr,gamma,c_state,c_action):
        self.lr=lr# learning rate
        self.dr=dr# discount rate
        self.gamma=gamma# ε for the ε-greedy policy
        self.c_action=c_action# number of possible actions
        self.q_network=Deepnetwork(c_state,(3,3,3),c_action,lr)# Q-network: c_state input nodes, three hidden layers with 3 nodes each, c_action output nodes
        self.t_network=Deepnetwork(c_state,(3,3,3),c_action,lr)# target network with the same architecture
        self.memory=[]
    def act(self,state):
        self.state=state
# extra debug output to check the learning progress
        print(state)

# ε-greedy policy
        if self.gamma>np.random.uniform():
            self.action=np.random.randint(0,self.c_action)
        else:
          self.action=np.argmax(self.t_network.caluculation(state))

# extra debug output to check the learning progress: print(self.t_network.caluculation(state))
        return self.action

    def note(self,next_state,reward,is_no_next):

      #if reward>=1.0:
         #self.reward=1.0
      #elif reward<=-1.0:
         #self.reward=-1.0
      #else:
         #self.reward=0.0
      self.reward=reward
# store the state, action, reward, next state, and whether the episode ended (no next action)
      self.near_memory=[self.state,self.action,self.reward,next_state,is_no_next]
      self.memory.append(self.near_memory)
      self.c_memory=len(self.memory)

    def learn(self,batch,batchsize,hard,changer):
      if batch==True:
        if batchsize>self.c_memory:
          self.c_sample=self.c_memory
        else:
          self.c_sample=batchsize
        self.sample=random.sample(self.memory,self.c_sample)
        self.c_the_action=np.repeat(0.0,self.c_action)
        self.sum=[]
        for i in range(self.c_sample):
          self.the_action=self.sample[i][1]
          self.error=np.repeat(0.0,self.c_action)
          self.error[self.the_action]=self.sample[i][2]+np.where(self.sample[i][4]==True,0.0,self.dr*max(self.t_network.caluculation(self.sample[i][3])))-self.q_network.caluculation(self.sample[i][0])[self.the_action]
          self.c_the_action[self.the_action]+=1.0
          self.sum.append(self.error)
        self.sum=sum(self.sum)
        for i in range(self.c_action):
          if not self.c_the_action[i]==0.0:
            self.sum[i]/=self.c_the_action[i]
        self.q_network.learning(self.sum)
      else:
        self.the_action=self.near_memory[1]
        self.error=np.repeat(0.0,self.c_action)
        self.error[self.the_action]=self.near_memory[2]+np.where(self.near_memory[4]==True,0.0,self.dr*max(self.t_network.caluculation(self.near_memory[3])))-self.q_network.caluculation(self.near_memory[0])[self.the_action]
        self.q_network.learning(self.error)
      if hard==True:
          # hard update: copy the Q-network weights so the target network does not share the same arrays
          self.t_network.change_weight([w.copy() for w in self.q_network.get_weight()])
      else:
           # soft (Polyak) update: theta_target <- theta_target + changer*(theta_q - theta_target)
           self.q_weight=self.q_network.get_weight()
           self.t_weight=self.t_network.get_weight()
           for i in range(len(self.q_weight)):
              self.t_weight[i]+=changer*(self.q_weight[i]-self.t_weight[i])
           self.t_network.change_weight(self.t_weight)
    def change_gamma(self,gamma):
      self.gamma=gamma
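
For reference, the soft update that learn's else branch is meant to perform is the Polyak average θ_target ← θ_target + τ(θ_q − θ_target). A standalone sketch of that update over a list of weight matrices (a hypothetical helper; tau plays the role of changer):

def soft_update(q_weights,t_weights,tau):
    # blend each layer's weights: theta_target <- tau*theta_q + (1-tau)*theta_target
    return [tau*qw+(1.0-tau)*tw for qw,tw in zip(q_weights,t_weights)]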
Meiro.py
class meiro:
    def __init__(self,world,spawn):
        self.world=world
        self.actions={
            "UP": 0,
            "DOWN": 1,
            "LEFT": 2,
            "RIGHT": 3
        }
        self.filed_type={
            "N": 0,  # normal floor
            "G": 1,  # goal
            "W": 2,  # wall
            "T": 3,  # trap
        }
        self.spawn=spawn
        self.position=spawn
        self.save_position=[spawn]
        self.count_goal=[]
        self.episode_reward=[]
        self.rewards=[]
        self.episode_step=0
        self.steps=[]
        self.episode=0
    def show_state(self):
        return self.position

    def action(self,act):
        # convert the 1-based position into 0-based list indices
        self.xp=self.position[0]-1
        self.yp=self.position[1]-1
        if act==self.actions["UP"]:
           self.yp-=1
        if act==self.actions["DOWN"]:
           self.yp+=1
        if act==self.actions["LEFT"]:
           self.xp-=1
        if act==self.actions["RIGHT"]:
           self.xp+=1
        self.episode_step+=1
        if self.is_it_ok()==True:
            self.next_position=(self.xp+1,self.yp+1)# convert back to the 1-based form
            if self.world[self.yp][self.xp]==self.filed_type["N"]:
                if self.next_position in self.save_position:
                    self.reward=-0.1
                else:
                    self.reward=0.1
                    self.save_position.append(self.next_position)
                self.position=self.next_position
                self.episode_reward.append(self.reward)
                self.is_no_next=False

            if self.world[self.yp][self.xp]==self.filed_type["G"]:
                self.reward=0.2
                self.position=self.spawn
                self.episode_reward.append(self.reward)
                self.rewards.append(sum(self.episode_reward))
                self.episode_reward=[]
                self.count_goal.append(1)
                self.is_no_next=True
                self.steps.append(self.episode_step)
                self.episode_step=0
                self.episode+=1
                self.save_position=[self.spawn]
            if self.world[self.yp][self.xp]==self.filed_type["T"]:
                self.reward=-0.2
                self.position=self.spawn
                self.episode_reward.append(self.reward)
                self.rewards.append(sum(self.episode_reward))
                self.episode_reward=[]
                self.count_goal.append(0)
                self.is_no_next=True
                self.steps.append(self.episode_step)
                self.episode_step=0
                self.episode+=1
                self.save_position=[self.spawn]

        else:
            self.reward=-0.1
            self.next_position=self.position
            self.episode_reward.append(self.reward)
            self.is_no_next=False
        return (self.next_position,self.reward,self.is_no_next)

    def is_it_ok(self):
        if self.yp>=len(self.world) or self.yp<0:
            return False
        elif self.xp>=len(self.world[self.yp]) or self.xp<0:
            return False
        elif self.world[self.yp][self.xp]==self.filed_type["W"]:
            return False
        return True

    def result(self):
        return (self.rewards,self.count_goal,self.steps,self.episode)
    def reset(self):
      self.rewards=[]
      self.count_goal=[]
      self.steps=[]
      self.episode_step=0
      self.position=self.spawn
      self.episode_reward=[]
      self.episode=0
      self.save_position=[self.spawn]
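
The environment uses 1-based (x, y) positions while world is indexed row-major as world[y-1][x-1], so spawn (2, 5) is the second column of the bottom row. A quick check of a single step, using the world defined in Main.py below (illustration only):

world=[[3, 2, 0, 1],
       [0, 0, 0, 2],
       [2, 0, 2, 0],
       [2, 0, 2, 0],
       [2, 0, 2, 0]]
env=meiro(world,(2,5))
print(env.action(0))   # action 0 = "UP"; expected output: ((2, 4), 0.1, False)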
Main.py
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy.special import expit

a=Qlearning(0.3,0.3,0.3,2,4)
world=[[3, 2, 0, 1],
       [0, 0, 0, 2],
       [2, 0, 2, 0],
       [2, 0, 2, 0],
       [2, 0, 2, 0]]
world2=[[0,1],
        [0],
        [0],
        [0],
        [0]]
b=meiro(world,(2,5))
for i in range(60000):
    state=b.show_state()
    action=a.act(state)
    data=b.action(action)
    a.note(data[0],data[1],data[2])
    a.learn(False,10,True,0.5)
episode=b.result()[3]
b.reset()
a.change_gamma(0.0)
for i in range(50):
  state=b.show_state()
  action=a.act(state)
  b.action(action)
result=b.result()[2]
plt.plot(range(1,len(result)+1),result)
plt.xlabel("episode")
plt.ylabel("step")
plt.show()
print(result)
result2=b.result()[1]
print(episode)
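
result2 is count_goal from the evaluation run: 1 for each episode that ended at the goal, 0 for each that ended in a trap. A quick way to summarise it (illustration only):

print(sum(result2),"goals out of",len(result2),"finished episodes")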