More than 1 year has passed since last update.

Nxで始めるゼロから作るディープラーニング４章ニューラルネットワークの学習

Last updated at 2022-01-11 / Posted at 2021-03-10

はじめに

本記事はElixirで機械学習/ディープラーニングができるようになるnumpy likeなライブラリ Nxを使って
「ゼロから作るDeep Learning ―Pythonで学ぶディープラーニングの理論と実装」
をElixirで書いていこうという記事になります。

今回は４章ニューラルネットワークの学習をやっていきます。各項目の詳細は書籍を参照して頂いて、本記事ではElixirのコードを補足するにとどめます。

量的、内容的に結構重いですが頑張っていきましょう。

準備編
 exla setup
1章 pythonの基本 -> とばします
2章パーセプトロン -> とばします
3章ニューラルネットワーク
 with exla
４章ニューラルネットワークの学習
５章誤差逆伝播法
 Nx.Defn.Kernel.grad
6章学習に関するテクニック -> とばします
7章畳み込みニューラルネットワーク

4.2.1 2乗和誤差

pythonコードは以下でべき乗は Nx.powerになります
0.5 * np.sum((y-t)**2)

loss.ex

defmodule Ch4.Loss do
  @moduledoc """
  Loss functions for chapter 4, written as `defn` numerical definitions.
  """
  import Nx.Defn

  # comment in exla cpu mode
  # @defn_compiler {EXLA, max_float_type: {:f, 64}}

  @doc """
  Sum-of-squares error between the prediction `y` and the target `t`.

  Computes `0.5 * sum((y - t)^2)`. The sum runs over every element, so the
  result is a scalar tensor.
  """
  defn mean_squared_error(y, t) do
    diff = y - t
    squared = Nx.power(diff, 2)
    Nx.sum(squared) * 0.5
  end
end

iex(1)> t = Nx.tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
#Nx.Tensor<
  s64[10]
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
>
iex(2)> y = Nx.tensor([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0])
#Nx.Tensor<
  f64[10]
  [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
>
iex(3)> Ch4.Loss.mean_squared_error(y, t)
#Nx.Tensor<
  f64
  0.09750000000000003
>

4.2.2 交差エントロピー誤差

こちらは最終的なミニバッチ対応版でbatch_sizeの指定がない場合は1で割るので特に何も起こりません
pythonに比べるとだいぶスッキリしてますね

def cross_entropy_error(y, t):
  """Return the cross-entropy error of predictions y against targets t.

  A 1-D input is treated as a batch of one sample. The summed error is
  divided by the number of rows in y. 1e-7 is added before the log so that
  log(0) never occurs.
  """
  if y.ndim == 1:
    # Promote a single sample to a (1, size) batch.
    y = y.reshape(1, y.size)
    t = t.reshape(1, t.size)
  n_samples = y.shape[0]
  log_y = np.log(y + 1e-7)
  return -np.sum(t * log_y) / n_samples

loss.ex

defmodule Ch4.Loss do
  @moduledoc """
  Loss functions for chapter 4, written as `defn` numerical definitions.
  """
  import Nx.Defn

  # comment in exla cpu mode
  # @defn_compiler {EXLA, max_float_type: {:f, 64}}

  @doc """
  Cross-entropy error of the prediction `y` against the target `t`.

  Computes `-sum(t * log(y + 1.0e-7)) / batch_size`. The small constant keeps
  the logarithm away from `log(0)`. `batch_size` defaults to 1, in which case
  the division leaves the summed error unchanged.
  """
  defn cross_entropy_error(y, t, batch_size \\ 1) do
    log_y = Nx.log(y + 1.0e-7)
    total = Nx.sum(log_y * t)
    Nx.negate(total) / batch_size
  end
end

iex(1)> t = Nx.tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
#Nx.Tensor<
  s64[10]
  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
>
iex(2)> y = Nx.tensor([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0])
#Nx.Tensor<
  f64[10]
  [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
>
iex(3)> Ch4.Loss.cross_entropy_error(y, t)
#Nx.Tensor<
  f64
  0.510825457099338
>

4.2.3 ミニバッチ学習

Dataset moduleにone_hot行列に変換するものがなかったので実装します
テンソル化する前のListなので Nx.tensor()でテンソル化するのをお忘れなく

dataset.ex

  @doc """
  Converts a list of integer class labels into a list of one-hot rows.

  `label` is a plain list, not a tensor; wrap the result in `Nx.tensor/1`
  before feeding it to Nx.

  `num_classes` fixes the width of every row. When it is omitted (`nil`) the
  width is inferred as `Enum.max(label) + 1`, which is only correct if the
  largest class actually occurs in `label`. Pass it explicitly for small or
  random batches. An empty `label` list yields `[]`.

  For example, `to_one_hot([0, 2, 1])` returns
  `[[1, 0, 0], [0, 0, 1], [0, 1, 0]]`.
  """
  def to_one_hot(label, num_classes \\ nil)

  def to_one_hot([], _num_classes), do: []

  # No explicit width given: fall back to the largest label seen.
  def to_one_hot(label, nil), do: to_one_hot(label, Enum.max(label) + 1)

  def to_one_hot(label, num_classes) when is_integer(num_classes) and num_classes > 0 do
    classes = Enum.to_list(0..(num_classes - 1))

    Enum.map(label, fn t ->
      Enum.map(classes, fn c -> if t == c, do: 1, else: 0 end)
    end)
  end

np.random.choiceもないので以下で代用します

# Test images and labels are still plain lists at this point (not tensors).
x_test = Dataset.test_image()
t_test = Dataset.test_label() |> Dataset.to_one_hot()
# x_test is a list, so count its rows with length/1; Nx.shape/1 expects a tensor.
test_size = length(x_test)
batch_size = 10
# Stand-in for np.random.choice: pick batch_size distinct row indices at random.
batch_mask = 0..(test_size - 1) |> Enum.take_random(batch_size)
x_rows = Enum.map(batch_mask, fn mask -> Enum.at(x_test, mask) end) |> Nx.tensor()
# Scale the batch by its own maximum value.
x_batch = Nx.divide(x_rows, Nx.reduce_max(x_rows))
t_batch = Enum.map(batch_mask, fn mask -> Enum.at(t_test, mask) end) |> Nx.tensor()

4.3.1 微分

これを参考に関数を引数に取れるようにします
https://qiita.com/piacerex/items/421753c033647b6cfb99

grad.ex

defmodule Ch4.Grad do
  @moduledoc """
  Numerical differentiation helpers for chapter 4.
  """
  import Nx.Defn

  # comment in exla cpu mode
  # @defn_compiler {EXLA, max_float_type: {:f, 64}}

  @doc """
  Central-difference approximation of the derivative of `func` at `tensor`.

  Evaluates `(func(tensor + h) - func(tensor - h)) / (2 * h)` with
  `h = 1.0e-4`. `func` is a one-argument function and defaults to the
  identity function.
  """
  def numerical_diff(func \\ &nop/1, tensor) do
    h = 1.0e-4
    forward = func.(Nx.add(tensor, h))
    backward = func.(Nx.subtract(tensor, h))
    Nx.divide(Nx.subtract(forward, backward), 2 * h)
  end

  # Identity fallback used when the caller supplies no function.
  defp nop(enum), do: enum
end

4.3.2 数値微分の例

function1 = fn (x) -> x |> Nx.power(2) |> Nx.multiply(0.01) |> Nx.add(Nx.multiply(x, 0.1)) end
#Function<44.79398840/1 in :erl_eval.expr/5>
iex(47)> Ch4.Grad.numerical_diff(function1, 5)
#Nx.Tensor<
  f64
  0.1999999999990898
>

4.3.3 偏微分

iex(1)> function_tmp1 = fn x0 -> Nx.multiply(x0, x0) |> Nx.add(Nx.power(4.0,2)) end
#Function<44.79398840/1 in :erl_eval.expr/5>
iex(2)> Ch4.Grad.numerical_diff(function_tmp1,3.0)
#Nx.Tensor<
  f64
  6.00000000000378
>
iex(3)> function_tmp2 = fn x1 -> Nx.power(3.0,2) |> Nx.add(Nx.multiply(x1,x1)) end    
#Function<44.79398840/1 in :erl_eval.expr/5>
iex(4)> Ch4.Grad.numerical_diff(function_tmp2,4.0)                                
#Nx.Tensor<
  f64
  7.999999999999119
>

4.4 勾配

pythonコードは一時的に保持する変数を作成して、最後に戻す等していますが、
Elixirは基本イミュータブルなのでEnum.mapでまわします

特定箇所の重みを変更するのが難しいので、一度リストにしてList.update_atで変更した後にNx.tensorで変換しています

grad.py

def numerical_gradient(f, x):
  """Approximate the gradient of f at x with central differences.

  x is indexed with a single integer and perturbed in place, so it is
  expected to be a 1-D float array. Each element is restored before the next
  one is touched, so x holds its original values again on return.

  Returns an array shaped like x whose idx-th entry is
  (f(x + h*e_idx) - f(x - h*e_idx)) / (2*h) with h = 1e-4.
  """
  h = 1e-4 # 0.0001
  grad = np.zeros_like(x) # array with the same shape as x
  for idx in range(x.size):
    tmp_val = x[idx]
    # f(x+h)
    x[idx] = tmp_val + h
    fxh1 = f(x)

    # f(x-h)
    x[idx] = tmp_val - h
    fxh2 = f(x)

    grad[idx] = (fxh1 - fxh2) / (2*h)
    x[idx] = tmp_val # restore the original value

  return grad

Ch4/grad.ex

defmodule Ch4.Grad do
  import Nx.Defn
...
  @doc """
  Approximates the gradient of `func` at `tensor` with central differences.

  For every flat index `i`, the element at `i` is shifted by `+h` and `-h`
  (`h = 1.0e-4`), and `(func(plus) - func(minus)) / (2 * h)` is collected.
  The collected values are reshaped to the shape of `tensor`, so `func` is
  expected to return a single value per call.

  Tensors are immutable, so each perturbed copy is built by editing a flat
  list and converting it back with `Nx.tensor/1`.

  NOTE(review): the perturbed copies take `Nx.tensor/1`'s default type rather
  than the type of `tensor`; confirm that this precision is intended.
  """
  def numerical_gradient(func \\ &nop/1, tensor) do
    h = 1.0e-4
    # Loop-invariant: flatten once and look the shape up once.
    shape = Nx.shape(tensor)
    flat = Nx.to_flat_list(tensor)

    0..(Nx.size(tensor) - 1)
    |> Enum.map(fn i ->
      plus = shifted(flat, shape, i, h)
      minus = shifted(flat, shape, i, -h)

      func.(plus)
      |> Nx.subtract(func.(minus))
      |> Nx.divide(2 * h)
      |> Nx.to_flat_list()
    end)
    |> Nx.tensor()
    |> Nx.reshape(shape)
  end

  # Rebuilds the tensor with `delta` added to the element at flat index `i`.
  defp shifted(flat, shape, i, delta) do
    flat
    |> List.update_at(i, &(&1 + delta))
    |> Nx.tensor()
    |> Nx.reshape(shape)
  end
end

iex(1)> function_2 = fn x -> Nx.power(x[0], 2) |> Nx.add(Nx.power(x[1], 2)) end
iex(2)> Ch4.Grad.numerical_gradient(function_2, Nx.tensor([3.0, 4.0]))
#Nx.Tensor<
  f32[2]
  [5.98907470703125, 8.001327514648438]
>

4.4.1 勾配法

先程のnumerical_gradientをstep_num回行って、
求められた勾配に学習率(lr)を掛けて引くだけなので特に難しい箇所はありません

ch4/grad.ex

defmodule Ch4.Grad do
  import Nx.Defn
...
  @doc """
  Minimises `f` by plain gradient descent starting from `tensor`.

  Repeats `x <- x - lr * numerical_gradient(f, x)` for `step_num` steps and
  returns the final position. `lr` is the learning rate (default `0.01`) and
  `step_num` the number of steps (default `100`). A `step_num` of zero or
  less returns `tensor` unchanged.
  """
  def gradient_descent(f, tensor, lr \\ 0.01, step_num \\ 100)

  # Guard against 1..0, which Elixir treats as a descending two-element range.
  def gradient_descent(_f, tensor, _lr, step_num) when step_num <= 0, do: tensor

  def gradient_descent(f, tensor, lr, step_num) do
    Enum.reduce(1..step_num, tensor, fn _step, acc ->
      grad = numerical_gradient(f, acc)
      # Move against the gradient: acc - lr * grad (not lr * grad - acc).
      Nx.subtract(acc, Nx.multiply(grad, lr))
    end)
  end
end

iex(1)> function_2 = fn x -> Nx.power(x[0], 2) |> Nx.add(Nx.power(x[1], 2)) end
iex(2)> Ch4.Grad.gradient_descent(function_2, Nx.tensor([-3.0,4.0]), 0.1)
#Nx.Tensor<
  f32[2]
  [-6.106581906806241e-10, 8.152838404384966e-10]
>

4.4.2 ニューラルネットワークに対する勾配

numerical_gradientを実際にニューラルネットワークで使っていきます

simple_net.ex

defmodule Ch4.SimpleNet do
  @moduledoc """
  Minimal single-layer network (2 inputs, 3 outputs) used to try out
  numerical gradients on a weight matrix.
  """

  @doc """
  Returns a fixed 2x3 weight tensor. The random initialisation is left
  commented out.
  """
  def init() do
    # Nx.random_normal({2,3})
    Nx.tensor([
      [0.47355232, 0.9977393, 0.84668094],
      [0.85557411, 0.03563661, 0.69422093]
    ])
  end

  @doc """
  Returns the dot product of the input `x` and the weights `w`.
  """
  def predict(x, w), do: Nx.dot(x, w)

  @doc """
  Cross-entropy loss of the softmax of `predict(x, w)` against the target `t`.
  """
  def loss(x, t, w) do
    scores = predict(x, w)
    probs = Ch3.Activation.softmax(scores)
    Ch4.Loss.cross_entropy_error(probs, t)
  end
end

iex(1)> net = Ch4.SimpleNet.init
#Nx.Tensor<
  f32[2][3]
  [
    [0.47355231642723083, 0.997739315032959, 0.8466809391975403],
    [0.8555741310119629, 0.03563661128282547, 0.6942209005355835]
  ]
>
iex(2)> x = Nx.tensor([0.6,0.9])
#Nx.Tensor<
  f32[2]
  [0.6000000238418579, 0.8999999761581421]
>
iex(3)> p = Ch4.SimpleNet.predict(x,net)
#Nx.Tensor<
  f32[3]
  [1.0541480779647827, 0.6307165622711182, 1.1328073740005493]
>
iex(4)> Nx.argmax(p)    
#Nx.Tensor<
  s64
  2
>
iex(5)> t = Nx.tensor([0,0,1])
#Nx.Tensor<
  s64[3]
  [0, 0, 1]
>
iex(6)> Ch4.SimpleNet.loss(x,t,net)
#Nx.Tensor<
  f32
  0.9280683994293213
>
iex(7)> f = fn w -> Ch4.SimpleNet.loss(Nx.tensor([0.6,0.9]),Nx.tensor([0,0,1]),w) end
#Function<44.79398840/1 in :erl_eval.expr/5>
iex(8)> Ch4.Grad.numerical_gradient(f, net)
#Nx.Tensor<
  f32[2][3]
  [
    [0.2193450927734375, 0.14394521713256836, -0.36269426345825195],
    [0.32901763916015625, 0.2154707908630371, -0.5444884300231934]
  ]
>

4.5.1 ２層ニューラルネットワークのクラス

実際に学習を行うNNを実装していきます

numerical_gradientはNxの方に勾配を求める関数 grad()があるのでそちらを使います！
grad()はdefnで定義した関数内でしか使用できないので注意してください

numerical_gradientの引数を{_w1, _b1, _w2, _b2} = paramsとしているのは
paramsが４つの要素を持つタプルであるということを示す必要があるため、この形になっています
paramsだけですとエラーになりました

ch4/two_layer_net.ex

defmodule Ch4.TwoLayerNet do
  @moduledoc """
  Two-layer network (sigmoid hidden layer, softmax output) written with `defn`.

  Parameters travel as a 4-tuple `{w1, b1, w2, b2}`.
  """
  import Nx.Defn

  # @defn_compiler {EXLA, max_float_type: {:f, 32}}

  @doc """
  Builds the initial parameter tuple `{w1, b1, w2, b2}`.

  Weights are drawn with `Nx.random_normal/3` using the arguments `0.0` and
  `0.1`. Biases come from `Nx.random_uniform/4` with both bounds set to 0,
  which is used here to obtain zero-filled vectors.
  """
  defn init_params(input_size \\ 784, hidden_size \\ 100, output_size \\ 10) do
    weights_1 = Nx.random_normal({input_size, hidden_size}, 0.0, 0.1)
    bias_1 = Nx.random_uniform({hidden_size}, 0, 0, type: {:f, 64})
    weights_2 = Nx.random_normal({hidden_size, output_size}, 0.0, 0.1)
    bias_2 = Nx.random_uniform({output_size}, 0, 0, type: {:f, 64})
    {weights_1, bias_1, weights_2, bias_2}
  end

  @doc """
  Forward pass: `softmax(sigmoid(x . w1 + b1) . w2 + b2)`.
  """
  defn predict({w1, b1, w2, b2}, x) do
    hidden = Ch3.Activation.sigmoid(Nx.dot(x, w1) + b1)
    Ch3.Activation.softmax(Nx.dot(hidden, w2) + b2)
  end

  @doc """
  Cross-entropy error of `predict/2` on `x` against `t`, divided by `batch_size`.
  """
  defn loss({w1, b1, w2, b2}, x, t, batch_size) do
    probs = predict({w1, b1, w2, b2}, x)
    Ch4.Loss.cross_entropy_error(probs, t, batch_size)
  end

  @doc """
  Fraction of rows whose predicted class (argmax along axis 1) equals the
  class marked in `t`.
  """
  defn accuracy({w1, b1, w2, b2}, x, t) do
    predicted = Nx.argmax(predict({w1, b1, w2, b2}, x), axis: 1)
    expected = Nx.argmax(t, axis: 1)
    Nx.mean(Nx.equal(predicted, expected))
  end

  @doc """
  Gradient of `loss/4` with respect to every parameter, returned via `grad`.

  Despite the name, this uses Nx's `grad` rather than finite differences.
  The tuple pattern in the head is kept on purpose: the article reports that
  a bare `params` argument raised an error.
  """
  defn numerical_gradient({_w1, _b1, _w2, _b2} = params, x, t, batch_size) do
    grad(params, loss(params, x, t, batch_size))
  end
end

iex(1)> params = Ch4.TwoLayerNet.init_params()
iex(2)> x = Nx.random_uniform({100, 784})
iex(3)> t = Enum.map(0..99, fn _ -> Enum.random(0..9) end) |> Dataset.to_one_hot |> Nx.tensor
iex(4)> { grad_w1, grad_b1, grad_w2, grad_b2 } = Ch4.TwoLayerNet.numerical_gradient(params, x, t, 100)
{#Nx.Tensor<
   f64[784][100]
...
>, #Nx.Tensor<
   f64[100]
...
>,
 #Nx.Tensor<
   f64[100][10]
...
>, #Nx.Tensor<
   f64[10]
...
>}

4.5.2 ミニバッチ学習の実装

grad関数で勾配が取得できることがわかったので、実際に重みとバイアスを更新していく関数を実装していきます
学習にはExlaのCPUを使っても時間がかかるので覚悟しておきましょう

Enum.reduceで複数の値を更新したい場合はMapを使えば問題なく行えます

ch4/two_layer_net.ex

defmodule Ch4.TwoLayerNet do
...
  # Performs one gradient-descent step: takes the gradient of `loss/4` with
  # respect to every parameter and returns the parameters moved by `lr` times
  # that gradient, as a new `{w1, b1, w2, b2}` tuple.
  defn update({w1, b1, w2, b2} = params, x, t, lr, batch_size) do
    {dw1, db1, dw2, db2} = grad(params, loss(params, x, t, batch_size))

    new_w1 = Nx.subtract(w1, Nx.multiply(dw1, lr))
    new_b1 = Nx.subtract(b1, Nx.multiply(db1, lr))
    new_w2 = Nx.subtract(w2, Nx.multiply(dw2, lr))
    new_b2 = Nx.subtract(b2, Nx.multiply(db2, lr))

    {new_w1, new_b1, new_w2, new_b2}
  end

  # Draws `batch_size` random rows from the list-based training data and
  # returns `{x_batch, t_batch}` as tensors. The same row indices are used for
  # images and labels, and the image batch is divided by its own maximum value.
  def mini_batch(x_train, t_train, batch_size) do
    last_index = Enum.count(x_train) - 1
    picks = Enum.take_random(0..last_index, batch_size)

    raw_x = Nx.tensor(for idx <- picks, do: Enum.at(x_train, idx))
    x_batch = Nx.divide(raw_x, Nx.reduce_max(raw_x))
    t_batch = Nx.tensor(for idx <- picks, do: Enum.at(t_train, idx))

    {x_batch, t_batch}
  end

  # Trains for 1000 mini-batch iterations (batch size 100, learning rate 0.1),
  # starting from `params`. Each iteration prints the loss history so far; the
  # final loss list (newest first) is inspected and returned.
  def train(params) do
    x_train = Dataset.train_image()
    t_train = Dataset.to_one_hot(Dataset.train_label())
    IO.puts("data load")
    iterations = 1000
    batch_size = 100
    lr = 0.1
    initial = %{params: params, loss_list: []}

    result =
      Enum.reduce(1..iterations, initial, fn i, %{params: current, loss_list: losses} ->
        IO.puts("#{i} epoch start")
        {x_batch, t_batch} = mini_batch(x_train, t_train, batch_size)
        updated = update(current, x_batch, t_batch, lr, batch_size)
        # The recorded loss is measured with the parameters from before this update.
        batch_loss = Nx.to_number(loss(current, x_batch, t_batch, batch_size))
        losses = [batch_loss | losses]

        IO.inspect(Enum.reverse(losses))
        %{params: updated, loss_list: losses}
      end)

    IO.inspect(result.loss_list)
  end
end

4.5.3 テストデータで評価

1epoch = 100iterates として rem(i,batch_size)が０のときにaccuracyを計るようにしていきます
accuracyも時間がかかるので覚悟しておきましょう・・・

書籍にも、numerical_gradientを使用した学習に時間がかかるので、飛ばして次の高速版に行っても構いませんとあるので、
全体の流れだけ追って、特にエラーがないようでしたら途中で止めても大丈夫かと思います

ch4/two_layer_net.ex

defmodule Ch4.TwoLayerNet do
...
  # Trains for 1000 mini-batch iterations (batch size 100, learning rate 0.1)
  # and measures accuracy once per epoch, where one epoch is taken to be 100
  # iterations. Training accuracy is measured on the current mini-batch, test
  # accuracy on the whole test set. The final loss list (newest first) is
  # inspected and returned.
  def train(params) do
    x_train = Dataset.train_image()
    t_train = Dataset.train_label() |> Dataset.to_one_hot()
    x_test_raw = Dataset.test_image() |> Nx.tensor()
    x_test = Nx.divide(x_test_raw, Nx.reduce_max(x_test_raw))
    t_test = Dataset.test_label() |> Dataset.to_one_hot() |> Nx.tensor()
    IO.puts("data load")
    iteras_num = 1000
    batch_size = 100
    # Accuracy over the full test set is slow, so evaluate only once per epoch.
    iter_per_epoch = 100
    lr = 0.1

    result =
      Enum.reduce(
        1..iteras_num,
        %{params: params, loss_list: [], train_acc: [], test_acc: []},
        fn i, acc ->
          IO.puts("#{i} epoch start")
          {x_batch, t_batch} = mini_batch(x_train, t_train, batch_size)
          params = update(acc.params, x_batch, t_batch, lr, batch_size)

          # The recorded loss is measured with the parameters from before this update.
          train_loss_list = [
            loss(acc.params, x_batch, t_batch, batch_size) |> Nx.to_number() | acc.loss_list
          ]

          IO.inspect(train_loss_list |> Enum.reverse())

          if rem(i, iter_per_epoch) == 0 do
            train_acc = [accuracy(params, x_batch, t_batch) |> Nx.to_number() | acc.train_acc]
            test_acc = [accuracy(params, x_test, t_test) |> Nx.to_number() | acc.test_acc]
            # Print the lists including the measurement just taken.
            IO.inspect(train_acc)
            IO.inspect(test_acc)

            %{acc | params: params, loss_list: train_loss_list, train_acc: train_acc, test_acc: test_acc}
          else
            %{acc | params: params, loss_list: train_loss_list}
          end
        end
      )

    IO.inspect(result.loss_list)
  end
end

最後に

実装は以上になります、ありがとうございました

今回のコード
https://github.com/thehaigo/nx_dl/commit/39ea4711d4e35a8dbb2a4475466623ecb05e68c9
https://github.com/thehaigo/nx_dl/commit/0c0f1f3661637da6a7c9c2591413ea4489e6ace8

参考ページ

https://qiita.com/piacerex/items/421753c033647b6cfb99
https://github.com/elixir-nx/nx/blob/main/exla/examples/mnist.exs

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

Nxで始めるゼロから作るディープラーニング ４章ニューラルネットワークの学習

はじめに

4.2.1 2乗和誤差

4.2.2 交差エントロピー誤差

4.2.3 ミニバッチ学習

4.3.1 微分

4.3.2 数値微分の例

4.3.3 偏微分

4.4 勾配

4.4.1 勾配法

4.4.2 ニューラルネットワークに対する勾配

4.5.1 ２層ニューラルネットワークのクラス

4.5.2 ミニバッチ学習の実装

4.5.3 テストデータで評価

最後に

参考ページ

Nxで始めるゼロから作るディープラーニング４章ニューラルネットワークの学習