# ED法の重みをGPUで並列更新したら3,000倍高速化した。

Posted at 2024-04-30

# GPUを使った重みの並列更新

`forward`で 3.02 s ± 491 ms

となりました（RTX3070）。
シーケンシャルな`forward`と比較して約3,000倍の速度となり、並列計算の圧倒的な力を感じます。

`forward`で 363 ms ± 90.3 ms
`update`で 3.26 s ± 38.2 ms
`forward`の結果は逆転していますね。

``````python
import numpy as np
import cupy as cp
# cp.cuda.set_allocator(cp.cuda.MemoryPool().malloc)

def sigmoid_(x, u0=.4):
    """Return the element-wise logistic sigmoid of ``2 * x / u0``.

    Args:
        x: Array of pre-activations. It is not modified.
        u0: Scale parameter; the input is multiplied by ``2 / u0`` before
            the sigmoid is applied.

    Returns:
        A new array with the same shape as ``x`` holding
        ``1 / (1 + exp(-2 * x / u0))``.
    """
    # Scale out of place: an in-place ``x *= ...`` would leave the caller's
    # array scaled (but not activated) and fails outright on integer arrays.
    z = x * (2 / u0)
    # Overflow-safe form: both exponentials only ever see non-positive
    # arguments, so neither branch of the sigmoid can overflow.
    return cp.exp(cp.minimum(z, 0)) / (1 + cp.exp(-cp.abs(z)))

def dsigmoid(x):
    """Return ``x * (1 - x)``, the logistic derivative written in terms of its output.

    The layers in this file call it with the stored activation output, not
    with the pre-activation. The ``2 / u0`` slope factor applied inside
    ``sigmoid_`` is not included here.
    """
    complement = 1 - x
    return x * complement

# Attach the derivative to the activation function itself so that layers,
# which only hold a reference to the activation as `f`, can reach it as `f.d`.
sigmoid_.d = dsigmoid

class ED_Linear:
    """Signed linear map between two-population activations.

    ``weight`` has shape ``(2, 2, in_, out_)`` and ``bias`` has shape
    ``(2, 1, out_)`` when drawn by the constructor. ``weight[p, q]`` connects
    input population ``q`` to output population ``p``. The two populations
    are presumably the excitatory/inhibitory pair mentioned in the article
    text.
    """

    def __init__(self, in_, out_, *, w=None, b=None):
        """Create the layer, drawing random parameters unless they are given.

        Args:
            in_: Number of input features.
            out_: Number of output features.
            w: Optional weight array, expected to match the default shape
                ``(2, 2, in_, out_)``. It is stored by reference and
                modified in place (see below), not copied.
            b: Optional bias array, expected to match the default shape
                ``(2, 1, out_)``. It is stored by reference.
        """
        if w is None:
            w = abs(cp.random.randn(2, 2, in_, out_))
        if b is None:
            b = cp.random.rand(2, 1, out_)
        self.weight = w
        self.bias = b
        # Flip the sign of every connection that originates from input
        # population 1. This is done in place, so a caller-supplied ``w``
        # (for example a view into a larger tensor) is changed as well.
        self.weight[:, 1] *= -1
        # Output sign per population: +1 for population 0, -1 for population 1.
        self.ope = cp.array([[[1]], [[-1]]])

    def forward(self, x):
        """Apply the layer.

        Args:
            x: Array whose leading axis indexes the two populations and whose
                last axis has length ``in_`` (``ED.forward`` builds it as
                ``(2, batch, in_)``).

        Returns:
            Array of shape ``(2, batch, out_)``.
        """
        # Broadcasting pairs x[q] with weight[p, q], giving (p, q, batch, out).
        per_source = x @ self.weight
        # Add up the contributions of both input populations.
        combined = per_source.sum(1)
        return (combined + self.bias) * self.ope

    def __call__(self, x):
        """Alias for :meth:`forward`."""
        return self.forward(x)

class ED_IOLayer:
    """One ``ED_Linear`` followed by an activation, with its own update rule.

    Used in this file for both the input-side and the output-side layer of
    ``ED``.
    """

    def __init__(self, in_, out_, alpha=1., f=sigmoid_):
        """Build the layer.

        Args:
            in_: Number of input features.
            out_: Number of output features.
            alpha: Step size applied to every parameter update.
            f: Activation function. Its derivative must be reachable as
                ``f.d`` and is evaluated on the activation output.
        """
        self.alpha = alpha
        self.f = f
        self.cells = ED_Linear(in_, out_)

    def __call__(self, x):
        """Alias for :meth:`forward`."""
        return self.forward(x)

    def forward(self, x):
        """Apply the layer and remember input and output for ``update``.

        Args:
            x: Input array; ``update`` indexes it as
                ``(population, batch, in_)``.

        Returns:
            The activated output, also stored as ``self.y``.
        """
        self.x = x
        self.y = self.f(self.cells(self.x))
        return self.y

    def update(self, d):
        """Adjust weights and bias in place from a per-sample signal.

        Must be called after ``forward``; it uses the stored ``self.x`` and
        ``self.y``.

        Args:
            d: 1-D array with one value per batch sample. The same value is
                applied to both populations and to every output unit.
        """
        # Per-sample bias step: d[b] * f'(y[p, b, o]).
        db = cp.einsum("b,pbo->pbo", d, self.f.d(self.y))
        batch = db.shape[1]
        # Batch-mean of db[p, b, o] * x[q, b, i]. Contracting over ``b``
        # inside einsum avoids materializing the (p, q, batch, in, out)
        # tensor only to average it away.
        dw = cp.einsum("pbo,qbi->pqio", db, self.x) / batch
        self.cells.weight += self.alpha * dw
        self.cells.bias += self.alpha * db.mean(1, keepdims=True)

class ED_HiddenLayer:
    """Stack of ``depth`` equal-width ``ED_Linear`` layers updated in one shot.

    All layer parameters live in two stacked tensors, ``weights`` with shape
    ``(depth, 2, 2, width, width)`` and ``biases`` with shape
    ``(depth, 2, 1, width)``. Each ``ED_Linear`` in ``layers`` is handed one
    slice of these tensors, so ``update`` can adjust every layer with a
    single batched operation.
    """

    def __init__(self, width, depth, alpha=1., f=sigmoid_):
        """Build the stack.

        Args:
            width: Feature width of every hidden layer (input and output).
            depth: Number of stacked layers.
            alpha: Step size applied to every parameter update.
            f: Activation function. Its derivative must be reachable as
                ``f.d`` and is evaluated on the activation output.
        """
        self.weights = abs(cp.random.randn(depth, 2, 2, width, width))
        self.biases = cp.random.rand(depth, 2, 1, width)
        self.alpha = alpha
        self.f = f
        # Iterating the stacked tensors yields per-layer slices; ED_Linear
        # keeps them by reference (and negates part of each weight slice in
        # place), so the layers are expected to share storage with
        # ``self.weights`` / ``self.biases``.
        self.layers = [ED_Linear(width, width, w=w, b=b) for w, b in zip(self.weights, self.biases)]

    def __call__(self, x):
        """Alias for :meth:`forward`."""
        return self.forward(x)

    def forward(self, x):
        """Run the input through every layer in order.

        The input and each layer's activated output are kept in ``self.x``
        (``depth + 1`` entries stacked along a new leading axis) for
        ``update``.

        Args:
            x: Input array; ``update`` indexes the stacked result as
                ``(layer, population, batch, width)``.

        Returns:
            The output of the last layer.
        """
        activations = [x]
        for layer in self.layers:
            x = self.f(layer(x))
            activations.append(x)
        # Stack explicitly instead of relying on asarray to accept a list of
        # device arrays.
        self.x = cp.stack(activations)
        return x

    def update(self, d):
        """Adjust the parameters of all layers in place from a per-sample signal.

        Must be called after ``forward``.

        Args:
            d: 1-D array with one value per batch sample. The same value is
                applied to every layer, population and unit.
        """
        # self.x[1:] holds each layer's output, self.x[:-1] each layer's input.
        db = cp.einsum("b,dpbo->dpbo", d, self.f.d(self.x[1:]))
        batch = db.shape[2]
        # Batch-mean of db[d, p, b, o] * x[d, q, b, i]. Contracting over ``b``
        # inside einsum avoids building the (depth, 2, 2, batch, width, width)
        # tensor, which is ``batch`` times larger than the weights.
        dw = cp.einsum("dpbo,dqbi->dpqio", db, self.x[:-1]) / batch
        # Keep these as in-place ``+=``: rebinding would detach the per-layer
        # slices held by ``self.layers`` from the updated values.
        self.weights += self.alpha * dw
        self.biases += self.alpha * db.mean(2, keepdims=True)

class ED:
    """Three-stage network: input layer, hidden stack, single-unit output layer."""

    def __init__(self, in_, hidden_width, hidden_depth=1, alpha=.8):
        """Build the network.

        Args:
            in_: Number of input features.
            hidden_width: Width of the hidden stack (and of the input
                layer's output).
            hidden_depth: Number of layers in the hidden stack.
            alpha: Step size passed to the input layer and the hidden stack.
        """
        self.layers = [
            ED_IOLayer(in_, hidden_width, alpha, sigmoid_),
            ED_HiddenLayer(hidden_width, hidden_depth, alpha, sigmoid_),
            # NOTE(review): ``alpha`` is not forwarded here, so the output
            # layer runs with ED_IOLayer's default step size of 1.0.
            # Confirm this is intentional.
            ED_IOLayer(hidden_width, 1, f=sigmoid_),
        ]

    def __call__(self, x):
        """Alias for :meth:`forward`."""
        return self.forward(x)

    def forward(self, x):
        """Compute the network output for a batch.

        Args:
            x: Batch of inputs, one row per sample. Host or device arrays
                (or anything ``cp.asarray`` accepts) are allowed.

        Returns:
            Host (NumPy) array with one value per sample: population 0 of
            the single output unit.
        """
        # Move to the device first, then duplicate along a new leading axis
        # so both populations start from the same input.
        x = cp.asarray(x)[None].repeat(2, 0)
        for layer in self.layers:
            x = layer(x)
        return x[0, :, 0].get()

    def update(self, d):
        """Update every layer from a per-sample signal.

        Args:
            d: 1-D array with one value per sample of the last ``forward``
                call. Host arrays are accepted, so a signal computed from
                the NumPy result of ``forward`` can be passed directly.
        """
        # forward() hands back a host array, but the layers' einsum calls
        # need device arrays. Convert once here, not in every layer.
        d = cp.asarray(d)
        for layer in self.layers:
            layer.update(d)
``````
MNIST 0/1テスト
``````python
import torch
import torchvision

# tqdm is used only for the progress bar and is not imported in this snippet.
# Import it here, and fall back to a plain pass-through if it is unavailable.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        return iterable

# Each image is flattened and divided by 255 as it is read from the dataset.
train_dataset = torchvision.datasets.MNIST(root='./MNIST',
                                           train=True,
                                           transform=lambda x: cp.array(x).ravel()/255.,
                                           download=True)
test_dataset = torchvision.datasets.MNIST(root='./MNIST',
                                          train=False,
                                          transform=lambda x: cp.array(x).ravel()/255.,
                                          download=True)

# Keep only the digits 0 and 1 so the single output unit can act as a
# binary classifier.
sub_train_dataset = [(img, label) for img, label in train_dataset if label in (0, 1)]
sub_test_dataset = [(img, label) for img, label in test_dataset if label in (0, 1)]
train_images, train_labels = zip(*sub_train_dataset)
train_images, train_labels = cp.asarray(train_images), cp.asarray(train_labels)
test_images, test_labels = zip(*sub_test_dataset)
test_images, test_labels = cp.asarray(test_images), cp.asarray(test_labels)

# batch_size must be defined before it is used to slice the shuffled indices.
batch_size = 64
idxs = np.random.choice(range(len(train_images)), len(train_images), replace=False)
idxs = [idxs[start:start + batch_size] for start in range(0, len(idxs), batch_size)]

# 28*28 inputs, hidden width 32, 4096 stacked hidden layers.
ed = ED(28*28, 32, 4096)

ts = []
for i, idx in enumerate(tqdm(idxs)):
    images, labels = train_images[idx], train_labels[idx]
    # ed(...) returns a host (NumPy) array; move it back to the device before
    # combining it with the CuPy labels, since CuPy does not mix the two.
    outputs = cp.asarray(ed(images))
    err = labels - outputs
    ed.update(err)
    if i % 50 == 0:
        # Periodically report accuracy on the held-out 0/1 test images.
        outputs = cp.asarray(ed(test_images))
        correct = int((outputs.round() == test_labels).sum())
        print(f"{(correct/len(test_labels))*100:.02f}%")
``````

# モデルの構造

バイアステンソルのサイズは`（隠れ層の数, 2（興奮性/抑制性）, 1（バッチ軸へのブロードキャスト用）, 出力サイズ）`です。

# おわりに

パラメータ数が非常に多いLLMをED法で学習できたら革命が起こりそうですね。。。

