80
71

# ED法の重みをGPUで並列更新したら3,000倍高速化した。

Posted at 2024-04-30

# GPUを使った重みの並列更新

`forward`で 3.02 s ± 491 ms

となりました（RTX3070）。
シーケンシャルな`forward`と比較し約3,000倍の速度という、圧倒的な並列計算の力を感じます。

`forward`で 363 ms ± 90.3 ms
`update`で 3.26 s ± 38.2 ms
`forward`の結果は逆転していますね。

```python
import numpy as np
import cupy as cp
# cp.cuda.set_allocator(cp.cuda.MemoryPool().malloc)

def sigmoid_(x, u0=.4):
    """Numerically stable sigmoid of ``x`` scaled by ``2 / u0``.

    Uses the identity sigmoid(z) = exp(min(z, 0)) / (1 + exp(-|z|)),
    which avoids overflow for large-magnitude inputs of either sign.

    Bug fix: the original scaled with ``x *= 2 / u0``, mutating the
    caller's array in place; the scaled value is now computed into a
    fresh array so arguments are left untouched.
    """
    x = x * (2 / u0)
    return cp.exp(cp.minimum(x, 0)) / (1 + cp.exp(-cp.abs(x)))

def dsigmoid(x):
    """Sigmoid derivative expressed in terms of the sigmoid *output* x."""
    return x * (1 - x)

# Attach the derivative so layers can call f.d(...) generically.
sigmoid_.d = dsigmoid

class ED_Linear:
    """Linear layer with paired positive/negative weight groups.

    ``weight`` has shape (2, 2, in_, out_); the two leading size-2 axes
    carry the excitatory/inhibitory pairing described in the article
    (cf. the bias tensor layout (2, 1, out_)).
    """

    def __init__(self, in_, out_, *, w=None, b=None):
        if w is None:
            w = abs(cp.random.randn(2, 2, in_, out_))
        if b is None:
            b = cp.random.rand(2, 1, out_)
        self.weight = w
        self.bias = b
        # Deliberate in-place negation of the second weight group.
        # NOTE: this also runs when ``w`` is caller-supplied —
        # ED_HiddenLayer passes shared storage and relies on the flip
        # being visible on its own array.
        self.weight[:, 1] *= -1
        # Fixed sign operator applied to the two output pathways.
        self.ope = cp.array([[[1]], [[-1]]])

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # x (2, batch, in_) @ weight (2, 2, in_, out_) broadcasts to
        # (2, 2, batch, out_); summing axis 1 merges the two input groups.
        pooled = (x @ self.weight).sum(1)
        return (pooled + self.bias) * self.ope

class ED_IOLayer:
    # Input/output layer: a single ED_Linear followed by activation f.
    # Caches its input and output in forward() for use by update().
    def __init__(self, in_, out_, alpha=1., f=sigmoid_):
        self.alpha = alpha  # learning-rate factor applied in update()
        self.f = f          # activation; must expose its derivative as f.d
        self.cells= ED_Linear(in_, out_)

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # Keep references to input and activated output for update().
        self.x = x
        self.y = self.f(self.cells(self.x))
        return self.y

    def update(self, d):
        # d: per-sample error, shape (batch,).
        # einsum axis labels: p/q = the two size-2 weight groups
        # (presumably excitatory/inhibitory — see the article's bias
        # description; TODO confirm), b = batch, i = input, o = output.
        db = cp.einsum("b,pbo->pbo", d,  self.f.d(self.y))
        dw = cp.einsum("pbo,qbi->pqbio", db, self.x)
        # ED-style update: add the batch-averaged error/activity product
        # directly to the parameters (no explicit gradient sign handling).
        self.cells.weight += self.alpha * dw.mean(2)
        self.cells.bias += self.alpha * db.mean(1, keepdims=True)

class ED_HiddenLayer:
    # Stack of `depth` hidden ED_Linear layers of equal width.
    # self.weights / self.biases hold the parameters of ALL layers in one
    # tensor; each ED_Linear below receives a view into that storage, so
    # the vectorized update() writes through to every layer at once.
    def __init__(self, width, depth, alpha=1., f=sigmoid_):
        self.weights = abs(cp.random.randn(depth, 2, 2, width, width))
        self.biases = cp.random.rand(depth, 2, 1, width)
        self.alpha = alpha  # learning-rate factor applied in update()
        self.f = f          # activation; must expose its derivative as f.d
        # NOTE: ED_Linear.__init__ flips w[:, 1] *= -1 in place, so
        # self.weights is sign-adjusted here as a construction side effect.
        self.layers = [ED_Linear(width, width, w=w, b=b) for w, b in zip(self.weights, self.biases)]

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # Record every layer's input plus the final output, so that
        # self.x[k] is the input to layer k and self.x[k+1] its output.
        self.x = []
        for layer in self.layers:
            self.x.append(x)
            x = self.f(layer(x))
        self.x.append(x)
        self.x = cp.asarray(self.x)  # stacked activations, leading axis depth+1
        return x

    def update(self, d):
        # Same ED update as ED_IOLayer, but vectorized over all layers at
        # once via the extra leading einsum axis "d" (depth).
        # self.x[1:] are the layer outputs, self.x[:-1] the matching inputs.
        db = cp.einsum("b,dpbo->dpbo", d,  self.f.d(self.x[1:]))
        dw = cp.einsum("dpbo,dqbi->dpqbio", db, self.x[:-1])
        # Writes through to each ED_Linear because the storage is shared.
        self.weights += self.alpha * dw.mean(3)
        self.biases += self.alpha * db.mean(2, keepdims=True)

class ED:
    """Complete ED network: input layer -> hidden stack -> 1-unit output."""

    def __init__(self, in_, hidden_width, hidden_depth=1, alpha=.8):
        input_layer = ED_IOLayer(in_, hidden_width, alpha, sigmoid_)
        hidden_stack = ED_HiddenLayer(hidden_width, hidden_depth, alpha, sigmoid_)
        output_layer = ED_IOLayer(hidden_width, 1, f=sigmoid_)  # default alpha
        self.layers = [input_layer, hidden_stack, output_layer]

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        # Duplicate the batch along a new leading axis of size 2 — one
        # copy per weight-group pathway expected by the layers.
        h = cp.asarray(x[None].repeat(2, 0))
        for layer in self.layers:
            h = layer(h)
        # Take pathway 0 of the single output unit and copy the result
        # back to host memory (.get() is the CuPy device->host transfer).
        return h[0, :, 0].get()

    def update(self, d):
        # Broadcast the same per-sample error to every layer; each layer
        # updates from its own cached activations, independent of the rest.
        for layer in self.layers:
            layer.update(d)
```
MNIST 0/1テスト
```python
import torch
import torchvision
# MNIST 0/1 binary-classification test for the ED network.
# NOTE(review): this listing uses `tqdm` below but never imports it;
# `from tqdm import tqdm` must be added alongside the torch imports.
train_dataset = torchvision.datasets.MNIST(root='./MNIST',
                                           train=True,
                                           transform=lambda x: cp.array(x).ravel()/255.,
                                           download=True)
test_dataset = torchvision.datasets.MNIST(root='./MNIST',
                                          train=False,
                                          transform=lambda x: cp.array(x).ravel()/255.,
                                          download=True)

# Keep only digits 0 and 1 so the single sigmoid output can classify them.
sub_train_dataset = [(img, label) for img, label in train_dataset if label in [0, 1]]
sub_test_dataset = [(img, label) for img, label in test_dataset if label in [0, 1]]
train_images, train_labels = zip(*sub_train_dataset)
train_images, train_labels = cp.asarray(train_images), cp.asarray(train_labels)
test_images, test_labels = zip(*sub_test_dataset)
test_images, test_labels = cp.asarray(test_images), cp.asarray(test_labels)

# Bug fix: batch_size was originally defined AFTER being used to build the
# batch index list below (NameError); it must be defined first.
batch_size = 64

# Shuffle the training set once, then slice the permutation into batches.
idxs = np.random.choice(range(len(train_images)), len(train_images), replace=False)
idxs = [idxs[batch_size*i:batch_size*(i+1)] for i in range(int(np.ceil(len(train_images)/batch_size)))]

ed = ED(28*28, 32, 4096)

for i, idx in enumerate(tqdm(idxs)):
    images, labels = train_images[idx], train_labels[idx]
    outputs = ed(images)
    err = labels - outputs  # ED feeds the raw error back, not a gradient
    ed.update(err)
    if i % 50 == 0:  # periodic accuracy check on the held-out 0/1 test set
        outputs = ed(test_images)
        correct = (outputs.round() == test_labels).sum()
        print(f"{(correct/len(test_labels))*100:.02f}%")
```

# モデルの構造

バイアステンソルのサイズは`（隠れ層の数, 2（興奮性/抑制性）, 1（計算の都合）, 出力サイズ）`です。

# おわりに

パラメータ数が非常に多いLLMをED法で学習できたら革命が起こりそうですね。。。

80
71
3

Register as a new user and use Qiita more conveniently

1. You get articles that match your needs
2. You can efficiently read back useful information
3. You can use dark theme
What you can do with signing up
80
71