# coding: utf-8
import numpy as np
import random
np.random.seed(0)
# Read the training corpus from sample.txt.
with open('sample.txt', 'r') as f:
    data = f.read()
# Sort so the char<->index mapping is deterministic across runs.
chars = sorted(set(data))
data_size, chars_size = len(data), len(chars)
H_size = 100
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def dsigmoid(y):
return y * (1 - y)
def tanh(x):
return np.tanh(x)
def dtanh(y):
return 1 - y * y
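# Note: dsigmoid and dtanh take the *activated* output y, not the raw input,
# i.e. sigmoid'(x) = y * (1 - y) and tanh'(x) = 1 - y**2 with y = sigmoid(x)
# (resp. y = tanh(x)). This matches how they are called in the backward pass.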
def softmax(x):
    # Shift by the max for numerical stability; the result is unchanged.
    e = np.exp(x - np.max(x))
    return e / np.sum(e)
class RMSprop:
    """RMSprop: scale each step by a running average of squared gradients."""
    def __init__(self, alpha=0.001, gamma=0.99):
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # decay rate of the running average
        self.r = None
def update(self, params, grads):
if self.r is None:
self.r = {}
for key, val in params.items():
self.r[key] = np.zeros_like(val)
        for key in params.keys():
            # r <- gamma * r + (1 - gamma) * g^2;  p <- p - alpha * g / (sqrt(r) + eps)
            self.r[key] *= self.gamma
            self.r[key] += (1 - self.gamma) * grads[key] * grads[key]
            params[key] -= self.alpha * grads[key] / (np.sqrt(self.r[key]) + 1e-7)
class SGD:
def __init__(self, lr=0.01):
self.lr = lr
def update(self, params, grads):
for key in params.keys():
params[key] -= self.lr * grads[key]
class Adam:
"""Adam (http://arxiv.org/abs/1412.6980v8)"""
def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
self.lr = lr
self.beta1 = beta1
self.beta2 = beta2
self.iter = 0
self.m = None
self.v = None
def update(self, params, grads):
if self.m is None:
self.m, self.v = {}, {}
for key, val in params.items():
self.m[key] = np.zeros_like(val)
self.v[key] = np.zeros_like(val)
self.iter += 1
lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)
        for key in params.keys():
            # Incremental form of the usual moment EMAs:
            #   m <- beta1 * m + (1 - beta1) * g
            #   v <- beta2 * v + (1 - beta2) * g^2
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])
            # The bias corrections for m and v are folded into lr_t above.
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)
class LSTM:
    def __init__(self, input_num, neuron_num):
        np.random.seed(0)
        H = neuron_num  # Number of neurons in the LSTM hidden layer
        D = input_num   # Input dimension == number of items in the vocabulary
        Z = H + D       # The previous hidden state is concatenated with the input
# Input weights
self.W_f = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
self.W_i = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
self.W_a = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
self.W_o = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
self.W_y = np.random.randn(H, D).astype(np.float32) / np.sqrt(D / 2.)
# Bias values
self.b_f = np.zeros((1, H)).astype(np.float32)
self.b_i = np.zeros((1, H)).astype(np.float32)
self.b_a = np.zeros((1, H)).astype(np.float32)
self.b_o = np.zeros((1, H)).astype(np.float32)
self.b_y = np.zeros((1, D)).astype(np.float32)
        # # Peephole weights (unused in this implementation)
        # self.p_f = np.zeros(1)
        # self.p_i = np.zeros(1)
        # self.p_o = np.zeros(1)
# delta weight
self.dW_f = np.zeros((Z, H)).astype(np.float32)
self.dW_i = np.zeros((Z, H)).astype(np.float32)
self.dW_a = np.zeros((Z, H)).astype(np.float32)
self.dW_o = np.zeros((Z, H)).astype(np.float32)
self.dW_y = np.zeros((H, D)).astype(np.float32)
# delta bias
self.db_f = np.zeros((1, H)).astype(np.float32)
self.db_i = np.zeros((1, H)).astype(np.float32)
self.db_a = np.zeros((1, H)).astype(np.float32)
self.db_o = np.zeros((1, H)).astype(np.float32)
self.db_y = np.zeros((1, D)).astype(np.float32)
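        # Gradients live on the instance so that bw() can accumulate them over
        # every time step of a sequence before a single update_grad() call.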
        # Hidden outputs per time step (index 0 is the initial state)
        self.h = [0]
        # Gradient w.r.t. the cell state, carried to the previous time step
        self.dc = [0]
        # Gradient w.r.t. the hidden state, carried to the previous time step
        self.dh = [0]
        self.input_num = input_num
        self.H = H  # hidden size, used to slice dh out of dx in bw()
def clear_parameters(self):
self.h = [0]
self.dc = [0]
self.dh = [0]
# delta weight
self.dW_f *= 0
self.dW_i *= 0
self.dW_a *= 0
self.dW_o *= 0
self.dW_y *= 0
# delta bias
self.db_f *= 0
self.db_i *= 0
self.db_a *= 0
self.db_o *= 0
self.db_y *= 0
    def fw(self, x, h_prev, c_prev):
        # Concatenate the previous hidden state with the input, so each gate
        # needs only one weight matrix over z = [h_prev, x]
        x = np.hstack((h_prev, x))
# Forget gate
_f = (x @ self.W_f) + self.b_f
f = sigmoid(_f)
# Input gate
_i = (x @ self.W_i) + self.b_i
i = sigmoid(_i)
        # Candidate cell state
        _a = (x @ self.W_a) + self.b_a
        a = tanh(_a)
# Current cell state
c = (c_prev * f) + (i * a)
# Output gate
_o = (x @ self.W_o) + self.b_o
o = sigmoid(_o)
        # Hidden state: gated, squashed cell state
        h = o * tanh(c)
self.h.append(h)
# Output for target
_y = (h @ self.W_y) + self.b_y
y = softmax(_y)
# return value
cache = (h, c, f, i, a, o, x)
return y, cache
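    # Forward equations implemented above, with z = [h_prev, x]:
    #   f = sigmoid(z W_f + b_f)    i = sigmoid(z W_i + b_i)
    #   a = tanh(z W_a + b_a)       o = sigmoid(z W_o + b_o)
    #   c = f * c_prev + i * a      h = o * tanh(c)
    #   y = softmax(h W_y + b_y)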
    def bw(self, y, target_index, cache, c_prev):
        # Unpack the forward-pass cache
        h, c, f, i, a, o, x = cache
# Output
dy = np.copy(y)
dy[0, target_index] -= 1
_dh = dy @ self.W_y.T
dh = (_dh + self.dh).astype(np.float32)
        # Cell state gradient (recurrent term plus the path through h)
        dc = self.dc + dh * o * (1 - np.tanh(c) ** 2)
        self.dc = dc * f  # carried backwards to the previous time step
# Gates
da = dc * i * dtanh(a)
di = dc * a * dsigmoid(i)
df = dc * c_prev * dsigmoid(f)
do = dh * tanh(c) * dsigmoid(o)
dx_f = df @ self.W_f.T
dx_i = di @ self.W_i.T
dx_a = da @ self.W_a.T
dx_o = do @ self.W_o.T
dx = dx_f + dx_i + dx_a + dx_o
        # The first H columns of dx flow back into h_prev
        self.dh = dx[:, :self.H]
# delta weight
self.dW_f += x.T @ df
self.dW_i += x.T @ di
self.dW_a += x.T @ da
self.dW_o += x.T @ do
self.dW_y += h.T @ dy
        # delta bias (sum over the batch axis, keeping the (1, H) shape)
        self.db_f += np.sum(df, axis=0)
        self.db_i += np.sum(di, axis=0)
        self.db_a += np.sum(da, axis=0)
        self.db_o += np.sum(do, axis=0)
        self.db_y += np.sum(dy, axis=0)
        # Cross-entropy loss against the one-hot target
        loss = -np.log(y[0, target_index])
        return loss
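    # Two recurrent gradient paths feed the next (earlier) bw() call:
    # self.dh carries dL/dh through the [h_prev, x] concatenation, and
    # self.dc carries dL/dc through the cell state via dc_prev = dc * f.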
def update_grad(self, lr=0.1):
params = {}
params["W_f"] = self.W_f
params["W_i"] = self.W_i
params["W_a"] = self.W_a
params["W_o"] = self.W_o
params["W_y"] = self.W_y # Output
params["b_f"] = self.b_f
params["b_i"] = self.b_i
params["b_a"] = self.b_a
params["b_o"] = self.b_o
params["b_y"] = self.b_y # Output
grads = {}
grads["W_f"] = self.dW_f
grads["W_i"] = self.dW_i
grads["W_a"] = self.dW_a
grads["W_o"] = self.dW_o
grads["W_y"] = self.dW_y # Output
grads["b_f"] = self.db_f
grads["b_i"] = self.db_i
grads["b_a"] = self.db_a
grads["b_o"] = self.db_o
grads["b_y"] = self.db_y # Output
        # A fresh optimizer is built on every call, so stateful optimizers
        # (Adam, RMSprop) would lose their running averages here; plain SGD
        # is unaffected.
        optimizer = SGD(lr=lr)
        # optimizer = Adam(lr=lr)
        # optimizer = RMSprop(alpha=0.001)
        optimizer.update(params, grads)
        # The reassignments below are redundant (update() mutates the arrays
        # in place) but are kept to make the data flow explicit.
        self.W_f = params["W_f"]
        self.W_i = params["W_i"]
        self.W_a = params["W_a"]
        self.W_o = params["W_o"]
        self.W_y = params["W_y"]  # Output
        self.b_f = params["b_f"]
        self.b_i = params["b_i"]
        self.b_a = params["b_a"]
        self.b_o = params["b_o"]
        self.b_y = params["b_y"]  # Output
# Training
T_steps = 10        # safety margin kept free at the end of the corpus
sentence_size = 10  # characters per training sequence (truncated-BPTT length)
h_size = 100        # hidden-layer size
learning_rate = 0.1
epochs = 300
loss_list = []
# Create the LSTM network
nn = LSTM(chars_size, h_size)
for epoch in range(epochs):
    # Pick a random window into the corpus
    pointer = np.random.randint(0, data_size - sentence_size - T_steps + 1)
    end_pointer = pointer + sentence_size
    # Map the input string to indices
    inputs = [char_to_idx[ch] for ch in data[pointer: end_pointer]]
    # Map the target (next-character) string to indices
    targets = [char_to_idx[ch] for ch in data[pointer + 1: end_pointer + 1]]
# Initialize
Y = []
cache = []
loss = 0
h_prev = np.zeros((1, h_size)).astype(np.float32)
c_prev = np.zeros((1, h_size)).astype(np.float32)
    # Forward pass: collect per-step caches
for t in range(len(inputs)):
# One hot vector
x = np.zeros((1, chars_size)).astype(np.float32)
x[0][inputs[t]] = 1
        # Predict step
        y, _cache = nn.fw(x, h_prev, c_prev)
        # _cache: (h, c, f, i, a, o, x)
        h_prev = _cache[0]
        c_prev = _cache[1]
        cache.append(_cache)
        Y.append(y)
    # Backward pass: walk the sequence in reverse (truncated BPTT)
    for idx in reversed(range(len(targets))):
        # cache[idx]: (h, c, f, i, a, o, x)
        if idx == 0:
            c_prev = np.zeros((1, h_size), dtype=np.float32)
        else:
            # Cell state from the previous time step
            c_prev = cache[idx - 1][1]
        # Accumulate loss and gradients for this time step
        loss += nn.bw(Y[idx], targets[idx], cache[idx], c_prev)
# Update grads
nn.update_grad(lr=learning_rate)
# Initialize delta gates and variables
nn.clear_parameters()
loss /= len(targets)
print("epoch:" + str(epoch) + " loss:" + str(loss))
loss_list.append(loss)
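# A minimal sampling sketch (not part of the original script): after training,
# feed the network its own predictions to generate text. It reuses the `nn`,
# `h_size`, `chars_size`, `char_to_idx`, and `idx_to_char` objects from above.
def sample_text(seed_char, length=100):
    h_prev = np.zeros((1, h_size), dtype=np.float32)
    c_prev = np.zeros((1, h_size), dtype=np.float32)
    x = np.zeros((1, chars_size), dtype=np.float32)
    x[0, char_to_idx[seed_char]] = 1
    out = [seed_char]
    for _ in range(length):
        y, _cache = nn.fw(x, h_prev, c_prev)
        h_prev, c_prev = _cache[0], _cache[1]
        # Sample the next character from the predicted distribution
        idx = np.random.choice(chars_size, p=y.ravel())
        out.append(idx_to_char[idx])
        x = np.zeros((1, chars_size), dtype=np.float32)
        x[0, idx] = 1
    return ''.join(out)

# Example: print(sample_text('T', length=200))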
sample.txt (training corpus):
The Great Gatsby, by F. Scott Fitzgerald
Chapter 1
In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in my mind ever since.
“Whenever you feel like criticizing any one,” he told me, “just remember that all the people in this world haven’t had the advantages that you’ve had.”
He didn’t say any more, but we’ve always been unusually communicative in a reserved way, and I understood that he meant a great deal more than that. In consequence, I’m inclined to reserve all judgments, a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores. The abnormal mind is quick to detect and attach itself to this quality when it appears in a normal person, and so it came about that in college I was unjustly accused of being a politician, because I was privy to the secret griefs of wild, unknown men. Most of the confidences were unsought — frequently I have feigned sleep, preoccupation, or a hostile levity when I realized by some unmistakable sign that an intimate revelation was quivering on the horizon; for the intimate revelations of young men, or at least the terms in which they express them, are usually plagiaristic and marred by obvious suppressions. Reserving judgments is a matter of infinite hope. I am still a little afraid of missing something if I forget that, as my father snobbishly suggested, and I snobbishly repeat, a sense of the fundamental decencies is parcelled out unequally at birth.
And, after boasting this way of my tolerance, I come to the admission that it has a limit. Conduct may be founded on the hard rock or the wet marshes, but after a certain point I don’t care what it’s founded on. When I came back from the East last autumn I felt that I wanted the world to be in uniform and at a sort of moral attention forever; I wanted no more riotous excursions with privileged glimpses into the human heart. Only Gatsby, the man who gives his name to this book, was exempt from my reaction — Gatsby, who represented everything for which I have an unaffected scorn. If personality is an unbroken series of successful gestures, then there was something gorgeous about him, some heightened sensitivity to the promises of life, as if he were related to one of those intricate machines that register earthquakes ten thousand miles away. This responsiveness had nothing to do with that flabby impressionability which is dignified under the name of the “creative temperament.”— it was an extraordinary gift for hope, a romantic readiness such as I have never found in any other person and which it is not likely I shall ever find again. No — Gatsby turned out all right at the end; it is what preyed on Gatsby, what foul dust floated in the wake of his dreams that temporarily closed out my interest in the abortive sorrows and short-winded elations of men.