LSTM (full scratch), but it cannot converge to a minimum.

Posted at 2018-04-26

# coding: utf-8

import numpy as np
import random
np.random.seed(0)

# Read from sample text.
data = open('sample.txt', 'r').read()
chars = list(set(data))
data_size, chars_size = len(data), len(chars)
H_size = 100  # Hidden state size (also referenced inside LSTM.bw below)
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}



def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def dsigmoid(y):
    return y * (1 - y)


def tanh(x):
    return np.tanh(x)


def dtanh(y):
    return 1 - y * y


def softmax(x):
    # Subtract the max before exponentiating for numerical stability.
    e = np.exp(x - np.max(x))
    return e / np.sum(e)



class RMSprop:
    """RMSprop"""

    def __init__(self, alpha=0.001, gamma=0.99):
        self.alpha = alpha

        # decay rate
        self.gamma = gamma
        self.r = None

    def update(self, params, grads):

        if self.r is None:
            self.r = {}
            for key, val in params.items():
                self.r[key] = np.zeros_like(val)

        for key in params.keys():
            self.r[key] *= self.gamma
            self.r[key] += (1 - self.gamma) * grads[key] * grads[key]
            params[key] -= self.alpha * grads[key] / (np.sqrt(self.r[key]) + 1e-7)




class SGD:

    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key]



class Adam:
    """Adam (http://arxiv.org/abs/1412.6980v8)"""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for key in params.keys():
            # self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key]
            # self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2)
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])

            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)

            # unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias
            # unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias
            # params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7)
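
# Tiny usage sketch (illustration only, not part of the original training flow):
# the three optimizers above share the same update(params, grads) interface; both
# dicts must use the same keys and the parameter arrays are modified in place, e.g.:
#
#     W = np.random.randn(3, 3).astype(np.float32)
#     dW = np.ones_like(W)
#     opt = RMSprop(alpha=0.001)
#     opt.update({"W": W}, {"W": dW})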




class LSTM():

    def __init__(self, input_num, neuron_num):
        np.random.seed(0)

        H = neuron_num  # Number of neurons in the LSTM hidden layer (hidden state size)
        D = input_num  # Number of input dimension == number of items in vocabulary
        Z = H + D  # Because we will concatenate LSTM state with the input

        # Input weights
        self.W_f = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
        self.W_i = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
        self.W_a = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
        self.W_o = np.random.randn(Z, H).astype(np.float32) / np.sqrt(D / 2.)
        self.W_y = np.random.randn(H, D).astype(np.float32) / np.sqrt(D / 2.)

        # Bias values
        self.b_f = np.zeros((1, H)).astype(np.float32)
        self.b_i = np.zeros((1, H)).astype(np.float32)
        self.b_a = np.zeros((1, H)).astype(np.float32)
        self.b_o = np.zeros((1, H)).astype(np.float32)
        self.b_y = np.zeros((1, D)).astype(np.float32)

        #         # Peephole values :shape(N,)
        #         self.p_f = np.zeros(1)
        #         self.p_i = np.zeros(1)
        #         self.p_o = np.zeros(1)

        # delta weight
        self.dW_f = np.zeros((Z, H)).astype(np.float32)
        self.dW_i = np.zeros((Z, H)).astype(np.float32)
        self.dW_a = np.zeros((Z, H)).astype(np.float32)
        self.dW_o = np.zeros((Z, H)).astype(np.float32)
        self.dW_y = np.zeros((H, D)).astype(np.float32)

        # delta bias
        self.db_f = np.zeros((1, H)).astype(np.float32)
        self.db_i = np.zeros((1, H)).astype(np.float32)
        self.db_a = np.zeros((1, H)).astype(np.float32)
        self.db_o = np.zeros((1, H)).astype(np.float32)
        self.db_y = np.zeros((1, D)).astype(np.float32)

        # Output (Initialized by 0)
        self.h = [0]

        # Delta cell state
        self.dc = [0]

        # Delta out
        self.dh = [0]

        self.input_num = input_num

    def clear_parameters(self):
        self.h = [0]
        self.dc = [0]
        self.dh = [0]

        # delta weight
        self.dW_f *= 0
        self.dW_i *= 0
        self.dW_a *= 0
        self.dW_o *= 0
        self.dW_y *= 0

        # delta bias
        self.db_f *= 0
        self.db_i *= 0
        self.db_a *= 0
        self.db_o *= 0
        self.db_y *= 0

    def fw(self, x, h_prev, c_prev):

        x = np.hstack((h_prev, x))

        # Forget gate
        _f = (x @ self.W_f) + self.b_f
        f = sigmoid(_f)

        # Input gate
        _i = (x @ self.W_i) + self.b_i
        i = sigmoid(_i)

        # Input condition
        _a = (x @ self.W_a) + self.b_a
        a = tanh(_a)

        # Current cell state
        c = (c_prev * f) + (i * a)

        # Output gate
        _o = (x @ self.W_o) + self.b_o
        o = sigmoid(_o)

        # Output for cell
        h = o * tanh(c)
        self.h.append(h)

        # Output for target
        _y = (h @ self.W_y) + self.b_y
        y = softmax(_y)

        # return value
        cache = (h, c, f, i, a, o, x)
        return y, cache
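
    # For reference, fw() above computes the standard LSTM step on z = [h_prev, x]:
    #   f = sigmoid(z W_f + b_f)     forget gate
    #   i = sigmoid(z W_i + b_i)     input gate
    #   a = tanh(z W_a + b_a)        candidate cell state
    #   c = f * c_prev + i * a       new cell state
    #   o = sigmoid(z W_o + b_o)     output gate
    #   h = o * tanh(c)              new hidden state
    #   y = softmax(h W_y + b_y)     predicted character distribution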


    def bw(self, y, target_index, cache, c_prev):

        h = cache[0]
        c = cache[1]
        f = cache[2]
        i = cache[3]
        a = cache[4]
        o = cache[5]
        x = cache[6]

        # Output
        dy = np.copy(y)
        dy[0, target_index] -= 1

        _dh = dy @ self.W_y.T
        dh = (_dh + self.dh).astype(np.float32)

        # Cell state
        dc = self.dc + dh * o * (1 - np.tanh(c) ** 2)
        self.dc = dc * f  # For next parameter

        # Gates
        da = dc * i * dtanh(a)
        di = dc * a * dsigmoid(i)
        df = dc * c_prev * dsigmoid(f)
        do = dh * tanh(c) * dsigmoid(o)

        dx_f = df @ self.W_f.T
        dx_i = di @ self.W_i.T
        dx_a = da @ self.W_a.T
        dx_o = do @ self.W_o.T

        dx = dx_f + dx_i + dx_a + dx_o
        self.dh = dx[:, :H_size]

        # delta weight
        self.dW_f += x.T @ df
        self.dW_i += x.T @ di
        self.dW_a += x.T @ da
        self.dW_o += x.T @ do
        self.dW_y += h.T @ dy

        # delta bias (sum over the batch axis so the (1, H) / (1, D) shapes are kept)
        self.db_f += np.sum(df, axis=0, keepdims=True)
        self.db_i += np.sum(di, axis=0, keepdims=True)
        self.db_a += np.sum(da, axis=0, keepdims=True)
        self.db_o += np.sum(do, axis=0, keepdims=True)
        self.db_y += np.sum(dy, axis=0, keepdims=True)

        # Cross-entropy loss against the supervised one-hot target
        t = np.zeros(self.input_num)
        t[target_index] = 1
        loss = np.sum(-np.log(y) * t)
        return loss
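
    # Note: bw() above is called once per time step, from the last step back to the
    # first. self.dh and self.dc carry the hidden-state and cell-state gradients into
    # the next (earlier) call, and the dW_*/db_* buffers accumulate gradients over
    # the whole sequence until update_grad() applies them.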

    def update_grad(self, lr=0.1):

        params = {}
        params["W_f"] = self.W_f
        params["W_i"] = self.W_i
        params["W_a"] = self.W_a
        params["W_o"] = self.W_o
        params["W_y"] = self.W_y  # Output

        params["b_f"] = self.b_f
        params["b_i"] = self.b_i
        params["b_a"] = self.b_a
        params["b_o"] = self.b_o
        params["b_y"] = self.b_y  # Output

        grads = {}
        grads["W_f"] = self.dW_f
        grads["W_i"] = self.dW_i
        grads["W_a"] = self.dW_a
        grads["W_o"] = self.dW_o
        grads["W_y"] = self.dW_y  # Output

        grads["b_f"] = self.db_f
        grads["b_i"] = self.db_i
        grads["b_a"] = self.db_a
        grads["b_o"] = self.db_o
        grads["b_y"] = self.db_y  # Output

        optimizer = SGD(lr=lr)  #
        # optimizer = Adam(lr=lr)  #
        # optimizer = RMSprop(alpha=0.001) #
        optimizer.update(params, grads)

        # The optimizers update the arrays in place, so these reassignments are redundant (kept for clarity).
        self.W_f = params["W_f"]
        self.W_i = params["W_i"]
        self.W_a = params["W_a"]
        self.W_o = params["W_o"]
        self.W_y = params["W_y"]  # Output

        self.b_f = params["b_f"]
        self.b_i = params["b_i"]
        self.b_a = params["b_a"]
        self.b_o = params["b_o"]
        self.b_y = params["b_y"]  # Output




# Training
input_size = chars_size
batch_size = 1
h = np.array([0])
c = np.array([0])
cache = []
sentences = []

loss = 0
loss_list = []

T_steps = 10
sentence_size = 10
h_size = 100
learning_rate = 0.1
epochs = 300

# Create instance from LSTM network
nn = LSTM(chars_size, h_size)

for epoch in range(epochs):

    # Pick a random starting position in the text (np.random.randint: high is exclusive)
    pointer = np.random.randint(0, data_size - sentence_size - T_steps)
    end_pointer = pointer + sentence_size

    # Change the input characters to indices
    inputs = ([char_to_idx[ch]
                for ch in data[pointer: end_pointer]])

    # Change the supervised (target) characters to indices (the input shifted by one)
    targets = ([char_to_idx[ch]
                for ch in data[pointer + 1: end_pointer + 1]])

    # Initialize
    Y = []
    cache = []
    loss = 0
    h_prev = np.zeros((1, h_size)).astype(np.float32)
    c_prev = np.zeros((1, h_size)).astype(np.float32)

    # Collect the forward-pass caches
    for t in range(len(inputs)):
        # One hot vector
        x = np.zeros((1, chars_size)).astype(np.float32)
        x[0][inputs[t]] = 1

        # Predict step
        y, _cache = nn.fw(x, h_prev, c_prev)

        # _cache: (h, c, f, i, a, o, x)
        h_prev = _cache[0]
        c_prev = _cache[1]

        cache.append(_cache)
        Y.append(y)
        # X.append(x)



    # Start backpropagation from the last time step
    idx = sentence_size - 1

    # Train step (backpropagation through time)
    for t in range(len(targets)):

        # cache: [idx][h, c, f, i, a, o, x]
        if idx == 0:
            c_prev = 0
        else:
            # Use the cell state from the previous time step
            c_prev = cache[idx - 1][1]

        # Accumulate loss and gradients, walking backwards through the sequence
        loss += nn.bw(Y[idx], targets[idx], cache[idx], c_prev)

        # print(loss);input()
        idx -= 1

    # Update grads
    nn.update_grad(lr=learning_rate)

    # Initialize delta gates and variables
    nn.clear_parameters()

    loss /= len(targets)
    print("epoch:" + str(epoch) + "  loss:" + str(loss))
    loss_list.append(loss)
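
# A minimal sampling sketch (not part of the original training flow above; the helper
# name sample_text is just illustrative): feed the network one seed character and
# repeatedly draw the next character from the softmax output. It uses nothing beyond
# nn.fw, char_to_idx and idx_to_char defined above.
def sample_text(nn, seed_char, length=100):
    h_prev = np.zeros((1, h_size)).astype(np.float32)
    c_prev = np.zeros((1, h_size)).astype(np.float32)
    idx = char_to_idx[seed_char]
    out = [seed_char]
    for _ in range(length):
        # One-hot encode the current character
        x = np.zeros((1, chars_size)).astype(np.float32)
        x[0][idx] = 1
        y, _cache = nn.fw(x, h_prev, c_prev)
        h_prev, c_prev = _cache[0], _cache[1]
        # Draw the next character index from the predicted distribution
        p = y.ravel().astype(np.float64)
        p /= p.sum()
        idx = np.random.choice(chars_size, p=p)
        out.append(idx_to_char[idx])
    return ''.join(out)

print(sample_text(nn, data[0], length=200))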





sample.txt

The Great Gatsby, by F. Scott Fitzgerald

Chapter 1

In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in my mind ever since.

“Whenever you feel like criticizing any one,” he told me, “just remember that all the people in this world haven’t had the advantages that you’ve had.”

He didn’t say any more, but we’ve always been unusually communicative in a reserved way, and I understood that he meant a great deal more than that. In consequence, I’m inclined to reserve all judgments, a habit that has opened up many curious natures to me and also made me the victim of not a few veteran bores. The abnormal mind is quick to detect and attach itself to this quality when it appears in a normal person, and so it came about that in college I was unjustly accused of being a politician, because I was privy to the secret griefs of wild, unknown men. Most of the confidences were unsought — frequently I have feigned sleep, preoccupation, or a hostile levity when I realized by some unmistakable sign that an intimate revelation was quivering on the horizon; for the intimate revelations of young men, or at least the terms in which they express them, are usually plagiaristic and marred by obvious suppressions. Reserving judgments is a matter of infinite hope. I am still a little afraid of missing something if I forget that, as my father snobbishly suggested, and I snobbishly repeat, a sense of the fundamental decencies is parcelled out unequally at birth.

And, after boasting this way of my tolerance, I come to the admission that it has a limit. Conduct may be founded on the hard rock or the wet marshes, but after a certain point I don’t care what it’s founded on. When I came back from the East last autumn I felt that I wanted the world to be in uniform and at a sort of moral attention forever; I wanted no more riotous excursions with privileged glimpses into the human heart. Only Gatsby, the man who gives his name to this book, was exempt from my reaction — Gatsby, who represented everything for which I have an unaffected scorn. If personality is an unbroken series of successful gestures, then there was something gorgeous about him, some heightened sensitivity to the promises of life, as if he were related to one of those intricate machines that register earthquakes ten thousand miles away. This responsiveness had nothing to do with that flabby impressionability which is dignified under the name of the “creative temperament.”— it was an extraordinary gift for hope, a romantic readiness such as I have never found in any other person and which it is not likely I shall ever find again. No — Gatsby turned out all right at the end; it is what preyed on Gatsby, what foul dust floated in the wake of his dreams that temporarily closed out my interest in the abortive sorrows and short-winded elations of men.
