LSTM for Kaggle (PredictFutureSales) の実装に関するメモ #Python

Reference

Data

def eda(data):
  """Print a quick textual overview of a pandas DataFrame.

  The report contains, in order: the first five rows, ``DataFrame.info()``,
  the column dtypes, per-column missing-value counts (printed twice, under
  two headings), and the frame's shape.

  Args:
    data: the pandas DataFrame to summarize.

  Returns:
    None. Everything is written to standard output.
  """
  print('----------Top-5- Record----------')
  print(data.head(5))
  print('-----------Information-----------')
  # DataFrame.info() writes its report itself and returns None, so wrapping
  # it in print() would append a stray "None" line to the output.
  data.info()
  print('-----------Data Types-----------')
  print(data.dtypes)
  print('----------Missing value-----------')
  print(data.isnull().sum())
  # isna() is the same check as isnull(); the section is kept so the printed
  # report keeps its established layout.
  print('----------Null value-----------')
  print(data.isna().sum())
  print('----------Shape of Data----------')
  print(data.shape)

def graph_insight(data):
  """Draw a 50-bin histogram for every float64/int64 column of ``data``.

  Args:
    data: pandas DataFrame; columns of any other dtype are left out.
  """
  numeric_dtypes = ['float64', 'int64']
  numeric_frame = data.select_dtypes(include=numeric_dtypes)
  numeric_frame.hist(bins=50, figsize=(10, 10))

# Textual summary and per-column histograms of the training frame.
eda(train)
graph_insight(train)

# Box plot of the daily sold counts; the x-axis limits are fixed to
# (-100, 3000) before the plot is drawn.
plt.figure(figsize=(5, 3))
plt.xlim(-100, 3000)
sns.boxplot(x=train['item_cnt_day'])
plt.show()

# Box plot of the item prices; the upper x-limit is the maximum price
# plus 10%.
plt.figure(figsize=(5, 3))
plt.xlim(train['item_price'].min(), train['item_price'].max() * 1.1)
sns.boxplot(x=train['item_price'])
plt.show()

# Load the competition CSV files.
train_org = pd.read_csv('/content/sales_train_v2.csv')
test_org = pd.read_csv('/content/test.csv')
submission = pd.read_csv('/content/sample_submission.csv')
items = pd.read_csv('/content/items.csv')
item_cats = pd.read_csv('/content/item_categories.csv')
shops = pd.read_csv('/content/shops.csv')

# Work on a copy so the frame as loaded stays untouched.
train_copy = train_org.copy()

# How many rows are exact duplicates (every column equal)?
print(train_copy.duplicated().value_counts())
print()

# How many rows repeat on the key columns below? Note that item_price is
# not part of this key.
subset = ['date', 'date_block_num', 'shop_id', 'item_id', 'item_cnt_day']
print(train_copy.duplicated(subset=subset).value_counts())
print()

# Drop the key-level duplicates and show the row count before and after.
train_copy2 = train_copy.drop_duplicates(subset=subset)
print(len(train_copy['item_id']))
print(len(train_copy2['item_id']))

# Outlier filters, currently disabled:
#train_copy2 = train_copy2[train_copy2.item_price<100000]
#train_copy2 = train_copy2[train_copy2.item_cnt_day<1001]

# Box plot of the daily sold counts after de-duplication; the x-axis limits
# are fixed to (-100, 3000) before the plot is drawn.
plt.figure(figsize=(5, 3))
plt.xlim(-100, 3000)
sns.boxplot(x=train_copy2['item_cnt_day'])
plt.show()

# Box plot of the item prices after de-duplication; the upper x-limit is the
# maximum price plus 10%.
plt.figure(figsize=(5, 3))
plt.xlim(train_copy2['item_price'].min(), train_copy2['item_price'].max() * 1.1)
sns.boxplot(x=train_copy2['item_price'])
plt.show()

# Sum item_cnt_day per (shop_id, item_id) pair and date_block_num: one row per
# pair, one column per date_block_num value, 0 where a pair has no sales.
# ``values`` is passed as a scalar rather than a one-element list so the
# resulting columns are single-level; a list would produce two-level columns
# that cannot be merged cleanly with the flat test frame below.
train_copy3 = train_copy2.pivot_table(index=['shop_id', 'item_id'],
                                      values='item_cnt_day',
                                      columns='date_block_num', fill_value=0,
                                      aggfunc='sum')
train_copy3 = train_copy3.reset_index()

# Work on a copy of the loaded test frame, mirroring train_copy above.
# NOTE(review): test_copy was never defined in the visible code; it is assumed
# to be an unmodified copy of test_org -- confirm.
test_copy = test_org.copy()

# Left-join so every test row is kept; pairs missing from the training pivot
# get NaN, which is then replaced by 0.
dataset = pd.merge(test_copy, train_copy3, on=['shop_id', 'item_id'], how='left')
dataset = dataset.fillna(0)

# Only the per-block columns remain after dropping the identifier columns.
dataset = dataset.drop(['ID', 'shop_id', 'item_id'], axis=1)

# x_data:    every column except the last  (model inputs for training)
# y_data:    the last column               (training target)
# test_data: every column except the first (inputs shifted forward by one)
x_data = dataset.values[:, :-1]
y_data = dataset.values[:, -1:]
test_data = dataset.values[:, 1:]
print(x_data.shape, y_data.shape, test_data.shape)

Sample Code

class LSTM():
  """Hand-rolled single-layer LSTM regressor on the TensorFlow 1.x graph API.

  The network is an unrolled LSTM with zoneout, followed by a ReLU dense layer
  with dropout and a linear output layer. All variables live in the ``lstm``
  variable scope. The class holds no state of its own; every method receives
  what it needs through its arguments.
  """

  def __init__(self):
    pass

  def weight_variable(self, name, shape):
    """Get or create a float32 variable initialized from a truncated normal (mean 0, stddev 0.01)."""
    initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01, dtype=tf.float32)
    return tf.get_variable(name, shape, initializer=initializer)

  def bias_variable(self, name, shape):
    """Get or create a float32 variable initialized to zeros."""
    initializer = tf.constant_initializer(value=0.0, dtype=tf.float32)
    return tf.get_variable(name, shape, initializer=initializer)

  def get_zoneout_mask(self, zoneout_prob, shape):
    """Return a random 0/1 tensor of ``shape``.

    Each entry is ``floor(zoneout_prob + u)`` with ``u`` drawn uniformly from
    [0, 1), so it equals 1 with probability ``zoneout_prob`` and 0 otherwise.
    In ``lstm`` a 1 means "keep the previous state for this unit".
    """
    prob = tf.convert_to_tensor(zoneout_prob)
    return tf.floor(prob + tf.random_uniform(shape))

  def lstm(self, x, length, n_in, n_units_l, n_units_f, n_predictions, batch_size,
           forget_bias, zoneout_prob, keep_prob, reuse=False):
    """Build the forward graph and return the prediction tensor.

    Args:
      x: input tensor; it is reshaped to ``[-1, length, n_in]``.
      length: number of time steps to unroll.
      n_in: number of features per time step.
      n_units_l: number of LSTM units.
      n_units_f: number of units in the dense layer after the LSTM.
      n_predictions: size of the output layer.
      batch_size: number of rows of the zero initial hidden/cell state.
        The state is allocated with this static size, so fed batches are
        expected to contain exactly this many rows.
      forget_bias: constant added to the forget gate pre-activation.
      zoneout_prob: probability that a unit keeps its previous state.
      keep_prob: dropout keep probability for the dense layer.
      reuse: forwarded to ``tf.variable_scope('lstm', ...)``.

    Returns:
      The output of the final linear layer (``n_predictions`` columns).
    """
    x = tf.reshape(x, [-1, length, n_in])
    h = tf.zeros(shape=[batch_size, n_units_l], dtype=tf.float32)
    c = tf.zeros(shape=[batch_size, n_units_l], dtype=tf.float32)

    with tf.variable_scope('lstm', reuse=reuse):
      # One fused weight matrix per input; the four gate pre-activations are
      # split apart after the matmul.
      w_x = self.weight_variable('w_x', [n_in, n_units_l * 4])
      w_h = self.weight_variable('w_h', [n_units_l, n_units_l * 4])
      b = self.bias_variable('b', [n_units_l * 4])

      # The masks are created once, outside the time loop, with one entry per
      # unit: the same mask tensors are used at every step and broadcast over
      # the batch dimension.
      ones = tf.ones(shape=[n_units_l], dtype=tf.float32)
      zoneout_mask_c = self.get_zoneout_mask(zoneout_prob, [n_units_l])
      zoneout_mask_complement_c = ones - zoneout_mask_c
      zoneout_mask_h = self.get_zoneout_mask(zoneout_prob, [n_units_l])
      zoneout_mask_complement_h = ones - zoneout_mask_h

      for t in range(length):
        t_x = tf.matmul(x[:, t, :], w_x)
        t_h = tf.matmul(h, w_h)

        i, f, o, g = tf.split(tf.add(tf.add(t_x, t_h), b), 4, axis=1)

        i = tf.nn.sigmoid(i)
        f = tf.nn.sigmoid(f + forget_bias)
        o = tf.nn.sigmoid(o)
        g = tf.nn.tanh(g)

        # Candidate states of a regular LSTM step. The hidden candidate must
        # be computed from the freshly updated cell state (c_temp); using the
        # previous cell state here would make the output lag one step behind
        # the cell update.
        c_temp = tf.add(tf.multiply(f, c), tf.multiply(i, g))
        h_temp = tf.multiply(o, tf.nn.tanh(c_temp))

        # Zoneout: masked units keep their previous value, the rest take the
        # candidate.
        c = zoneout_mask_c * c + zoneout_mask_complement_c * c_temp
        h = zoneout_mask_h * h + zoneout_mask_complement_h * h_temp

      # Dense ReLU layer with dropout on the last hidden state.
      w_2 = self.weight_variable('w_2', [n_units_l, n_units_f])
      b_2 = self.bias_variable('b_2', [n_units_f])

      y = tf.matmul(h, w_2) + b_2
      y = tf.nn.relu(y)
      y = tf.nn.dropout(y, keep_prob)

      # Linear output layer.
      w_3 = self.weight_variable('w_3', [n_units_f, n_predictions])
      b_3 = self.bias_variable('b_3', [n_predictions])

      y = tf.matmul(y, w_3) + b_3

      return y

  def loss_mse(self, y, t):
    """Return the batch mean of the per-row sum of squared errors."""
    mse = tf.reduce_mean(tf.reduce_sum(tf.square(t - y), axis=1))
    return mse

  def loss_cross_entropy(self, y, t):
    """Return the batch mean cross entropy of ``y`` against targets ``t``.

    ``y`` is clipped to [1e-10, 1.0] before the logarithm.
    """
    cross_entropy = - tf.reduce_mean(tf.reduce_sum(t * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), axis=1))
    return cross_entropy

  def loss_entropy(self, p):
    """Return the batch mean of ``sum(p * log(p))`` per row.

    Note that no sign flip is applied, so this is the negative of the usual
    entropy. ``p`` is clipped to [1e-10, 1.0] inside the logarithm.
    """
    entropy = tf.reduce_mean(tf.reduce_sum(p * tf.log(tf.clip_by_value(p, 1e-10, 1.0)), axis=1))
    return entropy

  def accuracy(self, y, t):
    """Return the fraction of rows where ``argmax(y)`` equals ``argmax(t)``."""
    correct_preds = tf.equal(tf.argmax(y, axis=1), tf.argmax(t, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))
    return accuracy

  def accuracy_rmse(self, y, t):
    """Return the square root of the batch mean of per-row summed squared errors."""
    mse = tf.reduce_mean(tf.reduce_sum(tf.square(t - y), axis=1))
    return tf.sqrt(mse)

  def accuracy_mae(self, y, t, n_predictions):
    """Return the batch mean of per-row summed absolute errors, divided by ``n_predictions``."""
    accuracy = tf.reduce_mean(tf.reduce_sum(tf.abs(y - t), axis=1)) / n_predictions
    return accuracy

  def training(self, loss, learning_rate, var_list):
    """Return an Adam update op that minimizes ``loss`` over ``var_list``."""
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_step = optimizer.minimize(loss, var_list=var_list)
    return train_step

  def fit(self, x_data, y_data, max_length, length, n_in, n_units_l,
          n_units_f, n_predictions, learning_rate, n_epoch, n_iter,
          batch_size, show_step, is_saving, model_path):
    """Train the network and plot loss / RMSE curves after every epoch.

    The default graph is reset first. The model is built with a forget bias
    of 1.0 and a zoneout probability of 0.0, trained with Adam on the summed
    squared error, and evaluated with ``accuracy_rmse``.

    Args:
      x_data: input array; the columns from index ``max_length - length``
        onwards are used (presumably rows are samples and columns are time
        steps -- confirm against the caller).
      y_data: target array, indexed by row in step with ``x_data``.
      max_length: offset base for the column slice of ``x_data``.
      length: number of time steps fed to the network.
      n_in, n_units_l, n_units_f, n_predictions: layer sizes, see ``lstm``.
      learning_rate: Adam learning rate.
      n_epoch: number of epochs.
      n_iter: number of mini-batch iterations per epoch.
      batch_size: rows per mini-batch (sampled with replacement).
      show_step: print training metrics every this many iterations.
      is_saving: if true, save a checkpoint after every epoch.
      model_path: checkpoint path passed to ``tf.train.Saver.save``.

    Returns:
      None.
    """
    tf.reset_default_graph()

    x = tf.placeholder(shape=[None, length], dtype=tf.float32)
    y = tf.placeholder(shape=[None, n_predictions], dtype=tf.float32)
    keep_prob = tf.placeholder(shape=[], dtype=tf.float32)

    preds = self.lstm(x, length, n_in, n_units_l, n_units_f, n_predictions,
                      batch_size, 1.0, 0.0, keep_prob, reuse=False)
    loss = self.loss_mse(preds, y)

    var_list = tf.trainable_variables('lstm')
    train_step = self.training(loss, learning_rate, var_list)

    acc = self.accuracy_rmse(preds, y)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:

      sess.run(init)

      for e in range(n_epoch):
        # Random 80/20 split into training and held-out rows.
        # NOTE(review): the split is redrawn every epoch, so rows held out in
        # one epoch can be trained on in the next; the "Test" curves are only
        # a true hold-out estimate during the first epoch.
        train_indices = np.random.choice(len(x_data), round((len(x_data) * 0.8)),
                                         replace=False)
        test_indices = np.array(list(set(range(len(x_data))) - set(train_indices)))

        x_train = x_data[train_indices][:, max_length - length:]
        x_test = x_data[test_indices][:, max_length - length:]
        y_train = y_data[train_indices]
        y_test = y_data[test_indices]

        # Per-epoch metric histories, one entry per iteration.
        history_loss_train = []
        history_loss_test = []
        history_acc_train = []
        history_acc_test = []

        for i in range(n_iter):
          # Training
          rand_index = np.random.choice(len(x_train), size=batch_size)
          x_batch = x_train[rand_index]
          y_batch = y_train[rand_index]

          feed_dict = {x: x_batch, y: y_batch, keep_prob: 0.7}
          sess.run(train_step, feed_dict=feed_dict)

          # Fetch both metrics in a single run so they are computed from the
          # same forward pass (and the same dropout sample).
          temp_loss, temp_acc = sess.run([loss, acc], feed_dict=feed_dict)

          history_loss_train.append(temp_loss)
          history_acc_train.append(temp_acc)

          if (i + 1) % show_step == 0:
            print('-' * 100)
            print('epoch: ' + str(e + 1) + ' Iteration: ' + str(i + 1) + '  Loss: ' + str(temp_loss)
                  + '  Accuracy: ' + str(temp_acc))

          # Test (dropout disabled)
          rand_index = np.random.choice(len(x_test), size=batch_size)
          x_batch = x_test[rand_index]
          y_batch = y_test[rand_index]

          feed_dict = {x: x_batch, y: y_batch, keep_prob: 1.0}
          temp_loss, temp_acc = sess.run([loss, acc], feed_dict=feed_dict)

          history_loss_test.append(temp_loss)
          history_acc_test.append(temp_acc)

        if is_saving:
          model_path = saver.save(sess, model_path)
          print('-' * 100)
          print('done saving at ', model_path)

        print('-' * 100)
        fig = plt.figure(figsize=(10, 3))
        ax1 = fig.add_subplot(1, 2, 1)
        ax1.plot(range(n_iter), history_loss_train, 'b-', label='Training')
        ax1.plot(range(n_iter), history_loss_test, 'r-', label='Test')
        ax1.set_ylim(0.0, 3.0)
        ax1.set_title('Loss')
        ax1.legend(loc='upper right')

        ax2 = fig.add_subplot(1, 2, 2)
        ax2.plot(range(n_iter), history_acc_train, 'b-', label='Training')
        ax2.plot(range(n_iter), history_acc_test, 'r-', label='Test')
        ax2.set_ylim(0.0, 3.0)
        ax2.set_title('Accuracy')
        ax2.legend(loc='upper right')

        plt.show()

  def predict(self, x_input, length, n_in, n_units_l, n_predictions,
              batch_size, model_path, n_units_f=None):
    """Restore a checkpoint and return predictions for ``x_input``.

    The graph is built with ``reuse=True``, so the ``lstm`` variables must
    already exist in the current default graph (for example because ``fit``
    ran earlier in the same process).

    Args:
      x_input: input array with ``length`` columns; because the initial LSTM
        state has a static batch dimension it is expected to contain exactly
        ``batch_size`` rows.
      length, n_in, n_units_l, n_predictions, batch_size: see ``lstm``; they
        must match the values used for training.
      model_path: checkpoint path passed to ``tf.train.Saver.restore``.
      n_units_f: size of the dense layer. Defaults to ``n_units_l`` when
        omitted, which keeps existing seven-argument calls working.

    Returns:
      The evaluated prediction array.
    """
    if n_units_f is None:
      n_units_f = n_units_l

    x = tf.placeholder(shape=[None, length], dtype=tf.float32)
    keep_prob = tf.placeholder(shape=[], dtype=tf.float32)

    # Same configuration as in fit(): forget bias 1.0, zoneout probability
    # 0.0. Every argument lstm() requires is passed, including n_units_f and
    # keep_prob.
    preds = self.lstm(x, length, n_in, n_units_l, n_units_f, n_predictions,
                      batch_size, 1.0, 0.0, keep_prob, reuse=True)

    saver = tf.train.Saver()

    with tf.Session() as sess:

      saver.restore(sess, model_path)

      # Dropout is disabled at inference time.
      feed_dict = {x: x_input, keep_prob: 1.0}
      prediction = sess.run(preds, feed_dict=feed_dict)

    return prediction

Parameters

lstm = LSTM()

# Data layout
max_length = 33      # offset base for the input column slice in fit()
length = 15          # number of time steps fed to the network
n_in = 1             # features per time step

# Network size
n_units_l = 32       # LSTM units
n_units_f = 32       # dense-layer units
n_predictions = 1    # outputs

# Optimization
learning_rate = 0.01
n_epoch = 3
n_iter = 300         # mini-batch iterations per epoch
batch_size = 500
show_step = 100      # print metrics every show_step iterations

# Checkpointing
model_path = 'datalab/model'
is_saving = False

lstm.fit(
    x_data,
    y_data,
    max_length=max_length,
    length=length,
    n_in=n_in,
    n_units_l=n_units_l,
    n_units_f=n_units_f,
    n_predictions=n_predictions,
    learning_rate=learning_rate,
    n_epoch=n_epoch,
    n_iter=n_iter,
    batch_size=batch_size,
    show_step=show_step,
    is_saving=is_saving,
    model_path=model_path,
)

LSTM for Kaggle (PredictFutureSales) の実装に関するメモ

Reference

Data

Sample Code

Parameters

Output