LoginSignup
0
1

More than 5 years have passed since last update.

K-means clustering の実装に関するメモ

Posted at

ポイント

  • K-means clustering を実装し、具体的な数値例で確認。
  • MNIST データを使用し、clustering と prediction を試行。

データ 1

import matplotlib.cm as cm

n_input = 10
dim = 2

# Two Gaussian blobs: a wide cluster around (7, 7) and a tighter one around (5, 5).
cluster_a = np.random.normal(loc = [7.0, 7.0], scale = [2.0, 2.0], size = [n_input//2, dim])
cluster_b = np.random.normal(loc = [5.0, 5.0], scale = [1.0, 1.0], size = [n_input//2, dim])
x = np.concatenate([cluster_a, cluster_b], axis = 0).astype(np.float32)

# Ground-truth labels: 0 for the first blob, 1 for the second.
labels_a = np.array([0] * (n_input//2))
labels_b = np.array([1] * (n_input//2))
y = np.concatenate([labels_a, labels_b]).astype(np.int32)

# Scatter the points, colored by their true blob.
fig = plt.figure(figsize = (5, 3))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(x[:, 0], x[:, 1], c = y, cmap = cm.Accent)
plt.show()

image.png

image.png

サンプルコード 1-1 (Class)

class k_means():
  """Minimal K-means clustering demo, built two ways.

  * inference_1 -- one Lloyd-iteration graph written by hand on raw TF ops.
  * inference_2 -- the same via tf.contrib.factorization.KMeans.

  Written for TensorFlow 1.x graph mode (tf.Variable / tf.assign / Session).
  """

  def __init__(self):
    pass

  # Manual implementation on raw TF ops.
  def inference_1(self, x, dim, n_clusters, init_indices=None):
    """Build the graph for one K-means centroid-update step.

    Args:
      x: input points, array-like of shape [n_input, dim], float32.
      dim: dimensionality of each point.
      n_clusters: number of centroids.
      init_indices: indices of points used as initial centroids.
        Defaults to [1, 6] to preserve the original demo behavior
        (assumes the input has at least 7 points).

    Returns:
      (c, all_scores, scores, cluster_idx, training_op); running
      training_op performs one centroid update.
    """
    if init_indices is None:
      init_indices = [1, 6]
    c_init = x[init_indices]

    c = tf.Variable(c_init, dtype = tf.float32)

    # Broadcast points against centroids: [n_input, n_clusters, dim].
    x_tmp = tf.tile(tf.expand_dims(x, 1), [1, n_clusters, 1])
    c_tmp = tf.expand_dims(c, 0)

    # Squared Euclidean distance from every point to every centroid.
    all_scores = tf.reduce_sum(tf.square(x_tmp - c_tmp), axis = 2)
    scores = tf.reduce_min(all_scores, axis = 1)
    cluster_idx = tf.argmin(all_scores, axis = 1)
    cluster_one_hot = tf.one_hot(cluster_idx, depth = n_clusters)

    # BUG FIX: the expansion axis must be 2 (the new "dim" axis), not
    # n_clusters -- the original passed n_clusters here, which was only
    # correct by coincidence because n_clusters == 2 in the demo.
    tmp_1 = tf.tile(tf.expand_dims(cluster_one_hot, 2), [1, 1, dim])
    # Member count per cluster, clamped at 1 so an emptied cluster does
    # not divide 0/0 and poison the centroids with NaN.
    tmp_2 = tf.expand_dims(tf.reduce_sum(cluster_one_hot, axis = 0), axis = 1)
    tmp_2 = tf.maximum(tmp_2, 1.0)

    # New centroid = mean of the points assigned to each cluster.
    c_next = tf.reduce_sum(x_tmp * tmp_1, axis = 0) / tmp_2

    training_op = tf.assign(c, c_next)

    return c, all_scores, scores, cluster_idx, training_op

  # Implementation via the contrib API.
  def inference_2(self, x, n_clusters, init_indices=None):
    """Build the tf.contrib.factorization.KMeans training graph.

    Returns the tuple from KMeans.training_graph():
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     init_op, training_op).
    """
    if init_indices is None:
      init_indices = [1, 6]
    c_init = x[init_indices]

    graph = tf.contrib.factorization.KMeans(x, \
                    num_clusters = n_clusters, initial_clusters = c_init)
#    graph = tf.contrib.factorization.KMeans(x, num_clusters = n_clusters)

    return graph.training_graph()

  def fit(self):
    pass

サンプルコード 1-2 (Manual)

tf.reset_default_graph()

km = k_means()
dim = 2
n_clusters = 2
c, all_scores, scores, cluster_idx, training_op = km.inference_1(x, dim, n_clusters)

init_var = tf.global_variables_initializer()


def _report(sess, label):
  # Dump the current centroids, distance matrix and assignments.
  print ('-' * 30)
  print (label)
  print ('Centroid')
  print (sess.run(c))
  print ('All Scores')
  print (sess.run(all_scores))
  print ('Cluster Index')
  print (sess.run(cluster_idx))


with tf.Session() as sess:

  sess.run(init_var)

  _report(sess, 'Initial')
  initial_cluster_idx = sess.run(cluster_idx)

  # Two manual Lloyd iterations, reporting after each.
  sess.run(training_op)
  _report(sess, '1st train')

  sess.run(training_op)
  _report(sess, '2nd train')

  last_cluster_idx = sess.run(cluster_idx)


def _plot(assignments, title):
  # Scatter the data colored by the given cluster assignments.
  fig = plt.figure(figsize = (5, 3))
  ax = fig.add_subplot(1, 1, 1)
  ax.scatter(x[:, 0], x[:, 1], c = assignments, cmap = cm.RdBu)
  ax.set_title(title)
  plt.show()


print ('-' * 30)
_plot(initial_cluster_idx, 'Initial clustering')

print ('-' * 30)
_plot(last_cluster_idx, 'Last clustering')

image.png

image.png

サンプルコード 1-3 (API)

tf.reset_default_graph()

km = k_means()
n_clusters = 2
(all_scores, cluster_idx, scores,
 cluster_centers_initialized, init_op, training_op) = km.inference_2(x, n_clusters)

init_var = tf.global_variables_initializer()


def _show_state(sess, label):
  # Print the distance matrix and current assignments.
  print ('-' * 30)
  print (label)
  print ('All Scores')
  print (sess.run(all_scores))
  print ('Cluster Index')
  print (sess.run(cluster_idx))


with tf.Session() as sess:

  sess.run(init_var)
  sess.run(init_op)

  _show_state(sess, 'Initial')
  initial_cluster_idx = sess.run(cluster_idx)

  # Two training steps through the contrib KMeans graph.
  sess.run(training_op)
  _show_state(sess, '1st train')

  sess.run(training_op)
  _show_state(sess, '2nd train')

  last_cluster_idx = sess.run(cluster_idx)


def _draw(assignments, title):
  # Scatter the data colored by the given cluster assignments.
  fig = plt.figure(figsize = (5, 3))
  ax = fig.add_subplot(1, 1, 1)
  ax.scatter(x[:, 0], x[:, 1], c = assignments, cmap = cm.RdBu)
  ax.set_title(title)
  plt.show()


# cluster_idx from training_graph() is a tuple; element 0 holds the assignments.
print ('-' * 30)
_draw(initial_cluster_idx[0], 'Initial clustering')

print ('-' * 30)
_draw(last_cluster_idx[0], 'Last clustering')

image.png

image.png

データ 2

import matplotlib.cm as cm

def create_data(n_input, dim, n_classes):
  """Draw n_input points from three fixed 2-D Gaussian blobs.

  Returns (x, y, y_one_hot): float32 points of shape [n, dim], int32
  labels in {0, 1, 2}, and their one-hot encoding of shape [n, n_classes].
  Note: the three blob centers are hard-coded, so n_classes is
  effectively expected to be 3.
  """
  per_class = n_input // n_classes

  # Same draw order/arguments as before so a fixed RNG seed reproduces data.
  blobs = [
      np.random.normal(loc = [7.0, 7.0], scale = [1.5, 1.5], size = [per_class, dim]),
      np.random.normal(loc = [5.0, 1.0], scale = [1.5, 1.5], size = [per_class, dim]),
      np.random.normal(loc = [2.0, 4.0], scale = [1.5, 1.5], size = [per_class, dim]),
  ]
  x = np.concatenate(blobs, axis = 0).astype(np.float32)

  # Labels 0..2, one contiguous run per blob.
  y = np.repeat(np.arange(3), per_class).astype(np.int32)
  y_one_hot = np.identity(n_classes)[y].astype(np.int32)

  return x, y, y_one_hot


n_input = 30
dim = 2
n_classes = 3

x_train, y_train, y_one_hot_train = create_data(n_input, dim, n_classes)
x_test, y_test, y_one_hot_test = create_data(n_input, dim, n_classes)

# Train / test scatter plots side by side on a shared axis range.
fig = plt.figure(figsize = (8, 3))

for pos, (pts, labels, title) in enumerate(
    [(x_train, y_train, 'Train'), (x_test, y_test, 'Test')], start = 1):
  ax = fig.add_subplot(1, 2, pos)
  ax.scatter(pts[:, 0], pts[:, 1], c = labels, cmap = 'Accent')  # Accent, RdYlBu
  ax.set_xlim(-3.0, 12.0)
  ax.set_ylim(-3.0, 12.0)
  ax.set_title(title)

plt.show()

image.png

サンプルコード 2-1 (Class)

class k_means_2():
  """K-means clustering via tf.contrib.factorization.KMeans, plus a
  majority-vote mapping from clusters to classes so clustering quality
  can be reported as classification accuracy.

  Written for TensorFlow 1.x graph mode (placeholders / Session).
  """
  def __init__(self):
    pass

  def inference(self, x, n_clusters):
    """Build the contrib KMeans training graph with a random seed.

    Returns the tuple from KMeans.training_graph():
    (all_scores, cluster_idx, scores, cluster_centers_initialized,
     init_op, training_op).
    """
    # Fresh random seed per call so repeated fits start from different centroids.
    r_seed = np.random.randint(100)
    graph = tf.contrib.factorization.KMeans(x, num_clusters = n_clusters, \
                                           random_seed = r_seed)

    return graph.training_graph()

  def predict(self):
    pass

  def accuracy(self, cluster_idx, labels, lookup_table):
    """Fraction of samples whose cluster's majority class matches the label.

    Args:
      cluster_idx: cluster assignment per sample.
      labels: one-hot ground-truth labels, shape [n, n_classes].
      lookup_table: shape [n_clusters, 1]; majority class per cluster.

    Returns:
      Scalar accuracy tensor.
    """
    # Map each cluster id to its majority class, then compare with argmax label.
    y = tf.nn.embedding_lookup(lookup_table, cluster_idx)
    correct_preds = tf.equal(tf.squeeze(y), tf.argmax(labels, axis = 1))
    accuracy = tf.reduce_mean(tf.cast(correct_preds, tf.float32))

    return accuracy

  def fit(self, input_train, labels_train, input_test, labels_test, \
          dim, n_clusters, n_classes, n_iter, show_step):
    """Run K-means for n_iter steps; every show_step steps, print the
    average distance score and mapped accuracy for train/test, and
    scatter-plot the current cluster assignments.
    """
    tf.reset_default_graph()

    x = tf.placeholder(shape = [None, dim], dtype = tf.float32)
    t = tf.placeholder(shape = [None, n_classes], dtype = tf.int32)

    feed_dict_train = {x: input_train, t: labels_train}
    feed_dict_test = {x: input_test, t: labels_test}

    all_scores, cluster_idx, scores, cluster_centers_initialized, init_op, training_op = self.inference(x, n_clusters)

    # Mean distance of each sample to its nearest centroid (lower is better).
    average_score = tf.reduce_mean(scores)

    init_var = tf.global_variables_initializer()

    with tf.Session() as sess:

      sess.run(init_var)
      # Centroid initialization must run before any training step.
      sess.run(init_op, feed_dict = feed_dict_train)      

      for i in range(n_iter):
        sess.run(training_op, feed_dict = feed_dict_train)

        if (i + 1) % show_step == 0:
          average_score_train = sess.run(average_score, feed_dict = feed_dict_train)
          average_score_test = sess.run(average_score, feed_dict = feed_dict_test)
          # cluster_idx from training_graph() is a tuple; element 0 holds assignments.
          cluster_idx_train = sess.run(cluster_idx, feed_dict = feed_dict_train)
          cluster_idx_test = sess.run(cluster_idx, feed_dict = feed_dict_test)

          # Vote table: count[k, c] = number of training samples of class c
          # assigned to cluster k (labels are one-hot, so += adds one vote).
          count = np.zeros(shape = [n_clusters, n_classes], dtype = np.int32)

          for j in range(len(labels_train)):
            count[cluster_idx_train[0][j]] += labels_train[j]

          # Majority class per cluster, shaped [n_clusters, 1] for embedding_lookup.
          lookup_table = np.reshape(np.argmax(count, axis = 1), [-1, 1])

          # NOTE(review): self.accuracy builds new graph ops on every report
          # step, so the default graph grows during the session -- fine for a
          # short demo, but worth restructuring for long runs.
          accuracy_train = sess.run(self.accuracy(cluster_idx_train[0], labels_train, lookup_table))
          accuracy_test = sess.run(self.accuracy(cluster_idx_test[0], labels_test, lookup_table))


          print ('--------------------')
          print (' Iteration: ' + str(i + 1))
          print (' Average score (train): {:.2f}        Average score (test): {:.2f}' \
                 .format(average_score_train, average_score_test))
          print (' Accuracy (train): {:.3f}             Accuracy (test): {:.3f}' \
                 .format(accuracy_train, accuracy_test))

          fig = plt.figure(figsize = (8, 3))

          ax = fig.add_subplot(1, 2, 1)
          ax.scatter(input_train[:, 0], input_train[:, 1], c = cluster_idx_train[0], cmap = 'RdYlBu')  # Accent, RdYlBu
          ax.set_xlim(-3.0, 12.0)
          ax.set_ylim(-3.0, 12.0)
          ax.set_title('Train')

          ax = fig.add_subplot(1, 2, 2)
          ax.scatter(input_test[:, 0], input_test[:, 1], c = cluster_idx_test[0], cmap = 'RdYlBu')  # Accent, RdYlBu
          ax.set_xlim(-3.0, 12.0)
          ax.set_ylim(-3.0, 12.0)
          ax.set_title('Test')

          plt.show()

サンプルコード 2-2 (n_clusters = 3)

input_train = x_train
labels_train = y_one_hot_train
input_test = x_test
labels_test = y_one_hot_test

dim = 2
n_clusters = 3
n_classes = 3
n_iter = 3
show_step = 1

# BUG FIX: `kmeans` is never instantiated anywhere in the visible source
# (the earlier samples used `km`), so the call below raised NameError.
kmeans = k_means_2()
kmeans.fit(input_train, labels_train, input_test, labels_test, \
          dim, n_clusters, n_classes, n_iter, show_step)

image.png

サンプルコード 2-3 (n_clusters = 5)

input_train = x_train
labels_train = y_one_hot_train
input_test = x_test
labels_test = y_one_hot_test

dim = 2
n_clusters = 5
n_classes = 3
n_iter = 3
show_step = 1

# BUG FIX: `kmeans` is never instantiated anywhere in the visible source,
# so the call below raised NameError.  Create the model before fitting.
kmeans = k_means_2()
kmeans.fit(input_train, labels_train, input_test, labels_test, \
          dim, n_clusters, n_classes, n_iter, show_step)

image.png

データ 3 (MNIST)

サンプルコード 3

# `mnist` is presumably the TF-1.x tutorial dataset object
# (tensorflow.examples.tutorials.mnist input_data) -- loaded outside this
# snippet; TODO confirm.
input_train = mnist.train.images[:55000]
labels_train = mnist.train.labels[:55000].astype(np.int32)
input_test = mnist.test.images[:10000]
labels_test = mnist.test.labels[:10000].astype(np.int32)

dim = 28*28
n_clusters = 20
n_classes = 10
n_iter = 50
show_step = 10

# BUG FIX: `kmeans` is never instantiated anywhere in the visible source,
# so the call below raised NameError.  Create the model before fitting.
kmeans = k_means_2()
kmeans.fit(input_train, labels_train, input_test, labels_test, \
          dim, n_clusters, n_classes, n_iter, show_step)

image.png

0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
1