0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 1 year has passed since last update.

pythonで適当なクラスタを持つサンプルデータを作る

Posted at

pythonで適当なクラスタを持つサンプルデータを作る

1次元の適当なサンプルを作る

こんな感じの分布のサンプルを作ります

plot1.png

ソースです


import matplotlib.pyplot as plt
import random
import sys

def createDistribution(counter):
    samples = []
    for cnt in counter:
        sample = []
        for i in range(cnt):
            sample.append(random.gauss(0,1))
        samples.append(sample)
    
    y = []
    for i in range(len(counter)):
        for j in range(len(samples[i])):
            y.append(samples[i][j] + i * 3)
    return y

def main():
    # サンプル数を指定、配列数=クラスタ数と解釈されます
    x = createDistribution([200, 100, 300]);
    plt.hist(x, bins=100)
    plt.savefig("plot1.png");

if __name__ == "__main__":
    main()
# python distribution1.py

y.append(samples[i][j] + i * 3)

i * 3 は3σ分の距離だけクラスタ中心を離しています

2次元の適当なサンプルを作る

こんな感じの分布のサンプルを作ります

plot2.png

ソースです


import matplotlib.pyplot as plt
import random
import sys

def createDistribution(counter):
    xx, yy = [], []
    for cnt in counter:
        ix, iy = [], []
        for i in range(cnt):
            ix.append(random.gauss(0,1))
            iy.append(random.gauss(0,1))
        xx.append(ix)
        yy.append(iy)
    
    x, y = [], []
    for i in range(len(counter)):
        for j in range(len(xx[i])):
            x.append(xx[i][j] + i * 3)
            y.append(yy[i][j] + i * 3)
    return x, y

def main():
    # サンプル数を指定、配列数=クラスタ数と解釈されます
    x, y = createDistribution([200, 100, 300]);
    plt.scatter(x, y)
    plt.savefig("plot2.png");

if __name__ == "__main__":
    main()
# python distribution2.py

身長体重のサンプルデータを作ってみる

正規分布乱数を利用して身長体重のサンプルデータを作ってみます
こんな感じのデータを作ります

身長 体重
175.481225 68.018599
152.675035 58.921772
174.513797 64.603158
170.562922 76.858922
180.125424 93.762856
177.360964 63.895156
173.432610 52.541947
168.964758 71.661168
175.044804 93.379319

平均身長はこちらを参考にして20歳の平均身長を使用しました
https://www.e-stat.go.jp/dbview?sid=0003224177

身長別体重の平均と分散はこちらを参考にして適当に決めました
https://www.kumachu.gr.jp/department/other/nutrition/kg.php

ソースです


import io
import numpy as np
import random
import sys

# https://www.e-stat.go.jp/dbview?sid=0003224177
# https://www.kumachu.gr.jp/department/other/nutrition/kg.php
# 身長, 体重, やせ, 肥満
physical_params = [
    [130,37.2,31.3,42.3],
    [131,37.8,31.7,42.9],
    [132,38.3,32.2,43.6],
    [133,38.9,32.7,44.2],
    [134,39.5,33.2,44.9],
    [135,40.1,33.7,45.6],
    [136,40.7,34.2,46.2],
    [137,41.3,34.7,46.9],
    [138,41.9,35.2,47.6],
    [139,42.5,35.7,48.3],
    [140,43.1,36.3,49],
    [141,43.7,36.8,49.7],
    [142,44.4,37.3,50.4],
    [143,45,37.8,51.1],
    [144,45.6,38.4,51.8],
    [145,46.3,38.9,52.6],
    [146,46.9,39.4,53.3],
    [147,47.5,40,54],
    [148,48.2,40.5,54.8],
    [149,48.8,41.1,55.5],
    [150,49.5,41.6,56.3],
    [151,50.2,42.2,57],
    [152,50.8,42.7,57.8],
    [153,51.5,43.3,58.5],
    [154,52.2,43.9,59.3],
    [155,52.9,44.4,60.1],
    [156,53.5,45,60.8],
    [157,54.2,45.6,61.6],
    [158,54.9,46.2,62.4],
    [159,55.6,46.8,63.2],
    [160,56.3,47.4,64],
    [161,57,48,64.8],
    [162,57.7,48.6,65.6],
    [163,58.5,49.2,66.4],
    [164,59.2,49.8,67.2],
    [165,59.9,50.4,68.1],
    [166,60.6,51,68.9],
    [167,61.4,51.6,69.7],
    [168,62.1,52.2,70.6],
    [169,62.8,52.8,71.4],
    [170,63.6,53.5,72.3],
    [171,64.3,54.1,73.1],
    [172,65.1,54.7,74],
    [173,65.8,55.4,74.8],
    [174,66.6,56,75.7],
    [175,67.4,56.7,76.6],
    [176,68.1,57.3,77.4],
    [177,68.9,58,78.3],
    [178,69.7,58.6,79.2],
    [179,70.5,59.3,80.1],
    [180,71.3,59.9,81],
    [181,72.1,60.6,81.9],
    [182,72.9,61.3,82.8],
    [183,73.7,62,83.7],
    [184,74.5,62.6,84.6],
    [185,75.3,63.3,85.6],
    [186,76.1,64,86.5],
    [187,76.9,64.7,87.4],
    [188,77.8,65.4,88.4],
    [189,78.6,66.1,89.3],
]

def near_index(x, n):
    a = (x - n) ** 2
    idx = np.argmin(a)
    return idx

def main():
    params = np.array(physical_params).astype(float)
    params_len = len(params)
    height = params[:,0]
    weight = params[:,1]
    skinny = params[:,2]
    fatness = params[:,3]
    
    # base params # 20歳の身長平均、偏差を使用する
    height_avr = 169.1
    height_var = 6.8
    sample_count = 100
    
    samples = []
    for i in range(sample_count):
        h = random.gauss(height_avr, height_var)
        idx = near_index(height, h)
        weight_avr = weight[idx]
        weight_var = abs(fatness[idx] - skinny[idx]) / 2
        w = random.gauss(weight_avr, weight_var)
        samples.append([h, w])
    s = io.StringIO()
    np.savetxt(s, samples, fmt="%f", delimiter="\t")
    print(s.getvalue())

if __name__ == "__main__":
    main()
# python physical_feature.py

使い道とか

売上とか在庫とかなんでもいいのでダミーデータが件数欲しい時に一応それっぽい分散をしていてほしいとか、そういう時に役に立つかもしれません
後は機械学習系のアルゴリズムの試験やる時にあからさまなクラスターを使用して性能チェックするとかなんとか、そういう時にも役に立つかもしれません

以上です

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?