とりあえず経験分布になんらかのモデルをフィットさせたい Python/Scipy #Python

とりあえず、分布にフィットさせたい

ガウス分布、ｔ分布、いろいろありますが、正直どの分布を使ってモデリングすれば良いのかわからないときや、手っ取り早くそれっぽい分析をしたいときがあるかと思います。
そんなときに使えるコードを見つけたので、紹介します。

ほぼこれなんです→ https://stackoverflow.com/a/37616966

scipyの分布は82種類

調べてみたところ、scipyには82種類の分布が用意されてあり、どれを使ってもfit()で再尤度推定によるパラメータ調整ができるとのこと。

    DISTRIBUTIONS = [        
        st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,
        st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,
        st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.gennorm,st.genexpon,
        st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,
        st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss,
        st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable,
        st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf,
        st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal,
        st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda,
        st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy
    ]

↑どれにすれば良いんだ！？

答え、全部試して1番良いのを使う

残差平方和でフィットした82種類のモデルを評価して、それぞれの[分布名,分布パラメータ,誤差]を格納したリストを誤差の少ない順に返す、魔法の関数がありました。

#データからモデルを作る
def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding best fit distribution to data"""
    # データからヒストグラムを作成する
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # 以下の分布でフィッティングする
    DISTRIBUTIONS = [        
        st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,
        st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,
        st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.gennorm,st.genexpon,
        st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,
        st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss,
        st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable,
        st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf,
        st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal,
        st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda,
        st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy
    ]

    # [分布名,分布パラメータ,誤差]を格納するためのリスト
    l = []
    i = 0
    print('starting|----------')
    print("progress|", end="")
    #　分布毎にそれぞれパラメータを推測してみる
    for distribution in DISTRIBUTIONS:

        #　プログレスバー
        i += 1 
        progress = i/len(DISTRIBUTIONS)*10
        number = int(progress)
        if number > int((i-1)/len(DISTRIBUTIONS)*10):
            print("#", end="")
        # 分布によってはフィットできないこともあるので、
        # フィットできなければ、passする
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # 分布をフィットさせる
                params = distribution.fit(data)

                # わかりやすい用にパラメータをばらばらにする
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # フィットさせた確率密度関数を計算して、分布にフィットさせる
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)

                # 残差平方和を誤差として計算する
                sse = np.sum(np.power(y - pdf, 2.0))

                # l
                l.append([
                    distribution.name,
                    params,
                    sse,
                ])
        except Exception:
            pass
    l.sort(key=lambda x: x[2])
    return l

使用例

楽にプロットするための関数とimportね↓

import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
import sys

matplotlib.style.use('ggplot')

def make_pdf(dist, params, size=10000):
    """Generate distributions's Probability Distribution Function """

    # わかりやすい用にパラメータをばらばらにする
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # 分布の開始と終わりを指定する
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=locb, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # 確率密度関数を作って、Seriesに変換する
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)

    return pdf

# プロットするための関数
# めっちゃてきとうでごめんなさいｗ
def plot(i):
    fit_name, fit_params, fit_sse = fit_list[i]
    dist = getattr(st, fit_name)
    # Make PDF with best params 
    pdf = make_pdf(dist, fit_params)

    # Display
    plt.figure(figsize=(12,8))
    ax = pdf.plot(lw=2, label='PDF', legend=True)
    data.plot(kind='hist', bins=50, density=True, alpha=0.5, label='Data', legend=True, ax=ax)

    param_names = (dist.shapes + ', loc, scale').split(', ') if dist.shapes else ['loc', 'scale']
    param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, fit_params)])
    dist_str = '{}({}) sse={}'.format(fit_name, param_str, fit_sse)

    ax.set_title(u'El Niño sea temp. with best fit distribution \n' + dist_str)
    ax.set_xlabel(u'Temp. (°C)')
    ax.set_ylabel('Frequency')
    plt.show()

例として使うデータはstatsmodelsのエルニーニョデータセット。
こんな感じです。

#　エルニーニョデータをロード
data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel())

# 確率密度関数と比較する用にヒストグラムを作成する
plt.figure(figsize=(12,8))
ax = data.plot(kind='hist', bins=50, density=True, alpha=0.5)

# あとで使うので、Ｙ軸の情報を保存する
dataYLim = ax.get_ylim()

次に全部のモデルを試す関数を使います。ちょっと時間かかりますが、我慢強く待ってください。
僕は待ちきれなくて（というか動いているかどうか心配で）、stackoverflowにあったオリジナルにプログレスバーを加えました。笑

# 作った関数を使って、最適な分布を探す
fit_list = best_fit_distribution(data, 20, ax)

> starting|----------
> progress|##########

そして、プロット。
いくつか例としてぷろっとしますね。

plot(0)

plot(15)

plot(67)