以前に効果検証のプログラムを作りましたが、
今回はそれを発展させて、各因子（質的変数）の水準ごとに箱ひげ図を描き、水準間の平均値の差・t値・p値を自動で算出するプログラムを作ってみました。
関数
import scipy.stats as stats
import matplotlib.pyplot as plt
def effecttest2(df, columns, y_name: str, auto: bool = False):
    """Plot ``y_name`` per level of each factor and optionally t-test the levels.

    For every column in ``columns`` the rows of ``df`` are grouped by the
    distinct values (levels) of that column. A box plot of ``y_name`` is drawn
    for each level, with the level means overlaid as a line marked with "x".
    All factors share one x axis; each level gets its own integer slot.

    When ``auto`` is true, every pair of levels within a factor is compared
    with ``scipy.stats.ttest_ind``. Bartlett's test on that same pair selects
    between the pooled-variance test (``equal_var=True``) and the
    unequal-variance test (``equal_var=False``). For each pair the two labels,
    the t statistic, the p-value and the difference of the means
    (first minus second) are printed.

    Args:
        df: pandas DataFrame holding the factor columns and ``y_name``.
        columns: iterable of column names to treat as categorical factors.
        y_name: name of the numeric column to compare between levels.
        auto: if true, run and print the pairwise t-tests.

    Returns:
        A tuple ``(ave, dtr, pos, lab)`` of lists with one entry per column:
        ``ave`` holds the per-level means, ``dtr`` the per-level Series of
        ``y_name``, ``pos`` the x-axis slots and ``lab`` the labels formatted
        as ``"<column>_<level>"``.
    """
    ave = []
    dtr = []
    pos = []
    lab = []
    x = 0  # next free x-axis slot; keeps counting across factors
    for col in columns:
        # unique() preserves first-appearance order, so results are
        # reproducible between runs (set() order is hash-dependent for
        # strings). dropna() avoids NaN "levels", which can never match
        # ``== val`` and would only yield empty groups.
        levels = df[col].dropna().unique()
        groups = [df.loc[df[col] == val, y_name] for val in levels]
        ave.append([grp.mean() for grp in groups])
        dtr.append(groups)
        pos.append(list(range(x, x + len(groups))))
        # f-string formatting also works for non-string column names.
        lab.append([f"{col}_{val}" for val in levels])
        x += len(groups)

    if auto:
        for means, groups, names in zip(ave, dtr, lab):
            for j in range(len(groups)):
                for k in range(j + 1, len(groups)):
                    # Test variance homogeneity on the pair actually being
                    # compared (the previous code always used groups 0 and 1).
                    _, p_var = stats.bartlett(groups[j], groups[k])
                    # NOTE(review): the doubled p-value is kept from the
                    # original; bartlett() already returns the test's p-value,
                    # so confirm that the factor 2 is intended.
                    unequal_var = (2 * p_var) <= 0.05
                    t, p = stats.ttest_ind(
                        groups[j], groups[k], equal_var=not unequal_var
                    )
                    print(names[j], names[k])
                    print("t = %f" % (t))
                    print("p = %f" % (p))
                    print("val = %f" % (means[j] - means[k]))
                    print()

    for means, groups, slots, names in zip(ave, dtr, pos, lab):
        plt.boxplot(groups, positions=slots, labels=names)
        plt.plot(slots, means, marker="x")
    plt.xticks(rotation=90)
    plt.show()
    return ave, dtr, pos, lab
使用例
import pandas as pd
# Load the sample data set and name the inputs before running the analysis.
csv_path = "gbsg2.csv"
df = pd.read_csv(csv_path)
# Categorical columns whose levels are compared against each other.
factors = ["horTh", "menostat", "tgrade", "cens"]
# auto=True additionally prints t, p and the mean difference for each pair.
ave, dtr, pos, lab = effecttest2(df, factors, "tsize", auto=True)
horTh_no horTh_yes
t = 0.718341
p = 0.472793
val = 0.817849
menostat_Post menostat_Pre
t = -1.078167
p = 0.281413
val = -1.215378
tgrade_III tgrade_I
t = 2.697732
p = 0.007477
val = 4.911970
tgrade_III tgrade_II
t = 2.119908
p = 0.034422
val = 2.826968
tgrade_I tgrade_II
t = -1.204302
p = 0.229018
val = -2.085002
cens_0 cens_1
t = -3.366996
p = 0.000811
val = -3.773439
これで、出力結果のとおり、同じ質的変数の異なる水準の間で平均値の差を比較できるようになっています。