Introduction
This article is a walkthrough of 言語処理100本ノック (the NLP 100 Exercises).
I am recording my solutions to all 100 exercises on Qiita.
The language used is Python.
This post presents my solutions to the second half of Chapter 6: Machine Learning (problems 55-59).
My solutions to the first half of Chapter 6 (problems 50-54) are here.
55. Creating a confusion matrix
Create the confusion matrix of the logistic regression model trained in problem 52, on both the training data and the evaluation data.
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Label codes in the order e, b, m, t (matching the encoding from problem 52)
labels = [0, 1, 2, 3]

# Confusion matrix on the training data
cm = confusion_matrix(Y_train, Y_pred_train, labels=labels)
cm = pd.DataFrame(data=cm, index=["e", "b", "m", "t"], columns=["e", "b", "m", "t"])
sns.heatmap(cm, square=True, cbar=True, annot=True, cmap="Blues")
plt.xlabel("Predict")
plt.ylabel("True")
plt.title("Train_Confusion")
plt.savefig("[PATH]/Train_confusion.png")
plt.gcf().clear()

# Confusion matrix on the evaluation data
cm = confusion_matrix(Y_test, Y_pred_test, labels=labels)
cm = pd.DataFrame(data=cm, index=["e", "b", "m", "t"], columns=["e", "b", "m", "t"])
sns.heatmap(cm, square=True, cbar=True, annot=True, cmap="Blues")
plt.xlabel("Predict")
plt.ylabel("True")
plt.title("Test_Confusion")
plt.savefig("[PATH]/Test_confusion.png")
Comment
The categories with many examples are easy to predict, but performance on the scarce ones is underwhelming.
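To see the imbalance behind this, the per-category counts can be checked directly. A minimal sketch, assuming Y_train holds the encoded labels from problem 52:

import pandas as pd

# Number of training examples per category code (0: e, 1: b, 2: m, 3: t)
print(pd.Series(Y_train).value_counts().sort_index())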
56. Measuring precision, recall, and F1 score
Measure the precision, recall, and F1 score of the logistic regression model trained in problem 52 on the evaluation data. Compute the precision, recall, and F1 score for each category, then aggregate the per-category performance using the micro-average and the macro-average.
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

df = pd.DataFrame(columns=["Precision", "Recall", "F1"], index=["e", "b", "m", "t", "macro-average", "micro-average"])
# average=None returns one score per category, in label order (e, b, m, t)
df.loc["e":"t", "Precision"] = precision_score(Y_test, Y_pred_test, average=None)
df.loc["e":"t", "Recall"] = recall_score(Y_test, Y_pred_test, average=None)
df.loc["e":"t", "F1"] = f1_score(Y_test, Y_pred_test, average=None)
# Aggregate the per-category scores with macro- and micro-averaging
df.loc["macro-average", :] = [precision_score(Y_test, Y_pred_test, average="macro"), recall_score(Y_test, Y_pred_test, average="macro"), f1_score(Y_test, Y_pred_test, average="macro")]
df.loc["micro-average", :] = [precision_score(Y_test, Y_pred_test, average="micro"), recall_score(Y_test, Y_pred_test, average="micro"), f1_score(Y_test, Y_pred_test, average="micro")]
df
Comment
It is convenient that scikit-learn provides a function for each of these metrics.
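As an aside, sklearn.metrics.classification_report bundles all of these into a single table. A minimal sketch, reusing the same Y_test and Y_pred_test:

from sklearn.metrics import classification_report

# Per-class precision / recall / F1 plus averaged summary rows in one call
print(classification_report(Y_test, Y_pred_test, target_names=["e", "b", "m", "t"]))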
57. Checking feature weights
Check the 10 features with the highest weights and the 10 features with the lowest weights in the logistic regression model trained in problem 52.
import pickle
import pandas as pd
import numpy as np

lr = pickle.load(open("[PATH]/Logistic_model.sav", "rb"))
vectorizer = pickle.load(open("[PATH]/vectorizer.sav", "rb"))
weights = lr.coef_  # shape: (n_categories, n_features)
feature_names = vectorizer.get_feature_names_out()
cat_list = ["e", "b", "m", "t"]
df_result57 = pd.DataFrame()
for cat, weight in zip(cat_list, weights):
    # Feature indices sorted by weight, descending
    order = np.argsort(weight)[::-1]
    top = order[:10]           # 10 highest-weighted features
    bottom = order[::-1][:10]  # 10 lowest-weighted features, most negative first
    df = pd.DataFrame()
    df["WEIGHT_TOP"] = weight[top]
    df["WORD_TOP"] = feature_names[top]
    df["WEIGHT_BOTTOM"] = weight[bottom]
    df["WORD_BOTTOM"] = feature_names[bottom]
    df["CATEGORY"] = cat
    df["RANK"] = list(range(1, 11))
    df = df.reindex(columns=["RANK", "CATEGORY", "WEIGHT_TOP", "WORD_TOP", "WEIGHT_BOTTOM", "WORD_BOTTOM"])
    df_result57 = pd.concat([df_result57, df])
df_result57.set_index("RANK", drop=True, inplace=True)
df_result57
Comment
What makes logistic regression interesting is that it does more than solve the classification problem: you can also inspect which features drive the classification.
58. Changing the regularization parameter
When training a logistic regression model, the degree of overfitting during training can be controlled by adjusting the regularization parameter. Train logistic regression models with different regularization parameters, and compute the accuracy on the training, validation, and evaluation data. Summarize the results of the experiment in a graph with the regularization parameter on the horizontal axis and accuracy on the vertical axis.
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

def Encoder(sign):
    # Map category labels to the integer codes used in problem 52
    if sign == "e":
        code = 0
    elif sign == "b":
        code = 1
    elif sign == "m":
        code = 2
    elif sign == "t":
        code = 3
    else:
        raise ValueError("unknown category: {}".format(sign))
    return code

def LearningPreparation(name):
    # Load the feature file and split into features X and labels Y
    df = pd.read_csv("[PATH]/{}.feature.txt".format(name), index_col=0)
    df["CATEGORY"] = df["CATEGORY"].map(Encoder)
    X = df.iloc[:, 2:].values.tolist()
    Y = df["CATEGORY"].values.tolist()
    return X, Y

C_list = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000]
result58 = []
X_train, Y_train = LearningPreparation("train")
X_valid, Y_valid = LearningPreparation("valid")
X_test, Y_test = LearningPreparation("test")
for c in tqdm(C_list):
    lr = LogisticRegression(C=c)
    lr.fit(X_train, Y_train)
    Y_pred_train = lr.predict(X_train)
    train_result = accuracy_score(y_true=Y_train, y_pred=Y_pred_train)
    Y_pred_valid = lr.predict(X_valid)
    valid_result = accuracy_score(y_true=Y_valid, y_pred=Y_pred_valid)
    Y_pred_test = lr.predict(X_test)
    test_result = accuracy_score(y_true=Y_test, y_pred=Y_pred_test)
    result58.append([c, train_result, valid_result, test_result])

### Plotting ###
C_list = [x[0] for x in result58]
train_list = [x[1] for x in result58]
valid_list = [x[2] for x in result58]
test_list = [x[3] for x in result58]
fig, ax = plt.subplots()
ctr, cva, cte = "red", "blue", "yellow"
ltr, lva, lte = "train", "valid", "test"
ax.set_xlabel("C")
ax.set_ylabel("accuracy")
ax.set_title("Train, Valid, Test accuracy")
ax.set_xscale("log")
ax.grid()
ax.plot(C_list, train_list, color=ctr, label=ltr)
ax.plot(C_list, valid_list, color=cva, label=lva)
ax.plot(C_list, test_list, color=cte, label=lte)
ax.legend(loc=0)
fig.tight_layout()
plt.show()
fig.savefig("[PATH]/Train_Valid_test_accuracy.png")
Comment
Apparently it is good practice to vary hyperparameters on a logarithmic scale.
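Instead of writing such a grid out by hand as in C_list above, numpy can generate it directly. A small sketch:

import numpy as np

# 9 values from 10^-4 to 10^4, evenly spaced on a log scale
print(np.logspace(-4, 4, num=9))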
59. Searching for hyperparameters
Train category classification models while varying the learning algorithm and its hyperparameters. Find the learning algorithm and parameters that achieve the highest accuracy on the validation data. Also compute the accuracy on the evaluation data when that learning algorithm and those parameters are used.
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.svm import SVC  # support vector machine
from sklearn.neighbors import KNeighborsClassifier  # k-nearest neighbors
from sklearn.tree import DecisionTreeClassifier  # decision tree
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost
from sklearn.naive_bayes import GaussianNB  # Gaussian naive Bayes
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis  # quadratic discriminant analysis
from sklearn.neural_network import MLPClassifier  # multilayer perceptron
import pickle
import pandas as pd

def savemodel(model):
    # Persist the current best model
    with open("[PATH]/bestmodel.pickle", mode="wb") as f:
        pickle.dump(model, f, protocol=2)

def savelog(name, acc, para):
    # Append the model name, validation accuracy, and parameters to a log file
    with open("[PATH]/log.txt", mode="a+") as f:
        f.write("{}, {}, {}\n".format(name, acc, para))

def Encoder(sign):
    # Map category labels to the integer codes used in problem 52
    if sign == "e":
        code = 0
    elif sign == "b":
        code = 1
    elif sign == "m":
        code = 2
    elif sign == "t":
        code = 3
    else:
        raise ValueError("unknown category: {}".format(sign))
    return code

def LearningPreparation(name):
    # Load the feature file and split into features X and labels Y
    df = pd.read_csv("[PATH]/{}.feature.txt".format(name), index_col=0)
    df["CATEGORY"] = df["CATEGORY"].map(Encoder)
    X = df.iloc[:, 2:].values.tolist()
    Y = df["CATEGORY"].values.tolist()
    return X, Y
X_train, Y_train = LearningPreparation("train")
X_valid, Y_valid = LearningPreparation("valid")
X_test, Y_test = LearningPreparation("test")

tune_list = [0.01, 0.1, 1]
top_result = 0
model_name = ""
parameter_list = []

print("Linear SVM")
for c in tune_list:
    print(c)
    model = SVC(C=c, kernel="linear")
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_valid)
    valid_result = accuracy_score(y_true=Y_valid, y_pred=Y_pred)
    if valid_result > top_result:
        model_name = "Linear SVM"
        top_result = valid_result
        parameter_list = [c]
        print("best result: {}".format(top_result))
        savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
        savemodel(model)

print("Gaussian SVM")
for c in tune_list:
    for ga in tune_list:
        print(c, ga)
        model = SVC(C=c, gamma=ga, kernel="rbf")
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_valid)
        valid_result = accuracy_score(y_true=Y_valid, y_pred=Y_pred)
        if valid_result > top_result:
            model_name = "Gaussian SVM"
            top_result = valid_result
            parameter_list = [c, ga]
            print("best result: {}".format(top_result))
            savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
            savemodel(model)

print("Poly SVM")
for de in [3, 5, 10]:
    for c in tune_list:
        for ga in tune_list:
            print(de, c, ga)
            model = SVC(C=c, gamma=ga, degree=de, kernel="poly")
            model.fit(X_train, Y_train)
            Y_pred = model.predict(X_valid)
            valid_result = accuracy_score(y_true=Y_valid, y_pred=Y_pred)
            if valid_result > top_result:
                model_name = "Poly SVM"
                top_result = valid_result
                parameter_list = [de, c, ga]
                print("best result: {}".format(top_result))
                savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
                savemodel(model)

print("Sigmoid SVM")
for c in tune_list:
    for ga in tune_list:
        print(c, ga)
        model = SVC(C=c, gamma=ga, kernel="sigmoid")
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_valid)
        valid_result = accuracy_score(y_true=Y_valid, y_pred=Y_pred)
        if valid_result > top_result:
            model_name = "Sigmoid SVM"
            top_result = valid_result
            parameter_list = [c, ga]
            print("best result: {}".format(top_result))
            savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
            savemodel(model)

print("Logistic Regression")
for c in tune_list:
    print(c)
    model = LogisticRegression(C=c)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_valid)
    valid_result = accuracy_score(y_true=Y_valid, y_pred=Y_pred)
    if valid_result > top_result:
        model_name = "Logistic Regression"
        top_result = valid_result
        parameter_list = [c]
        print("best result: {}".format(top_result))
        savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
        savemodel(model)
print("KNeighborsClassifier")
for k in [1, 3, 5, 10]:
print(k)
model = KNeighborsClassifier(n_neighbors=k)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "KNeighborsClassifier"
top_result = valid_result
parameter_list = [k]
print("best result: {}".format(top_result))
savemodel(model)
print("DecisionTreeClassifier")
for md in [3, 5, 10, 30, 50, 100]:
print(md)
model = DecisionTreeClassifier(max_depth=md)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "DecisionTreeClassifier"
top_model = model
top_result = valid_result
parameter_list = [md]
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("AdaBoostClassifier")
model = AdaBoostClassifier()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "AdaBoostClassifier"
top_result = valid_result
parameter_list = []
print("best result: {}".format(top_result))
savemodel(model)
print("GaussianNB")
model = GaussianNB()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "GaussianNB"
top_model = model
top_result = valid_result
parameter_list = []
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("QuadraticDiscriminantAnalysis")
model = QuadraticDiscriminantAnalysis()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "QuadraticDiscriminantAnalysis"
top_result = valid_result
parameter_list = []
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("MLPClassifier 1-layer")
for activation in ["logistic", "tanh", "relu"]:
for solver in ["sgd", "adam"]:
for alpha in tune_list:
for hidden_layer_sizes in [50, 100, 300]:
print(activation,solver,alpha,hidden_layer_sizes)
model = MLPClassifier(activation=activation, solver=solver, hidden_layer_sizes = (hidden_layer_sizes), early_stopping=True)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "MLPClassifier 1-layer"
top_result = valid_result
parameter_list = [activation, solver, alpha, hidden_layer_sizes]
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("MLPClassifier 2-layer")
for activation in ["logistic", "tanh", "relu"]:
for solver in ["sgd", "adam"]:
for alpha in tune_list:
for hidden_layer_sizes in [50, 100, 300]:
print(activation,solver,alpha,hidden_layer_sizes)
model = MLPClassifier(activation=activation, solver=solver, hidden_layer_sizes = (hidden_layer_sizes, hidden_layer_sizes), early_stopping=True)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "MLPClassifier 2-layer"
top_result = valid_result
parameter_list = [activation, solver, alpha, hidden_layer_sizes, hidden_layer_sizes]
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("MLPClassifier 3-layer")
for activation in ["logistic", "tanh", "relu"]:
for solver in ["adam"]:
for alpha in tune_list:
for hidden_layer_sizes in [50, 100, 300]:
print(activation,solver,alpha,hidden_layer_sizes)
model = MLPClassifier(activation=activation, solver=solver, hidden_layer_sizes = (hidden_layer_sizes, hidden_layer_sizes, hidden_layer_sizes), early_stopping=True)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "MLPClassifier 3-layer"
top_result = valid_result
parameter_list = [activation, solver, alpha, hidden_layer_sizes, hidden_layer_sizes]
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("MLPClassifier 4-layer")
for activation in ["logistic", "tanh", "relu"]:
for solver in ["adam"]:
for alpha in tune_list:
for hidden_layer_sizes in [50, 100, 300]:
print(activation,solver,alpha,hidden_layer_sizes)
model = MLPClassifier(activation=activation, solver=solver, hidden_layer_sizes = (hidden_layer_sizes, hidden_layer_sizes, hidden_layer_sizes, hidden_layer_sizes), early_stopping=True)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "MLPClassifier 4-layer"
top_result = valid_result
parameter_list = [activation, solver, alpha, hidden_layer_sizes, hidden_layer_sizes]
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("MLPClassifier 5-layer")
for activation in ["logistic", "tanh", "relu"]:
for solver in ["adam"]:
for alpha in tune_list:
for hidden_layer_sizes in [50, 100, 300]:
print(activation,solver,alpha,hidden_layer_sizes)
model = MLPClassifier(activation=activation, solver=solver, hidden_layer_sizes = (hidden_layer_sizes, hidden_layer_sizes, hidden_layer_sizes, hidden_layer_sizes, hidden_layer_sizes), early_stopping=True)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_valid)
valid_result = accuracy_score(y_true = Y_valid, y_pred = Y_pred)
if valid_result > top_result:
model_name = "MLPClassifier 5-layer"
top_result = valid_result
parameter_list = [activation, solver, alpha, hidden_layer_sizes, hidden_layer_sizes]
print("best result: {}".format(top_result))
savelog(model_name, valid_result, " ".join(map(str, parameter_list)))
savemodel(model)
print("Top Result")
print(top_result)
print("model_name")
print(model_name)
print("parameter_list")
print(parameter_list)
Top Result
0.9220389805097451
model_name
MLPClassifier 1-layer
parameter_list
['logistic', 'adam', 1, 50]
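The problem also asks for the accuracy on the evaluation data under the best algorithm and parameters. Since the script saves the best model via savemodel, this can be computed afterwards; a minimal sketch, assuming X_test and Y_test from LearningPreparation are still in scope:

import pickle
from sklearn.metrics import accuracy_score

# Load the model that scored best on the validation data and evaluate it on the test data
with open("[PATH]/bestmodel.pickle", mode="rb") as f:
    best_model = pickle.load(f)
Y_pred_test = best_model.predict(X_test)
print("test accuracy: {}".format(accuracy_score(y_true=Y_test, y_pred=Y_pred_test)))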
Comment
I used all the classification algorithms available in scikit-learn. A classic pitfall for students in machine-learning labs is that tuning hyperparameters can make you feel like you have done research, so watch out for that (I am very much guilty of this myself).
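For a single estimator, scikit-learn's GridSearchCV can also automate this kind of search loop. A minimal sketch for the logistic regression case (note that it uses cross-validation on the training data rather than the fixed train/valid split above, and the parameter grid here is just an illustrative assumption):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Exhaustively try each parameter combination, scored by cross-validated accuracy
param_grid = {"C": [0.01, 0.1, 1]}
search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, scoring="accuracy", cv=5)
search.fit(X_train, Y_train)
print(search.best_params_, search.best_score_)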
Solutions to other chapters