ロジスティック回帰モデル ~タイタニック生存~
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
%matplotlib inline
# Load the training data and take a first look at it.
titanic_df = pd.read_csv('data/titanic_train.csv')
titanic_df.head()
titanic_df.info()
# Drop columns that are (presumably) not needed as features.
titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
titanic_df.head()
# Fill missing Age values with the column *mean* and store the result in a
# new AgeFill column; the original Age column is left untouched here.
titanic_df['AgeFill'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
# Show the rows that still contain at least one null value.
# `axis` is passed by keyword because recent pandas releases no longer accept
# it positionally (`any(1)`).
titanic_df[titanic_df.isnull().any(axis=1)]
1変数からの予測
# Predict survival from the fare (a single explanatory variable).
# The feature matrix stays 2-D (n_samples, 1); the target is flattened to 1-D
# because scikit-learn expects y with shape (n_samples,) and otherwise emits
# a DataConversionWarning.
data1 = titanic_df.loc[:, ["Fare"]].values
label1 = titanic_df.loc[:, ["Survived"]].values.ravel()
# Sigmoid (logistic) function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of *x*.

    Built on ``np.exp``, so it works element-wise on NumPy arrays as well as
    on plain scalars.
    """
    return 1 / (1 + np.exp(-x))
# Fit a logistic regression on the single fare feature.
model = LogisticRegression()
model.fit(X=data1, y=label1)
# Decision-function values for the same rows the model was fitted on
# (the larger the absolute value, the farther from the decision boundary).
X_test_value = model.decision_function(X=data1)
# Turn the decision-function values into probabilities with the sigmoid.
X_test_prob = sigmoid(X_test_value)
# Show the fitted intercept, then the fitted coefficient.
print(model.intercept_)
print(model.coef_)
[-0.93290045]
[[0.01506685]]
# Pull the fitted intercept and the single slope out of the fare model.
w_0 = model.intercept_[0]
w_1 = model.coef_[0, 0]


def sigmoid2(x):
    """Return ``sigmoid(w_1 * x + w_0)`` for the fitted fare model.

    ``w_0`` and ``w_1`` are the module-level intercept and slope; they are
    looked up when the function is called, not when it is defined.
    Works element-wise on NumPy arrays because it is built on ``np.exp``.
    """
    return 1 / (1 + np.exp(-(w_1 * x + w_0)))
# 3000 evenly spaced fare values from -1 to 500, used to draw the fitted curve
x_range = np.linspace(-1, 500, 3000)
# Open a new figure window (argument: figure size, in inches)
plt.figure(figsize=(9,5))
# Switch to the hand-drawn "xkcd" (web comic) drawing style
plt.xkcd()
# Observed fares, drawn along y = 0
plt.plot(data1,np.zeros(len(data1)), 'o')
# Predicted probabilities for every observed fare; predict_proba returns one
# column per class, so one series of points is drawn per class
plt.plot(data1, model.predict_proba(data1), 'o')
# Fitted sigmoid curve over the whole fare range
plt.plot(x_range, sigmoid2(x_range), '-')
# Reading the plot, the decision boundary looks to be somewhere around a fare of 60
[<matplotlib.lines.Line2D at 0x1d33f424470>]
# Probe the decision boundary of the fare model: the recorded outputs show a
# fare of 61 predicted as class 0 and a fare of 62 predicted as class 1.
model.predict([[61]])
array([0], dtype=int64)
model.predict([[62]])
array([1], dtype=int64)
2変数からの予測
# Encode sex as an integer code: female -> 0, male -> 1.
gender_codes = titanic_df['Sex'].map({'female': 0, 'male': 1})
titanic_df['Gender'] = gender_codes.astype(int)
# Combined feature: passenger class plus the gender code.
titanic_df['Pclass_Gender'] = titanic_df['Pclass'].add(titanic_df['Gender'])
titanic_df.head()
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | AgeFill | Gender | Pclass_Gender | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | 22.0 | 1 | 4 |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | 38.0 | 0 | 1 |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | 26.0 | 0 | 3 |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | 35.0 | 0 | 1 |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | 35.0 | 1 | 4 |
# Remove the raw columns that AgeFill and Pclass_Gender now stand in for.
titanic_df = titanic_df.drop(columns=['Pclass', 'Sex', 'Gender', 'Age'])
titanic_df.head()
Survived | SibSp | Parch | Fare | Embarked | AgeFill | Pclass_Gender | |
---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 7.2500 | S | 22.0 | 4 |
1 | 1 | 1 | 0 | 71.2833 | C | 38.0 | 1 |
2 | 1 | 0 | 0 | 7.9250 | S | 26.0 | 3 |
3 | 1 | 1 | 0 | 53.1000 | S | 35.0 | 1 |
4 | 0 | 0 | 0 | 8.0500 | S | 35.0 | 4 |
# Seed NumPy's global RNG so the jitter added in the scatter plots is
# reproducible.  np.random.seed has to be *called*: assigning to it
# (`np.random.seed = 0`) replaces the function with an int and seeds nothing.
np.random.seed(0)
# Axis limits shared by the AgeFill / Pclass_Gender plots.
xmin, xmax = -5, 85
ymin, ymax = 0.5, 4.5
# NOTE: these names are inverted relative to their contents: index_survived
# holds the rows with Survived == 0 and index_notsurvived the rows with
# Survived == 1.  They are kept as-is because the plotting code pairs
# index_survived with the 'Not Survived' legend label and index_notsurvived
# with 'Survived'.
index_survived = titanic_df[titanic_df["Survived"]==0].index
index_notsurvived = titanic_df[titanic_df["Survived"]==1].index
# Scatter plot of AgeFill against Pclass_Gender, coloured by outcome.
from matplotlib.colors import ListedColormap
fig, ax = plt.subplots()
# Colormaps prepared here; neither is passed to the scatter calls in this cell
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
# Rows in index_survived (Survived == 0) are drawn in red as 'Not Survived'.
# A small random vertical jitter (within +/-0.05) spreads overlapping points.
sc = ax.scatter(titanic_df.loc[index_survived, 'AgeFill'],
titanic_df.loc[index_survived, 'Pclass_Gender']+(np.random.rand(len(index_survived))-0.5)*0.1,
color='r', label='Not Survived', alpha=0.3)
# Rows in index_notsurvived (Survived == 1) are drawn in blue as 'Survived'.
sc = ax.scatter(titanic_df.loc[index_notsurvived, 'AgeFill'],
titanic_df.loc[index_notsurvived, 'Pclass_Gender']+(np.random.rand(len(index_notsurvived))-0.5)*0.1,
color='b', label='Survived', alpha=0.3)
ax.set_xlabel('AgeFill')
ax.set_ylabel('Pclass_Gender')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
# Anchor the legend outside the axes, to the right
ax.legend(bbox_to_anchor=(1.4, 1.03))
# Reading the plot, the boundary looks to be roughly "Pclass_Gender 2 and age 40 or under"
# and "Pclass_Gender 3 and age 15 or under"
<matplotlib.legend.Legend at 0x1d34193ba90>
# Two explanatory variables: imputed age and the class/gender feature.
data2 = titanic_df.loc[:, ["AgeFill", "Pclass_Gender"]].values
# Flatten the target to 1-D: scikit-learn expects y with shape (n_samples,)
# and emits a DataConversionWarning for a column vector.
label2 = titanic_df.loc[:, ["Survived"]].values.ravel()
data2
array([[22. , 4. ],
[38. , 1. ],
[26. , 3. ],
...,
[29.69911765, 3. ],
[26. , 2. ],
[32. , 4. ]])
# Fit a second logistic regression, this time on the two-feature matrix.
model2 = LogisticRegression()
model2.fit(X=data2, y=label2)
C:\Users\taka0\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\Users\taka0\Anaconda3\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2', random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False)
# Redraw the AgeFill / Pclass_Gender scatter and overlay model2's decision boundary.
# Grid step and plot limits
h = 0.02
xmin, xmax = -5, 85
ymin, ymax = 0.5, 4.5
# Evaluate model2 on a regular grid covering the plot area; column 1 of
# predict_proba is kept (presumably P(Survived == 1) -- confirm against model2.classes_)
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
Z = model2.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
fig, ax = plt.subplots()
# levels, cm and cm_bright are only referenced by the disabled contourf call below
levels = np.linspace(0, 1.0)
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
#contour = ax.contourf(xx, yy, Z, cmap=cm, levels=levels, alpha=0.5)
# Rows in index_survived (Survived == 0) are drawn in red as 'Not Survived',
# with a small random vertical jitter to spread overlapping points
sc = ax.scatter(titanic_df.loc[index_survived, 'AgeFill'],
titanic_df.loc[index_survived, 'Pclass_Gender']+(np.random.rand(len(index_survived))-0.5)*0.1,
color='r', label='Not Survived', alpha=0.3)
# Rows in index_notsurvived (Survived == 1) are drawn in blue as 'Survived'
sc = ax.scatter(titanic_df.loc[index_notsurvived, 'AgeFill'],
titanic_df.loc[index_notsurvived, 'Pclass_Gender']+(np.random.rand(len(index_notsurvived))-0.5)*0.1,
color='b', label='Survived', alpha=0.3)
ax.set_xlabel('AgeFill')
ax.set_ylabel('Pclass_Gender')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
#fig.colorbar(contour)
# Decision boundary: the line where intercept + coef0*x + coef1*y = 0,
# i.e. y = -(intercept + coef0*x) / coef1, evaluated at both x limits
x1 = xmin
x2 = xmax
y1 = -1*(model2.intercept_[0]+model2.coef_[0][0]*xmin)/model2.coef_[0][1]
y2 = -1*(model2.intercept_[0]+model2.coef_[0][0]*xmax)/model2.coef_[0][1]
# Draw the boundary as a black dashed line
ax.plot([x1, x2] ,[y1, y2], 'k--')
[<matplotlib.lines.Line2D at 0x1d34196c7f0>]
# Probe model2's decision boundary with [AgeFill, Pclass_Gender] inputs.
# Recorded outputs: at Pclass_Gender 3 the prediction flips between ages 11
# (class 1) and 12 (class 0); at Pclass_Gender 2 it flips between ages 47
# (class 1) and 48 (class 0).
model2.predict([[12, 3]])
array([0], dtype=int64)
model2.predict([[11, 3]])
array([1], dtype=int64)
model2.predict([[47, 2]])
array([1], dtype=int64)
model2.predict([[48, 2]])
array([0], dtype=int64)
モデルの評価
from sklearn.model_selection import train_test_split

# Hold out 20% of the one-variable data as a test set.
(traindata1, testdata1,
 trainlabel1, testlabel1) = train_test_split(data1, label1, test_size=0.2)
traindata1.shape
(712, 1)
# For simplicity the two-variable data is split on its own, independently of
# the one-variable split, so the two models are not evaluated on the same
# held-out rows.
(traindata2, testdata2,
 trainlabel2, testlabel2) = train_test_split(data2, label2, test_size=0.2)
traindata2.shape
(712, 2)
# Feature matrix built from every remaining column *except* the target:
# leaving Survived in the features would leak the label into the model.
# NOTE(review): Embarked is still a string column at this point (see the
# head() output), so it needs encoding before this matrix can be fitted.
data = titanic_df.drop(columns=["Survived"]).values
# 1-D target, the shape scikit-learn expects for y.
label = titanic_df.loc[:, ["Survived"]].values.ravel()
traindata, testdata, trainlabel, testlabel = train_test_split(data, label, test_size=0.2)
# One-variable model: fit on its training split, then predict its test split.
eval_model1 = LogisticRegression()
eval_model1.fit(traindata1, trainlabel1)
predictor_eval1 = eval_model1.predict(testdata1)

# Two-variable model: same procedure on its own split.
eval_model2 = LogisticRegression()
eval_model2.fit(traindata2, trainlabel2)
predictor_eval2 = eval_model2.predict(testdata2)
C:\Users\taka0\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\Users\taka0\Anaconda3\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
C:\Users\taka0\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\Users\taka0\Anaconda3\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
全体的にmodel2(2変数)のほうがスコアがいい
# Score each model on its own training split and then on its own test split.
# The test scores agree with the accuracy read off the confusion matrices
# (e.g. (113 + 18) / 179 for the one-variable model).
eval_model1.score(traindata1, trainlabel1)
0.6474719101123596
eval_model1.score(testdata1,testlabel1)
0.7318435754189944
eval_model2.score(traindata2, trainlabel2)
0.7710674157303371
eval_model2.score(testdata2,testlabel2)
0.7653631284916201
from sklearn import metrics

# Per-class precision / recall / f1 for the one-variable model ...
print(metrics.classification_report(y_true=testlabel1, y_pred=predictor_eval1))
# ... and for the two-variable model.
print(metrics.classification_report(y_true=testlabel2, y_pred=predictor_eval2))
precision recall f1-score support
0 0.74 0.93 0.82 121
1 0.69 0.31 0.43 58
micro avg 0.73 0.73 0.73 179
macro avg 0.72 0.62 0.63 179
weighted avg 0.72 0.73 0.70 179
precision recall f1-score support
0 0.81 0.84 0.82 118
1 0.67 0.62 0.64 61
micro avg 0.77 0.77 0.77 179
macro avg 0.74 0.73 0.73 179
weighted avg 0.76 0.77 0.76 179
precision, recall, f1-scoreという代表的な評価指標と、support(=y_trueに含まれるデータ数)が、クラスごとと全体の各種平均で出ている。
from sklearn.metrics import confusion_matrix

# Confusion matrices of the test-set predictions, one per model.
confusion_matrix1 = confusion_matrix(y_true=testlabel1, y_pred=predictor_eval1)
confusion_matrix2 = confusion_matrix(y_true=testlabel2, y_pred=predictor_eval2)
# Display both matrices.  Each row sums to the support of one true class in
# the classification reports (113 + 8 = 121, 40 + 18 = 58), so rows are the
# true classes and columns the predicted classes.
confusion_matrix1
array([[113, 8],
[ 40, 18]], dtype=int64)
confusion_matrix2
array([[99, 19],
[23, 38]], dtype=int64)
# Heatmap of the one-variable model's confusion matrix.
fig = plt.figure(figsize=(7, 7))
sns.heatmap(
    confusion_matrix1,
    cmap="Blues",
    annot=True,
    # Integer format: the cells are counts, and '.2g' would render any count
    # of 100 or more in scientific notation (113 -> '1.1e+02').
    fmt='d',
    linewidths=0,
    linecolor='white',
    cbar=True,
    square=True,
)
<matplotlib.axes._subplots.AxesSubplot at 0x1d3429988d0>
# Heatmap of the two-variable model's confusion matrix.
fig = plt.figure(figsize=(7, 7))
sns.heatmap(
    confusion_matrix2,
    cmap="Blues",
    annot=True,
    # Integer format: the cells are counts, and '.2g' would switch to
    # scientific notation as soon as a count reaches 100.
    fmt='d',
    linewidths=0,
    linecolor='white',
    cbar=True,
    square=True,
)
<matplotlib.axes._subplots.AxesSubplot at 0x1d3423b7a90>
import seaborn as sns
sns.set(style="whitegrid")
# Load the example Titanic dataset
titanic = sns.load_dataset("titanic")
# Set up a grid to plot survival probability against several variables.
# `height` is the current name of the keyword formerly called `size`;
# seaborn warns about the old spelling.
g = sns.PairGrid(titanic, y_vars="survived",
                 x_vars=["class", "sex", "who", "alone"],
                 height=5, aspect=.5)
# Draw a seaborn pointplot onto each Axes
g.map(sns.pointplot, color=sns.xkcd_rgb["plum"])
# Survival is a 0/1 value, so fix the y axis to the [0, 1] range
g.set(ylim=(0, 1))
# Remove the left spine from every Axes of the grid
sns.despine(fig=g.fig, left=True)
plt.show()
C:\Users\taka0\Anaconda3\lib\site-packages\seaborn\axisgrid.py:1241: UserWarning: The `size` paramter has been renamed to `height`; please update your code.
warnings.warn(UserWarning(msg))
C:\Users\taka0\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
import seaborn as sns
sns.set(style="darkgrid")
# Load the example titanic dataset
df = sns.load_dataset("titanic")
# Custom palette: one colour per value of the "sex" column
pal = {"male": "#6495ED", "female": "#F08080"}
# Show the survival probability as a function of age, one panel per sex,
# with a logistic fit and a little vertical jitter on the 0/1 outcome
g = sns.lmplot(
    data=df,
    x="age",
    y="survived",
    col="sex",
    hue="sex",
    palette=pal,
    y_jitter=.02,
    logistic=True,
)
g.set(xlim=(0, 80), ylim=(-.05, 1.05))
plt.show()