Titanic (Logistic Regression)

Posted at 2019-05-06

Logistic regression model: Titanic survival

import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
%matplotlib inline
# Check the data
titanic_df = pd.read_csv('data/titanic_train.csv')
titanic_df.head()
titanic_df.info()
# Drop columns that appear unnecessary
titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
titanic_df.head()
# Fill nulls in the Age column with the mean (into a new AgeFill column)
titanic_df['AgeFill'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
# Show rows that still contain nulls
titanic_df[titanic_df.isnull().any(axis=1)]
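
As an aside, the missing ages could also be filled with the median, which is more robust to outliers. A minimal sketch (the rest of the article keeps the mean-filled AgeFill column):

# Median-based alternative for filling the missing ages (more robust to outliers)
age_fill_median = titanic_df['Age'].fillna(titanic_df['Age'].median())
print(age_fill_median.isnull().sum())  # 0 -> no missing values remain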

Prediction from a single variable

# Predict survival from the ticket fare
data1 = titanic_df.loc[:, ["Fare"]].values
label1 = titanic_df.loc[:, ["Survived"]].values
# Sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
model = LogisticRegression(solver='liblinear')
model.fit(data1, label1.ravel())
# Decision function values (the larger the absolute value, the farther from the decision boundary)
X_test_value = model.decision_function(data1)
# Convert the decision function values to probabilities with the sigmoid
X_test_prob = sigmoid(X_test_value)
print(model.intercept_)
print(model.coef_)
[-0.93290045]
[[0.01506685]]
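
As a quick sanity check (a small sketch, not part of the original cells), the hand-rolled sigmoid applied to the decision function should reproduce the positive-class column of predict_proba:

# sigmoid(decision_function) and predict_proba[:, 1] both give P(Survived=1 | Fare)
print(np.allclose(X_test_prob, model.predict_proba(data1)[:, 1]))  # expected: True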
w_0 = model.intercept_[0]
w_1 = model.coef_[0,0]

def sigmoid2(x):
    return 1 / (1+np.exp(-(w_1*x+w_0)))

# 3000 points from -1 to 500
x_range = np.linspace(-1, 500, 3000)

# Create a new figure (argument: figure size in inches; the default is (8, 6))
plt.figure(figsize=(9, 5))
# Comic-style rendering (xkcd: the well-known web comic)
plt.xkcd()

plt.plot(data1, np.zeros(len(data1)), 'o')
plt.plot(data1, model.predict_proba(data1), 'o')
plt.plot(x_range, sigmoid2(x_range), '-')

# From the plot, the boundary looks to be somewhere around 60
[<matplotlib.lines.Line2D at 0x1d33f424470>]

output_14_1.png
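
The eyeballed boundary of about 60 can be computed exactly: the fitted probability crosses 0.5 where w_1*x + w_0 = 0, i.e. at x = -w_0/w_1 (roughly 62 with the coefficients printed above), which is why the predictions below flip between a fare of 61 and 62.

# Fare at which the predicted survival probability crosses 0.5
boundary_fare = -w_0 / w_1
print(boundary_fare)  # roughly 62 given the intercept and coefficient above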

model.predict([[61]])
array([0], dtype=int64)
model.predict([[62]])
array([1], dtype=int64)

Prediction from two variables

titanic_df['Gender'] = titanic_df['Sex'].map({'female': 0, 'male': 1}).astype(int)
titanic_df['Pclass_Gender'] = titanic_df['Pclass'] + titanic_df['Gender']
titanic_df.head()
   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked  AgeFill  Gender  Pclass_Gender
0         0       3    male  22.0      1      0   7.2500        S     22.0       1              4
1         1       1  female  38.0      1      0  71.2833        C     38.0       0              1
2         1       3  female  26.0      0      0   7.9250        S     26.0       0              3
3         1       1  female  35.0      1      0  53.1000        S     35.0       0              1
4         0       3    male  35.0      0      0   8.0500        S     35.0       1              4
titanic_df = titanic_df.drop(['Pclass', 'Sex', 'Gender','Age'], axis=1)
titanic_df.head()
   Survived  SibSp  Parch     Fare Embarked  AgeFill  Pclass_Gender
0         0      1      0   7.2500        S     22.0              4
1         1      1      0  71.2833        C     38.0              1
2         1      0      0   7.9250        S     26.0              3
3         1      1      0  53.1000        S     35.0              1
4         0      0      0   8.0500        S     35.0              4
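
Pclass_Gender is simply Pclass (1-3) plus Gender (0 = female, 1 = male), so it runs from 1 (1st-class women) to 4 (3rd-class men); note that 2 and 3 each mix two class/sex combinations (e.g. 2 is either a 1st-class man or a 2nd-class woman). A quick look at the distribution (a small sketch):

# Pclass_Gender: 1 = 1st-class women ... 4 = 3rd-class men
print(titanic_df['Pclass_Gender'].value_counts().sort_index())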
np.random.seed(0)

xmin, xmax = -5, 85
ymin, ymax = 0.5, 4.5

index_notsurvived = titanic_df[titanic_df["Survived"]==0].index
index_survived = titanic_df[titanic_df["Survived"]==1].index

from matplotlib.colors import ListedColormap
fig, ax = plt.subplots()
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
sc = ax.scatter(titanic_df.loc[index_notsurvived, 'AgeFill'],
                titanic_df.loc[index_notsurvived, 'Pclass_Gender']+(np.random.rand(len(index_notsurvived))-0.5)*0.1,
                color='r', label='Not Survived', alpha=0.3)
sc = ax.scatter(titanic_df.loc[index_survived, 'AgeFill'],
                titanic_df.loc[index_survived, 'Pclass_Gender']+(np.random.rand(len(index_survived))-0.5)*0.1,
                color='b', label='Survived', alpha=0.3)
ax.set_xlabel('AgeFill')
ax.set_ylabel('Pclass_Gender')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.legend(bbox_to_anchor=(1.4, 1.03))

# From the plot, the boundary seems to be roughly Pclass_Gender 2 with age <= 40
# and Pclass_Gender 3 with age <= 15
<matplotlib.legend.Legend at 0x1d34193ba90>

output_21_1.png

data2 = titanic_df.loc[:, ["AgeFill", "Pclass_Gender"]].values
label2 =  titanic_df.loc[:,["Survived"]].values
data2
array([[22.        ,  4.        ],
       [38.        ,  1.        ],
       [26.        ,  3.        ],
       ...,
       [29.69911765,  3.        ],
       [26.        ,  2.        ],
       [32.        ,  4.        ]])
model2 = LogisticRegression(solver='liblinear')
model2.fit(data2, label2.ravel())
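
The fitted intercept and coefficients are what the decision-boundary line below is built from; printing them makes the next cell easier to follow (a small addition, output not shown):

# Intercept w0 and coefficients (w1 for AgeFill, w2 for Pclass_Gender)
print(model2.intercept_, model2.coef_)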
h = 0.02
xmin, xmax = -5, 85
ymin, ymax = 0.5, 4.5
xx, yy = np.meshgrid(np.arange(xmin, xmax, h), np.arange(ymin, ymax, h))
Z = model2.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

fig, ax = plt.subplots()
levels = np.linspace(0, 1.0)
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
#contour = ax.contourf(xx, yy, Z, cmap=cm, levels=levels, alpha=0.5)

sc = ax.scatter(titanic_df.loc[index_notsurvived, 'AgeFill'],
                titanic_df.loc[index_notsurvived, 'Pclass_Gender']+(np.random.rand(len(index_notsurvived))-0.5)*0.1,
                color='r', label='Not Survived', alpha=0.3)
sc = ax.scatter(titanic_df.loc[index_survived, 'AgeFill'],
                titanic_df.loc[index_survived, 'Pclass_Gender']+(np.random.rand(len(index_survived))-0.5)*0.1,
                color='b', label='Survived', alpha=0.3)

ax.set_xlabel('AgeFill')
ax.set_ylabel('Pclass_Gender')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
#fig.colorbar(contour)

x1 = xmin
x2 = xmax
y1 = -1*(model2.intercept_[0]+model2.coef_[0][0]*xmin)/model2.coef_[0][1]
y2 = -1*(model2.intercept_[0]+model2.coef_[0][0]*xmax)/model2.coef_[0][1]
ax.plot([x1, x2] ,[y1, y2], 'k--')
[<matplotlib.lines.Line2D at 0x1d34196c7f0>]

output_25_1.png
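
The dashed line is the set of points where the linear score is zero: w0 + w1*AgeFill + w2*Pclass_Gender = 0, so Pclass_Gender = -(w0 + w1*AgeFill)/w2, which is exactly what y1 and y2 evaluate at the two x limits. The same thing as a small helper (boundary_y is a hypothetical name, a sketch only):

def boundary_y(age, model):
    # Solve w0 + w1*age + w2*y = 0 for y (the Pclass_Gender value on the boundary)
    w0 = model.intercept_[0]
    w1, w2 = model.coef_[0]
    return -(w0 + w1 * age) / w2

print(boundary_y(xmin, model2), boundary_y(xmax, model2))  # should match y1 and y2 above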

model2.predict([[12, 3]])
array([0], dtype=int64)
model2.predict([[11, 3]])
array([1], dtype=int64)
model2.predict([[47, 2]])
array([1], dtype=int64)
model2.predict([[48, 2]])
array([0], dtype=int64)
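
These flips between neighbouring inputs sit right on the decision boundary; inspecting predict_proba there (a quick sketch) should show probabilities close to 0.5:

# Class probabilities for the same points; the second column is P(Survived=1)
print(model2.predict_proba([[11, 3], [12, 3], [47, 2], [48, 2]]))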

Model evaluation

from sklearn.model_selection import train_test_split
traindata1, testdata1, trainlabel1, testlabel1 = train_test_split(data1, label1, test_size=0.2)
traindata1.shape
(712, 1)
# For simplicity, split each dataset separately
traindata2, testdata2, trainlabel2, testlabel2 = train_test_split(data2, label2, test_size=0.2)
traindata2.shape
(712, 2)
data = titanic_df.loc[:, ].values
label =  titanic_df.loc[:,["Survived"]].values
traindata, testdata, trainlabel, testlabel = train_test_split(data, label, test_size=0.2)
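
Note that calling train_test_split separately gives each model its own random 80/20 split, and the rows change on every run. Passing random_state would make the splits reproducible; a sketch of the same calls with a fixed seed (the results reported below were produced without it):

# Reproducible alternative: a fixed random_state selects the same rows every run
traindata1, testdata1, trainlabel1, testlabel1 = train_test_split(
    data1, label1, test_size=0.2, random_state=0)
traindata2, testdata2, trainlabel2, testlabel2 = train_test_split(
    data2, label2, test_size=0.2, random_state=0)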
eval_model1 = LogisticRegression(solver='liblinear')
eval_model2 = LogisticRegression(solver='liblinear')
predictor_eval1 = eval_model1.fit(traindata1, trainlabel1.ravel()).predict(testdata1)
predictor_eval2 = eval_model2.fit(traindata2, trainlabel2.ravel()).predict(testdata2)

Overall, model2 (two variables) has better scores.

eval_model1.score(traindata1, trainlabel1)
0.6474719101123596
eval_model1.score(testdata1,testlabel1)
0.7318435754189944
eval_model2.score(traindata2, trainlabel2)
0.7710674157303371
eval_model2.score(testdata2,testlabel2)
0.7653631284916201
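
The scores above come from a single random split, so they move around from run to run. Cross-validation averages over several splits and gives a steadier comparison; a minimal sketch with sklearn's cross_val_score (not part of the reported results):

from sklearn.model_selection import cross_val_score
# Mean 5-fold accuracy for the one-variable and two-variable models
print(cross_val_score(LogisticRegression(solver='liblinear'), data1, label1.ravel(), cv=5).mean())
print(cross_val_score(LogisticRegression(solver='liblinear'), data2, label2.ravel(), cv=5).mean())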
from sklearn import metrics
print(metrics.classification_report(testlabel1, predictor_eval1))
print(metrics.classification_report(testlabel2, predictor_eval2))
              precision    recall  f1-score   support

           0       0.74      0.93      0.82       121
           1       0.69      0.31      0.43        58

   micro avg       0.73      0.73      0.73       179
   macro avg       0.72      0.62      0.63       179
weighted avg       0.72      0.73      0.70       179

              precision    recall  f1-score   support

           0       0.81      0.84      0.82       118
           1       0.67      0.62      0.64        61

   micro avg       0.77      0.77      0.77       179
   macro avg       0.74      0.73      0.73       179
weighted avg       0.76      0.77      0.76       179

The report shows the standard evaluation metrics precision, recall, and f1-score, together with support (= the number of samples of each class in y_true), both per class and as several kinds of overall averages.

from sklearn.metrics import confusion_matrix
confusion_matrix1=confusion_matrix(testlabel1, predictor_eval1)
confusion_matrix2=confusion_matrix(testlabel2, predictor_eval2)
confusion_matrix1
array([[113,   8],
       [ 40,  18]], dtype=int64)
confusion_matrix2
array([[99, 19],
       [23, 38]], dtype=int64)
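
The numbers in the classification report can be read straight off these matrices: rows are the true class, columns the predicted class. For model2, for example, precision, recall, and f1 for class 1 follow from the four cells (a small sketch reproducing the reported 0.67 / 0.62 / 0.64):

# confusion_matrix2 layout: [[TN, FP],
#                            [FN, TP]]  (rows: true class, columns: predicted class)
tn, fp, fn, tp = confusion_matrix2.ravel()
precision = tp / (tp + fp)                            # 38 / (19 + 38) ~ 0.67
recall = tp / (tp + fn)                               # 38 / (23 + 38) ~ 0.62
f1 = 2 * precision * recall / (precision + recall)    # ~ 0.64
print(precision, recall, f1)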
fig = plt.figure(figsize=(7, 7))
#plt.title(title)
sns.heatmap(
    confusion_matrix1,
    cmap="Blues",
    annot=True, fmt='.2g',
    square=True,
    #xticklabels=columns,
    #yticklabels=columns,
)
<matplotlib.axes._subplots.AxesSubplot at 0x1d3429988d0>

output_47_1.png

fig = plt.figure(figsize=(7, 7))
#plt.title(title)
sns.heatmap(
    confusion_matrix2,
    cmap="Blues",
    annot=True, fmt='.2g',
    square=True,
    #xticklabels=columns,
    #yticklabels=columns,
)
<matplotlib.axes._subplots.AxesSubplot at 0x1d3423b7a90>

output_48_1.png

import seaborn as sns
sns.set(style="whitegrid")

# Load the example Titanic dataset
titanic = sns.load_dataset("titanic")

# Set up a grid to plot survival probability against several variables
g = sns.PairGrid(titanic, y_vars="survived",
                 x_vars=["class", "sex", "who", "alone"],
                 height=5, aspect=.5)

# Draw a seaborn pointplot onto each Axes
g.map(sns.pointplot, color=sns.xkcd_rgb["plum"])
g.set(ylim=(0, 1))
sns.despine(fig=g.fig, left=True)

plt.show()

output_49_1.png

import seaborn as sns
sns.set(style="darkgrid")

# Load the example titanic dataset
df = sns.load_dataset("titanic")

# Make a custom palette with gendered colors
pal = dict(male="#6495ED", female="#F08080")

# Show the survival probability as a function of age and sex
g = sns.lmplot(x="age", y="survived", col="sex", hue="sex", data=df,
               palette=pal, y_jitter=.02, logistic=True)
g.set(xlim=(0, 80), ylim=(-.05, 1.05))
plt.show()

output_50_0.png
