TL;DR
KaggleのThe Estonia Disaster Passenger ListをShipwreck Survival Prediction - Data Every Day #045に沿ってやっていきます。
実行環境はGoogle Colaboratoryです。
インポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm
import sklearn.svm as svm
import sklearn.neural_network as snn
import tensorflow as tf
データのダウンロード
Google Driveをマウントします。
# Mount Google Drive so the Kaggle credentials stored there become reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
KaggleのAPIクライアントを初期化し、認証します。
認証情報はGoogle Drive内(/content/drive/My Drive/Colab Notebooks/Kaggle
)にkaggle.json
として置いてあります。
import os
# Point the Kaggle client at the Drive folder that holds kaggle.json;
# KaggleApi reads the KAGGLE_CONFIG_DIR environment variable on authenticate().
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
Kaggle APIを使ってデータをダウンロードします。
# Look up the dataset's file listing and build the local path where the
# first (and only, per the listing) file will land after download.
dataset_id = 'christianlillelund/passenger-list-for-the-estonia-ferry-disaster'
dataset = api.dataset_list_files(dataset_id)
file_name = dataset.files[0].name
file_path = os.path.join(api.get_default_download_dir(), file_name)
file_path
'/content/estonia-passenger-list.csv'
# Download the CSV; force=True overwrites any existing copy, quiet=False
# shows the progress bar.
api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)
100%|██████████| 39.1k/39.1k [00:00<00:00, 10.5MB/s]
Downloading estonia-passenger-list.csv to /content
True
データの読み込み
Kaggle APIを使ってダウンロードしてきたCSVファイルを読み込みます。
# Load the downloaded passenger list into a DataFrame and display it.
data = pd.read_csv(file_path)
data
PassengerId | Country | Firstname | Lastname | Sex | Age | Category | Survived | |
---|---|---|---|---|---|---|---|---|
0 | 1 | Sweden | ARVID KALLE | AADLI | M | 62 | P | 0 |
1 | 2 | Estonia | LEA | AALISTE | F | 22 | C | 0 |
2 | 3 | Estonia | AIRI | AAVASTE | F | 21 | C | 0 |
3 | 4 | Sweden | JURI | AAVIK | M | 53 | C | 0 |
4 | 5 | Sweden | BRITTA ELISABET | AHLSTROM | F | 55 | P | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
984 | 985 | Sweden | ANNA INGRID BIRGITTA | OSTROM | F | 60 | P | 0 |
985 | 986 | Sweden | ELMAR MIKAEL | OUN | M | 34 | P | 1 |
986 | 987 | Sweden | ENN | QUNAPUU | M | 77 | P | 0 |
987 | 988 | Sweden | LY | GUNAPUU | F | 87 | P | 0 |
988 | 989 | Sweden | CARL | OVBERG | M | 42 | P | 1 |
989 rows × 8 columns
下準備
# Inspect column dtypes and non-null counts.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 989 non-null int64
1 Country 989 non-null object
2 Firstname 989 non-null object
3 Lastname 989 non-null object
4 Sex 989 non-null object
5 Age 989 non-null int64
6 Category 989 non-null object
7 Survived 989 non-null int64
dtypes: int64(3), object(5)
memory usage: 61.9+ KB
# Count missing values per column (all zero, per the output below).
data.isna().sum()
PassengerId 0
Country 0
Firstname 0
Lastname 0
Sex 0
Age 0
Category 0
Survived 0
dtype: int64
# PassengerId is a pure row identifier with no predictive signal; drop it.
data = data.drop('PassengerId', axis=1)
# Distinct Category codes ('P'/'C' — presumably passenger vs crew; confirm
# against the dataset description).
data['Category'].unique()
array(['P', 'C'], dtype=object)
# Distinct countries — 16 values, one-hot encoded later.
data['Country'].unique()
array(['Sweden', 'Estonia', 'Latvia', 'Russia', 'Germany', 'Finland',
'Great Britain', 'Morocco', 'Denmark', 'France', 'Netherlands',
'Norway', 'Lithuania', 'Nigeria', 'Canada', 'Belarus'],
dtype=object)
# Keep only the first letter of Lastname so it can be ordinally encoded,
# and drop Firstname entirely.
data['Lastname'] = data['Lastname'].apply(lambda x: x[0])
data = data.drop('Firstname', axis=1)
data
Country | Lastname | Sex | Age | Category | Survived | |
---|---|---|---|---|---|---|
0 | Sweden | A | M | 62 | P | 0 |
1 | Estonia | A | F | 22 | C | 0 |
2 | Estonia | A | F | 21 | C | 0 |
3 | Sweden | A | M | 53 | C | 0 |
4 | Sweden | A | F | 55 | P | 0 |
... | ... | ... | ... | ... | ... | ... |
984 | Sweden | O | F | 60 | P | 0 |
985 | Sweden | O | M | 34 | P | 1 |
986 | Sweden | Q | M | 77 | P | 0 |
987 | Sweden | G | F | 87 | P | 0 |
988 | Sweden | O | M | 42 | P | 1 |
989 rows × 6 columns
def binary_encode(df, column, positive_value):
    """Return a copy of df with column mapped to 1 where it equals
    positive_value and 0 everywhere else; df itself is left untouched."""
    encoded = df.copy()
    encoded[column] = (encoded[column] == positive_value).astype(int)
    return encoded
def ordinal_encode(df, column, ordering):
    """Return a copy of df with column replaced by each value's position in
    ordering.

    The rank lookup is precomputed into a dict: the previous implementation
    called ordering.index(x) per row, an O(len(ordering)) scan each time.
    Values absent from ordering raise KeyError (previously ValueError).
    """
    df = df.copy()
    rank = {value: position for position, value in enumerate(ordering)}
    df[column] = df[column].apply(lambda x: rank[x])
    return df
def onehot_encode(df, column):
    """Return a copy of df with column replaced by one indicator column per
    distinct value, appended after the remaining columns."""
    indicators = pd.get_dummies(df[column])
    return pd.concat([df.drop(column, axis=1), indicators], axis=1)
# Ordering used to rank last-name initials alphabetically (0-25).
alphabet_ordering = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Sex: 'M' -> 1, otherwise 0.  Category: 'P' -> 1, otherwise 0.
data = binary_encode(data, 'Sex', 'M')
data = binary_encode(data, 'Category', 'P')
# Lastname initial -> alphabetical rank; Country -> one-hot columns.
data = ordinal_encode(data, 'Lastname', alphabet_ordering)
data = onehot_encode(data, 'Country')
分割とスケーリング
# Separate the target from the features.
y = data['Survived']
X = data.drop(['Survived'], axis=1)
# Split BEFORE scaling: the previous order fit the MinMaxScaler on the full
# dataset, leaking test-set minima/maxima into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
# Scale every feature to [0, 1] using statistics from the training split only.
scaler = sp.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
トレーニング
# Three classifiers to compare: logistic regression, an SVM, and a small MLP.
log_model = slm.LogisticRegression()
svm_model = svm.SVC()
# max_iter raised from the default 200, which stopped the optimizer before
# convergence (see the ConvergenceWarning this cell previously emitted).
ann_model = snn.MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=1000)
log_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
ann_model.fit(X_train, y_train)
/usr/local/lib/python3.6/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:571: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
% self.max_iter, ConvergenceWarning)
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(16, 16), learning_rate='constant',
learning_rate_init=0.001, max_fun=15000, max_iter=200,
momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
power_t=0.5, random_state=None, shuffle=True, solver='adam',
tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
from sklearn.metrics import roc_auc_score
# Accuracy on the held-out split.
log_acc = log_model.score(X_test, y_test)
svm_acc = svm_model.score(X_test, y_test)
ann_acc = ann_model.score(X_test, y_test)
# Hard class predictions, kept for further inspection if needed.
log_preds = log_model.predict(X_test)
svm_preds = svm_model.predict(X_test)
ann_preds = ann_model.predict(X_test)
# ROC AUC must be computed from continuous scores, not hard 0/1 labels —
# labels collapse the ROC curve to a single operating point.  SVC exposes
# decision_function (predict_proba needs probability=True at fit time).
log_auc = roc_auc_score(y_test, log_model.predict_proba(X_test)[:, 1])
svm_auc = roc_auc_score(y_test, svm_model.decision_function(X_test))
ann_auc = roc_auc_score(y_test, ann_model.predict_proba(X_test)[:, 1])
import plotly.express as px
# Bar chart comparing test-set accuracy across the three models.
acc_fig = px.bar(
x = ["Logistic Regression", "Support Vector Machine", "Neural Network"],
y = [log_acc, svm_acc, ann_acc],
labels={'x': "Model", 'y': "Accuracy"},
color=["Logistic Regression", "Support Vector Machine", "Neural Network"],
title="Model Accuracy"
)
acc_fig.show()
# Bar chart comparing ROC AUC across the three models.
auc_fig = px.bar(
x = ["Logistic Regression", "Support Vector Machine", "Neural Network"],
y = [log_auc, svm_auc, ann_auc],
labels={'x': "Model", 'y': "ROC AUC"},
color=["Logistic Regression", "Support Vector Machine", "Neural Network"],
title="Model ROC AUC"
)
auc_fig.show()