LoginSignup
1

More than 3 years have passed since last update.

Data Every Day: エストニア災害乗客リスト

Posted at

tldr

KaggleのThe Estonia Disaster Passenger ListをShipwreck Survival Prediction - Data Every Day #045に沿ってやっていきます。

実行環境はGoogle Colaboratoryです。

インポート

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm
import sklearn.svm as svm
import sklearn.neural_network as snn

import tensorflow as tf

データのダウンロード

Google Driveをマウントします。

# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

KaggleのAPIクライアントを初期化し、認証します。
認証情報はGoogle Drive内(/content/drive/My Drive/Colab Notebooks/Kaggle)にkaggle.jsonとして置いてあります。

import os
# Directory on the mounted Drive that holds kaggle.json (the API credentials).
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
# NOTE(review): this is set before the kaggle import below; presumably the
# client reads KAGGLE_CONFIG_DIR when it loads credentials, so confirm before
# reordering these statements.
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path

from kaggle.api.kaggle_api_extended import KaggleApi
# Create the API client and authenticate it.
api = KaggleApi()
api.authenticate() 

Kaggle APIを使ってデータをダウンロードします。

# Kaggle dataset slug in "<owner>/<dataset>" form.
dataset_id = 'christianlillelund/passenger-list-for-the-estonia-ferry-disaster'
# List the files in the dataset and take the first one.
dataset = api.dataset_list_files(dataset_id)
file_name = dataset.files[0].name
# Local path the file will have once downloaded into the client's default
# download directory.
file_path = os.path.join(api.get_default_download_dir(), file_name)
# Notebook cell output: display the resolved path.
file_path
'/content/estonia-passenger-list.csv'
# Download the selected file; quiet=False shows the progress bar.
# NOTE(review): force=True presumably re-downloads even when the file already
# exists locally; confirm against the Kaggle API client.
api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)
100%|██████████| 39.1k/39.1k [00:00<00:00, 10.5MB/s]

Downloading estonia-passenger-list.csv to /content









True

データの読み込み

Pandasを使ってダウンロードしてきたCSVファイルを読み込みます。

# Load the downloaded CSV into a DataFrame and display it.
data = pd.read_csv(file_path)
data
PassengerId Country Firstname Lastname Sex Age Category Survived
0 1 Sweden ARVID KALLE AADLI M 62 P 0
1 2 Estonia LEA AALISTE F 22 C 0
2 3 Estonia AIRI AAVASTE F 21 C 0
3 4 Sweden JURI AAVIK M 53 C 0
4 5 Sweden BRITTA ELISABET AHLSTROM F 55 P 0
... ... ... ... ... ... ... ... ...
984 985 Sweden ANNA INGRID BIRGITTA OSTROM F 60 P 0
985 986 Sweden ELMAR MIKAEL OUN M 34 P 1
986 987 Sweden ENN QUNAPUU M 77 P 0
987 988 Sweden LY GUNAPUU F 87 P 0
988 989 Sweden CARL OVBERG M 42 P 1

989 rows × 8 columns

下準備

# Print each column's dtype and non-null count.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  989 non-null    int64 
 1   Country      989 non-null    object
 2   Firstname    989 non-null    object
 3   Lastname     989 non-null    object
 4   Sex          989 non-null    object
 5   Age          989 non-null    int64 
 6   Category     989 non-null    object
 7   Survived     989 non-null    int64 
dtypes: int64(3), object(5)
memory usage: 61.9+ KB
# Count missing values per column.
data.isna().sum()
PassengerId    0
Country        0
Firstname      0
Lastname       0
Sex            0
Age            0
Category       0
Survived       0
dtype: int64
# Remove the PassengerId column before building features.
data = data.drop('PassengerId', axis=1)
# Inspect the distinct values of Category.
data['Category'].unique()
array(['P', 'C'], dtype=object)
# Inspect the distinct values of Country.
data['Country'].unique()
array(['Sweden', 'Estonia', 'Latvia', 'Russia', 'Germany', 'Finland',
       'Great Britain', 'Morocco', 'Denmark', 'France', 'Netherlands',
       'Norway', 'Lithuania', 'Nigeria', 'Canada', 'Belarus'],
      dtype=object)
# Reduce each last name to its first character.
data['Lastname'] = data['Lastname'].apply(lambda x: x[0])
# Drop the first-name column entirely.
data = data.drop('Firstname', axis=1)
# Notebook cell output: display the resulting frame.
data
Country Lastname Sex Age Category Survived
0 Sweden A M 62 P 0
1 Estonia A F 22 C 0
2 Estonia A F 21 C 0
3 Sweden A M 53 C 0
4 Sweden A F 55 P 0
... ... ... ... ... ... ...
984 Sweden O F 60 P 0
985 Sweden O M 34 P 1
986 Sweden Q M 77 P 0
987 Sweden G F 87 P 0
988 Sweden O M 42 P 1

989 rows × 6 columns

def binary_encode(df, column, positive_value):
    """Return a copy of *df* with *column* recoded as 0/1 flags.

    Entries equal to *positive_value* become 1; every other entry becomes 0.
    The input frame is left unmodified.
    """
    def to_flag(entry):
        if entry == positive_value:
            return 1
        return 0

    encoded = df.copy()
    encoded[column] = encoded[column].apply(to_flag)
    return encoded

def ordinal_encode(df, column, ordering):
    """Return a copy of *df* with *column* replaced by ordinal positions.

    Each entry is mapped to the 0-based position of its first occurrence in
    *ordering* (any iterable of hashable values, e.g. a string of letters or
    a list of labels). The input frame is left unmodified.

    Raises:
        ValueError: if an entry of *column* is not an element of *ordering*.
    """
    df = df.copy()

    # Build the lookup once instead of scanning `ordering` for every row.
    # setdefault keeps the first position when `ordering` contains duplicates,
    # matching what `ordering.index` would return.
    positions = {}
    for position, value in enumerate(ordering):
        positions.setdefault(value, position)

    def encode(value):
        # Exact element match only: with a str ordering, `ordering.index`
        # would do a substring search and silently accept multi-character
        # values such as 'BC' in 'ABC'.
        try:
            return positions[value]
        except KeyError:
            raise ValueError(
                f"{value!r} in column {column!r} is not in the ordering"
            ) from None

    df[column] = df[column].apply(encode)
    return df

def onehot_encode(df, column):
    """Return a new frame with *column* expanded into indicator columns.

    One indicator column per distinct value of *column* is appended on the
    right (named after the value itself, no prefix), and the original
    *column* is then removed. The input frame is left unmodified.
    """
    indicators = pd.get_dummies(df[column])
    # Append first, drop afterwards, so the result keeps the same column order.
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=column)
# Reference ordering for the last-name initials: A -> 0, ..., Z -> 25.
alphabet_ordering = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Sex becomes 1 for 'M' and 0 otherwise; Category becomes 1 for 'P' and 0
# otherwise.
data = binary_encode(data, 'Sex', 'M')
data = binary_encode(data, 'Category', 'P')

# Lastname holds a single initial at this point; replace it with its
# position in alphabet_ordering.
data = ordinal_encode(data, 'Lastname', alphabet_ordering)

# Replace Country with one indicator column per country.
data = onehot_encode(data, 'Country')

分割とスケーリング

# Target is the Survived flag; every remaining column is a feature.
y = data['Survived']
X = data.drop(['Survived'], axis=1)

# Split BEFORE scaling: the scaler's per-column min/max must come from the
# training rows only, otherwise test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

# Fit the scaler on the training split, then apply the same transform to the
# test split (test values may fall slightly outside [0, 1]; that is expected).
scaler = sp.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

トレーニング

# Three classifiers with library-default settings, except that the MLP is
# given two hidden layers of 16 units each.
log_model = slm.LogisticRegression()
svm_model = svm.SVC()
ann_model = snn.MLPClassifier(hidden_layer_sizes=(16, 16    ))

# Fit each model on the training split.
# NOTE(review): the recorded run emitted a ConvergenceWarning for the MLP
# (200 iterations reached without convergence); consider a larger max_iter.
log_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
ann_model.fit(X_train, y_train)
/usr/local/lib/python3.6/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:571: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)





MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(16, 16), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)
from sklearn.metrics import roc_auc_score

# Accuracy of each model on the held-out test split.
log_acc = log_model.score(X_test, y_test)
svm_acc = svm_model.score(X_test, y_test)
ann_acc = ann_model.score(X_test, y_test)

# Hard 0/1 class predictions (kept for inspection).
log_preds = log_model.predict(X_test)
svm_preds = svm_model.predict(X_test)
ann_preds = ann_model.predict(X_test)

# ROC AUC must be computed from continuous scores, not hard labels: with 0/1
# predictions the ROC curve collapses to a single operating point.
# predict_proba[:, 1] is the probability of the positive class (Survived == 1).
# SVC is built without probability=True, so its decision_function is used.
log_scores = log_model.predict_proba(X_test)[:, 1]
svm_scores = svm_model.decision_function(X_test)
ann_scores = ann_model.predict_proba(X_test)[:, 1]

log_auc = roc_auc_score(y_test, log_scores)
svm_auc = roc_auc_score(y_test, svm_scores)
ann_auc = roc_auc_score(y_test, ann_scores)
import plotly.express as px

# The same labels serve as both the x positions and the bar colours.
model_names = ["Logistic Regression", "Support Vector Machine", "Neural Network"]

# Bar chart of test-set accuracy per model.
acc_fig = px.bar(
    x=model_names,
    y=[log_acc, svm_acc, ann_acc],
    color=model_names,
    labels={'x': "Model", 'y': "Accuracy"},
    title="Model Accuracy",
)
acc_fig.show()

# Bar chart of test-set ROC AUC per model.
auc_fig = px.bar(
    x=model_names,
    y=[log_auc, svm_auc, ann_auc],
    color=model_names,
    labels={'x': "Model", 'y': "ROC AUC"},
    title="Model ROC AUC",
)
auc_fig.show()

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1