tldr
I'll work through Kaggle's Health Insurance Cross Sell Prediction, following the video Health Insurance Interest Prediction - Data Every Day #038.
Everything runs on Google Colaboratory.
Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm
import tensorflow as tf
Downloading the Data
Mount Google Drive.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Initialize and authenticate the Kaggle API client. The credentials are stored in Google Drive as kaggle.json, under /content/drive/My Drive/Colab Notebooks/Kaggle.
import os
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
Download the data using the Kaggle API.
dataset_id = 'anmolkumar/health-insurance-cross-sell-prediction'
dataset = api.dataset_list_files(dataset_id)
file_name_test = dataset.files[0].name
file_name_train = dataset.files[1].name
file_path_test = os.path.join(api.get_default_download_dir(), file_name_test)
file_path_train = os.path.join(api.get_default_download_dir(), file_name_train)
api.dataset_download_file(dataset_id, file_name_test, force=True, quiet=False)
api.dataset_download_file(dataset_id, file_name_train, force=True, quiet=False)
Downloading test.csv.zip to /content
100%|██████████| 1.54M/1.54M [00:00<00:00, 129MB/s]
Downloading train.csv.zip to /content
100%|██████████| 4.66M/4.66M [00:00<00:00, 33.8MB/s]
True
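As an aside (my addition, not in the original): if the per-file handles are not needed, the Kaggle client can also fetch and extract the whole dataset in one call. A minimal sketch; the /content path is an assumption about Colab's default working directory.
# Alternative: download the entire dataset archive and extract it in place.
api.dataset_download_files(dataset_id, path='/content', unzip=True, quiet=False)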
Loading the Data
Read the downloaded CSV files with Pandas.
train_df = pd.read_csv(file_path_train+'.zip')
test_df = pd.read_csv(file_path_test+'.zip')
train_df
| | id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage | Response |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Male | 44 | 1 | 28.0 | 0 | > 2 Years | Yes | 40454.0 | 26.0 | 217 | 1 |
| 1 | 2 | Male | 76 | 1 | 3.0 | 0 | 1-2 Year | No | 33536.0 | 26.0 | 183 | 0 |
| 2 | 3 | Male | 47 | 1 | 28.0 | 0 | > 2 Years | Yes | 38294.0 | 26.0 | 27 | 1 |
| 3 | 4 | Male | 21 | 1 | 11.0 | 1 | < 1 Year | No | 28619.0 | 152.0 | 203 | 0 |
| 4 | 5 | Female | 29 | 1 | 41.0 | 1 | < 1 Year | No | 27496.0 | 152.0 | 39 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 381104 | 381105 | Male | 74 | 1 | 26.0 | 1 | 1-2 Year | No | 30170.0 | 26.0 | 88 | 0 |
| 381105 | 381106 | Male | 30 | 1 | 37.0 | 1 | < 1 Year | No | 40016.0 | 152.0 | 131 | 0 |
| 381106 | 381107 | Male | 21 | 1 | 30.0 | 1 | < 1 Year | No | 35118.0 | 160.0 | 161 | 0 |
| 381107 | 381108 | Female | 68 | 1 | 14.0 | 0 | > 2 Years | Yes | 44617.0 | 124.0 | 74 | 0 |
| 381108 | 381109 | Male | 46 | 1 | 29.0 | 0 | 1-2 Year | No | 41777.0 | 26.0 | 237 | 0 |
381109 rows × 12 columns
test_df
| | id | Gender | Age | Driving_License | Region_Code | Previously_Insured | Vehicle_Age | Vehicle_Damage | Annual_Premium | Policy_Sales_Channel | Vintage |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 381110 | Male | 25 | 1 | 11.0 | 1 | < 1 Year | No | 35786.0 | 152.0 | 53 |
| 1 | 381111 | Male | 40 | 1 | 28.0 | 0 | 1-2 Year | Yes | 33762.0 | 7.0 | 111 |
| 2 | 381112 | Male | 47 | 1 | 28.0 | 0 | 1-2 Year | Yes | 40050.0 | 124.0 | 199 |
| 3 | 381113 | Male | 24 | 1 | 27.0 | 1 | < 1 Year | Yes | 37356.0 | 152.0 | 187 |
| 4 | 381114 | Male | 27 | 1 | 28.0 | 1 | < 1 Year | No | 59097.0 | 152.0 | 297 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 127032 | 508142 | Female | 26 | 1 | 37.0 | 1 | < 1 Year | No | 30867.0 | 152.0 | 56 |
| 127033 | 508143 | Female | 38 | 1 | 28.0 | 0 | 1-2 Year | Yes | 28700.0 | 122.0 | 165 |
| 127034 | 508144 | Male | 21 | 1 | 46.0 | 1 | < 1 Year | No | 29802.0 | 152.0 | 74 |
| 127035 | 508145 | Male | 71 | 1 | 28.0 | 1 | 1-2 Year | No | 62875.0 | 26.0 | 265 |
| 127036 | 508146 | Male | 41 | 1 | 29.0 | 1 | 1-2 Year | No | 27927.0 | 124.0 | 231 |
127037 rows × 11 columns
Preparation
Encoding
def get_uniques(df, columns):
return {column: list(df[column].unique()) for column in columns}
categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
get_uniques(train_df, categorical_features)
{'Gender': ['Male', 'Female'],
'Vehicle_Age': ['> 2 Years', '1-2 Year', '< 1 Year'],
'Vehicle_Damage': ['Yes', 'No']}
binary_features = ['Gender', 'Vehicle_Damage']
ordinal_features = ['Vehicle_Age']
def binary_encode(df, column, positive_label):
df = df.copy()
df[column] = df[column].apply(lambda x: 1 if x == positive_label else 0)
return df
train_df = binary_encode(train_df, 'Vehicle_Damage', 'Yes')
test_df = binary_encode(test_df, 'Vehicle_Damage', 'Yes')
train_df = binary_encode(train_df, 'Gender', 'Male')
test_df = binary_encode(test_df, 'Gender', 'Male')
def ordinal_encode(df, column, ordering):
df = df.copy()
df[column] = df[column].apply(lambda x: ordering.index(x))
return df
age_ordering = ['< 1 Year', '1-2 Year', '> 2 Years']
train_df = ordinal_encode(train_df, 'Vehicle_Age', age_ordering)
test_df = ordinal_encode(test_df, 'Vehicle_Age', age_ordering)
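As a quick sanity check (my addition), the encoded columns should now contain only small integer codes:
# The encoders above should have mapped every category to an integer.
print(sorted(train_df['Vehicle_Age'].unique()))     # expected: [0, 1, 2]
print(sorted(train_df['Vehicle_Damage'].unique()))  # expected: [0, 1]
print(sorted(train_df['Gender'].unique()))          # expected: [0, 1]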
Splitting the Data into X and y
test_ids = test_df['id'].tolist()
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)
y = train_df['Response']
X = train_df.drop('Response', axis=1)
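Before scaling, it is worth glancing at the class balance of the target (my addition); a heavily skewed Response is why AUC, rather than accuracy, is used as the metric later on.
# Show the fraction of each Response class.
print(y.value_counts(normalize=True))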
Scaling
scaler = sp.MinMaxScaler()
X = scaler.fit_transform(X)
test_df = scaler.transform(test_df)  # reuse the scaler fitted on the training features; refitting on test data would apply a different scale
Splitting into Training and Test Data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
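Since the target is likely imbalanced, a stratified split with a fixed seed is a safer variant; stratify and random_state here are my additions, not part of the original notebook.
# Keep the positive rate equal across both splits and make the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=0)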
Training
model = tf.keras.Sequential([
tf.keras.layers.Dense(64, activation='relu', input_shape=(10,)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(
optimizer='adam',
loss='binary_crossentropy',
metrics=[tf.keras.metrics.AUC(name='auc')],
)
batch_size=64
epochs=25
history = model.fit(
X_train,
y_train,
validation_split=0.2,
batch_size=batch_size,
epochs=epochs,
callbacks=[tf.keras.callbacks.ReduceLROnPlateau()],
verbose=0,
)
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 64) 704
_________________________________________________________________
dense_1 (Dense) (None, 64) 4160
_________________________________________________________________
dense_2 (Dense) (None, 1) 65
=================================================================
Total params: 4,929
Trainable params: 4,929
Non-trainable params: 0
_________________________________________________________________
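ReduceLROnPlateau is used with its defaults above. Written out explicitly (values as documented by Keras), the callback is equivalent to:
# Keras defaults: watch val_loss and multiply the learning rate by 0.1
# after 10 epochs without improvement.
lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=10, min_lr=0.0)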
Results
plt.figure(figsize=(14, 10))
epochs_range = range(1, epochs+1)
train_loss = history.history['loss']
val_loss = history.history['val_loss']
plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
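Because the model was compiled with an AUC metric named 'auc', the same History object also holds 'auc' and 'val_auc', which can be plotted the same way (my addition):
# Plot training and validation AUC from the same History object.
plt.figure(figsize=(14, 10))
plt.plot(epochs_range, history.history['auc'], label='Training AUC')
plt.plot(epochs_range, history.history['val_auc'], label='Validation AUC')
plt.title('Training and Validation AUC')
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.legend()
plt.show()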
model.evaluate(X_test, y_test)
3573/3573 [==============================] - 4s 1ms/step - loss: 0.2692 - auc: 0.8494
[0.2692367732524872, 0.8494384288787842]
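As a cross-check (my addition), the same AUC can be recomputed from the raw probabilities with scikit-learn:
from sklearn.metrics import roc_auc_score

# roc_auc_score expects probabilities, not thresholded labels.
y_prob = model.predict(X_test).ravel()
print(roc_auc_score(y_test, y_prob))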
Prediction
preds = model.predict(test_df)
preds = (preds >= 0.5).astype(int).ravel().tolist()  # threshold at 0.5; np.int was removed from NumPy, so use plain int
submission = pd.concat([pd.Series(test_ids), pd.Series(preds)], axis=1)
submission.columns = ['id', 'Response']
submission
| | id | Response |
|---|---|---|
| 0 | 381110 | 0 |
| 1 | 381111 | 0 |
| 2 | 381112 | 0 |
| 3 | 381113 | 0 |
| 4 | 381114 | 0 |
| ... | ... | ... |
| 127032 | 508142 | 0 |
| 127033 | 508143 | 0 |
| 127034 | 508144 | 0 |
| 127035 | 508145 | 0 |
| 127036 | 508146 | 0 |
127037 rows × 2 columns
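Every visible row predicts 0, which is plausible when thresholding an imbalanced target at 0.5. Finally, the submission can be written out as a CSV (the file name is my assumption):
# Write the submission without the pandas index column.
submission.to_csv('submission.csv', index=False)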