KaggleのTelco Customer ChurnをPredicting Customer Churn - Data Every Day #040に沿ってやっていきます。
実行環境はGoogle Colaboratoryです。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm
import tensorflow as tf
Google Driveをマウントします。
from google.colab import drive

# Mount Google Drive at /content/drive; the Kaggle credentials used below are
# read from a folder underneath this mount point.
drive.mount('/content/drive')
Mounted at /content/drive
認証情報はGoogle Drive内(/content/drive/My Drive/Colab Notebooks/Kaggle)に置いてあり、これを使ってKaggle APIの認証を行います。
import os

# Folder on the mounted Google Drive that holds the Kaggle API credentials.
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path

# NOTE(review): this import deliberately stays below the environment variable
# assignment, as in the original; the kaggle package appears to resolve its
# configuration directory at import time -- confirm before reordering.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
# A newly constructed client has not loaded any credentials yet; authenticate()
# reads them from KAGGLE_CONFIG_DIR so the dataset calls below are authorised.
api.authenticate()
Kaggle APIを使ってデータをダウンロードします。
# Download the first file of the Telco Customer Churn dataset.
dataset_id = 'blastchar/telco-customer-churn'
dataset = api.dataset_list_files(dataset_id)
file_name = dataset.files[0].name

# Resolve the target directory once and hand it to the download call, so the
# file is written to exactly the location that file_path points at.
download_dir = api.get_default_download_dir()
file_path = os.path.join(download_dir, file_name)
api.dataset_download_file(dataset_id, file_name, path=download_dir, force=True, quiet=False)
100%|██████████| 955k/955k [00:00<00:00, 95.4MB/s]
Downloading WA_Fn-UseC_-Telco-Customer-Churn.csv to /content
# Load the downloaded CSV file into a DataFrame.
data = pd.read_csv(file_path)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | No | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | Yes | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | No | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7043 rows × 21 columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 gender 7043 non-null object
2 SeniorCitizen 7043 non-null int64
3 Partner 7043 non-null object
4 Dependents 7043 non-null object
5 tenure 7043 non-null int64
6 PhoneService 7043 non-null object
7 MultipleLines 7043 non-null object
8 InternetService 7043 non-null object
9 OnlineSecurity 7043 non-null object
10 OnlineBackup 7043 non-null object
11 DeviceProtection 7043 non-null object
12 TechSupport 7043 non-null object
13 StreamingTV 7043 non-null object
14 StreamingMovies 7043 non-null object
15 Contract 7043 non-null object
16 PaperlessBilling 7043 non-null object
17 PaymentMethod 7043 non-null object
18 MonthlyCharges 7043 non-null float64
19 TotalCharges 7043 non-null object
20 Churn 7043 non-null object
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
# customerID is a per-row identifier, not a feature, so remove that column.
data = data.drop(columns=['customerID'])
def get_uniques(df, columns):
    """Return the distinct values of the given columns.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column labels present in ``df``.

    Returns:
        A dict mapping each column label to the list of its distinct values,
        in order of first appearance. Values are plain Python objects.
    """
    # drop_duplicates() keeps first-appearance order like unique(), and
    # Series.tolist() converts NumPy scalars to built-in Python values.
    return {column: df[column].drop_duplicates().tolist() for column in columns}
def get_categorical_columns(df):
    """Return the labels of the text-like (categorical) columns of ``df``.

    A column is selected when its dtype is ``object`` or pandas' dedicated
    string dtype, so the result does not depend on how pandas chose to store
    text columns.

    Args:
        df: DataFrame to inspect.

    Returns:
        List of column labels, in the DataFrame's column order.
    """
    # Iterating over dtypes.items() also works when column labels repeat,
    # where a label-based lookup would return a Series instead of one dtype.
    return [
        column
        for column, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
# Show the distinct values of every column that get_categorical_columns selects.
get_uniques(data, get_categorical_columns(data))
{'Churn': ['No', 'Yes'],
'Contract': ['Month-to-month', 'One year', 'Two year'],
'Dependents': ['No', 'Yes'],
'DeviceProtection': ['No', 'Yes', 'No internet service'],
'InternetService': ['DSL', 'Fiber optic', 'No'],
'MultipleLines': ['No phone service', 'No', 'Yes'],
'OnlineBackup': ['Yes', 'No', 'No internet service'],
'OnlineSecurity': ['No', 'Yes', 'No internet service'],
'PaperlessBilling': ['Yes', 'No'],
'Partner': ['Yes', 'No'],
'PaymentMethod': ['Electronic check',
'Mailed check',
'Bank transfer (automatic)',
'Credit card (automatic)'],
'PhoneService': ['No', 'Yes'],
'StreamingMovies': ['No', 'Yes', 'No internet service'],
'StreamingTV': ['No', 'Yes', 'No internet service'],
'TechSupport': ['No', 'Yes', 'No internet service'],
'TotalCharges': ['29.85',
' ',
'gender': ['Female', 'Male']}
[' ',
# TotalCharges is read as text because some rows contain a single blank.
# Turn those blanks into NaN, convert to float, then fill the gaps with the
# column mean. The built-in float and np.nan are used because the np.float and
# np.NaN aliases have been removed from recent NumPy releases.
total_charges = data['TotalCharges'].replace(' ', np.nan).astype(float)
data['TotalCharges'] = total_charges.fillna(total_charges.mean())
# Re-check the distinct values of the remaining categorical columns after the
# TotalCharges conversion.
get_uniques(data, get_categorical_columns(data))
{'Churn': ['No', 'Yes'],
'Contract': ['Month-to-month', 'One year', 'Two year'],
'Dependents': ['No', 'Yes'],
'DeviceProtection': ['No', 'Yes', 'No internet service'],
'InternetService': ['DSL', 'Fiber optic', 'No'],
'MultipleLines': ['No phone service', 'No', 'Yes'],
'OnlineBackup': ['Yes', 'No', 'No internet service'],
'OnlineSecurity': ['No', 'Yes', 'No internet service'],
'PaperlessBilling': ['Yes', 'No'],
'Partner': ['Yes', 'No'],
'PaymentMethod': ['Electronic check',
'Mailed check',
'Bank transfer (automatic)',
'Credit card (automatic)'],
'PhoneService': ['No', 'Yes'],
'StreamingMovies': ['No', 'Yes', 'No internet service'],
'StreamingTV': ['No', 'Yes', 'No internet service'],
'TechSupport': ['No', 'Yes', 'No internet service'],
'gender': ['Female', 'Male']}
# Fold the "no service" labels into a plain 'No' so that each of these columns
# is left with only the two values 'Yes' and 'No'.
data['MultipleLines'] = data['MultipleLines'].replace({'No phone service': 'No'})

internet_addon_columns = [
    'DeviceProtection',
    'OnlineBackup',
    'OnlineSecurity',
    'StreamingMovies',
    'StreamingTV',
    'TechSupport',
]
data[internet_addon_columns] = data[internet_addon_columns].replace({'No internet service': 'No'})
# Columns that hold exactly two categories.
binary_features = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'DeviceProtection',
    'OnlineBackup',
    'OnlineSecurity',
    'StreamingMovies',
    'StreamingTV',
    'TechSupport',
    'PaperlessBilling',
]

# Columns whose categories are given an explicit order below.
ordinal_features = ['InternetService', 'Contract']

# Columns whose categories have no order.
nominal_features = ['PaymentMethod']

# Prediction target.
target_column = ['Churn']

# Category orderings, lowest rank first, for the ordinal columns.
internet_ordering = ['No', 'DSL', 'Fiber optic']
contract_ordering = ['Month-to-month', 'One year', 'Two year']
def binary_encode(df, column, positive_value):
    """Encode ``column`` as 1 where it equals ``positive_value`` and 0 elsewhere.

    Args:
        df: Input DataFrame; it is not modified.
        column: Label of the column to encode.
        positive_value: The value that maps to 1. Every other value,
            including missing ones, maps to 0.

    Returns:
        A copy of ``df`` in which ``column`` holds int64 zeros and ones.
    """
    df = df.copy()
    # Vectorised comparison instead of a per-row lambda; the explicit int64
    # keeps the dtype stable even for an empty column.
    df[column] = (df[column] == positive_value).astype('int64')
    return df
def ordinal_encode(df, column, ordering):
    """Replace each value of ``column`` by its position in ``ordering``.

    Args:
        df: Input DataFrame; it is not modified.
        column: Label of the column to encode.
        ordering: Sequence of the column's categories, lowest rank first.

    Returns:
        A copy of ``df`` in which ``column`` holds int64 positions.

    Raises:
        ValueError: If the column contains a value that is not in ``ordering``.
    """
    df = df.copy()

    # Build the value -> position lookup once; setdefault keeps the first
    # position when a value is repeated, matching list.index().
    positions = {}
    for position, value in enumerate(ordering):
        positions.setdefault(value, position)

    encoded = df[column].map(positions)

    # Values missing from the lookup come back as NaN; report them explicitly
    # instead of letting NaN leak into the encoded column.
    unknown_mask = encoded.isna()
    if unknown_mask.any():
        unknown_values = list(df.loc[unknown_mask, column].unique())
        raise ValueError(
            f"column {column!r} contains values not in ordering: {unknown_values!r}"
        )

    df[column] = encoded.astype('int64')
    return df
def onehot_encode(df, column, prefix=None):
    """Replace ``column`` by one indicator column per category.

    Args:
        df: Input DataFrame; it is not modified.
        column: Label of the column to expand.
        prefix: Optional prefix for the indicator column names. With the
            default ``None`` the indicator columns are named after the
            category values themselves.

    Returns:
        A new DataFrame without ``column`` and with the uint8 indicator
        columns appended on the right.
    """
    # Pin the dtype so the indicators are numeric 0/1 regardless of the
    # pandas version's default for get_dummies.
    dummies = pd.get_dummies(df[column], prefix=prefix, dtype='uint8')
    # Drop the source column before concatenating so that an indicator whose
    # name happens to equal ``column`` is not removed together with it.
    remaining = df.drop(column, axis=1)
    return pd.concat([remaining, dummies], axis=1)
# Drive the encoding from the feature lists declared earlier instead of
# retyping the column names here.

# gender is the only binary column whose positive label is not 'Yes'.
data = binary_encode(data, 'gender', 'Male')
yes_features = [column for column in binary_features if column != 'gender']
for column in yes_features + target_column:
    data = binary_encode(data, column, 'Yes')

# Each ordinal column is encoded with its own category ordering.
orderings = {'InternetService': internet_ordering, 'Contract': contract_ordering}
for column in ordinal_features:
    data = ordinal_encode(data, column, orderings[column])

# Nominal columns are expanded into indicator columns appended on the right.
for column in nominal_features:
    data = onehot_encode(data, column)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 23 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 7043 non-null int64
1 SeniorCitizen 7043 non-null int64
2 Partner 7043 non-null int64
3 Dependents 7043 non-null int64
4 tenure 7043 non-null int64
5 PhoneService 7043 non-null int64
6 MultipleLines 7043 non-null int64
7 InternetService 7043 non-null int64
8 OnlineSecurity 7043 non-null int64
9 OnlineBackup 7043 non-null int64
10 DeviceProtection 7043 non-null int64
11 TechSupport 7043 non-null int64
12 StreamingTV 7043 non-null int64
13 StreamingMovies 7043 non-null int64
14 Contract 7043 non-null int64
15 PaperlessBilling 7043 non-null int64
16 MonthlyCharges 7043 non-null float64
17 TotalCharges 7043 non-null float64
18 Churn 7043 non-null int64
19 Bank transfer (automatic) 7043 non-null uint8
20 Credit card (automatic) 7043 non-null uint8
21 Electronic check 7043 non-null uint8
22 Mailed check 7043 non-null uint8
dtypes: float64(2), int64(17), uint8(4)
memory usage: 1.0 MB
# Separate the target from the features.
y = data['Churn']
X = data.drop('Churn', axis=1)

# Split before scaling so the test rows never influence the scaler's
# statistics; a fixed seed makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = sp.StandardScaler()
scaelr = scaler  # alias kept for code that still uses the original misspelled name
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
(7043, 22)
model = tf.keras.Sequential([
tf.keras.layers.Dense(64, activation='relu', input_shape=(22,)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid'),
history =
Model: "sequential_2"
Layer (type) Output Shape Param #
dense_6 (Dense) (None, 64) 1472
dense_7 (Dense) (None, 64) 4160
dense_8 (Dense) (None, 1) 65
Total params: 5,697
Trainable params: 5,697
Non-trainable params: 0
# Plot the training and validation loss recorded for each epoch.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
# Take the x-axis length from the recorded history so it always matches the
# curves, even if fewer epochs were recorded than requested.
epochs_range = range(1, len(train_loss) + 1)

plt.figure(figsize=(14, 10))
plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
# The labels passed to plot() are only displayed once a legend is drawn.
plt.legend()
plt.show()
# Evaluate on the held-out test split; the recorded output below lists the
# loss followed by the AUC.
model.evaluate(X_test, y_test)
67/67 [==============================] - 0s 1ms/step - loss: 0.4744 - auc: 0.8117
[0.4744356870651245, 0.8116970062255859]