tldr
KaggleのTop Personality Datasetを、「Predicting Movie Preferences - Data Every Day #049」に沿って分析していきます。
実行環境はGoogle Colaboratoryです。
インポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm
import sklearn.svm as svm
import sklearn.neural_network as snn
import tensorflow as tf
データのダウンロード
Google Driveをマウントします。
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
KaggleのAPIクライアントを初期化し、認証します。
認証情報はGoogle Drive内(/content/drive/My Drive/Colab Notebooks/Kaggle)にkaggle.jsonとして置いてあります。
import os
# Directory on the mounted Google Drive where the kaggle.json credentials are kept.
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
# Exported before the Kaggle client is imported below.
# NOTE(review): presumably the client resolves KAGGLE_CONFIG_DIR at import time —
# confirm before moving the import to the top of the notebook.
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path
from kaggle.api.kaggle_api_extended import KaggleApi
# Create the client and authenticate it.
api = KaggleApi()
api.authenticate()
Kaggle APIを使ってデータをダウンロードします。
# Identifier of the Kaggle dataset to fetch.
dataset_id = 'arslanali4343/top-personality-dataset'
dataset = api.dataset_list_files(dataset_id)
# Only the first listed file is used.
file_name = dataset.files[0].name
# Path of that file inside the client's default download directory.
file_path = os.path.join(api.get_default_download_dir(), file_name)
# Bare expression: displays the resolved path as the notebook cell output.
file_path
'/content/2018-personality-data.csv'
api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)
100%|██████████| 608k/608k [00:00<00:00, 74.4MB/s]
Downloading 2018-personality-data.csv to /content
True
# NOTE(review): identical to the download call issued earlier in this notebook; with
# force=True this looks like it fetches the same file a second time — confirm and
# drop if unintended.
api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)
データの読み込み
data = pd.read_csv(file_path)
data
userid | openness | agreeableness | emotional_stability | conscientiousness | extraversion | assigned metric | assigned condition | movie_1 | predicted_rating_1 | movie_2 | predicted_rating_2 | movie_3 | predicted_rating_3 | movie_4 | predicted_rating_4 | movie_5 | predicted_rating_5 | movie_6 | predicted_rating_6 | movie_7 | predicted_rating_7 | movie_8 | predicted_rating_8 | movie_9 | predicted_rating_9 | movie_10 | predicted_rating_10 | movie_11 | predicted_rating_11 | movie_12 | predicted_rating_12 | is_personalized | enjoy_watching | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8e7cebf9a234c064b75016249f2ac65e | 5.0 | 2.0 | 3.0 | 2.5 | 6.5 | serendipity | high | 77658 | 4.410466 | 95858 | 4.271995 | 115713 | 4.611922 | 26674 | 4.459407 | 93040 | 4.147292 | 117533 | 4.098206 | 108979 | 4.064843 | 112582 | 4.149100 | 120138 | 4.244817 | 121372 | 4.396004 | 127152 | 4.120456 | 95311 | 4.053847 | 4 | 4 |
1 | 77c7d756a093150d4377720abeaeef76 | 7.0 | 4.0 | 6.0 | 5.5 | 4.0 | all | default | 94959 | 4.207280 | 1247 | 4.266540 | 953 | 4.211322 | 2010 | 4.408341 | 1234 | 4.090358 | 5291 | 4.202424 | 106642 | 4.113912 | 1209 | 4.094422 | 56782 | 4.019599 | 5618 | 3.963953 | 969 | 4.174188 | 1232 | 4.334877 | 2 | 3 |
2 | b7e8a92987a530cc368719a0e60e26a3 | 4.0 | 3.0 | 4.5 | 2.0 | 2.5 | serendipity | medium | 110501 | 4.868064 | 77658 | 4.710444 | 101895 | 5.029360 | 1260 | 4.698602 | 5971 | 4.660769 | 98491 | 4.962319 | 926 | 4.706864 | 1204 | 4.645191 | 2288 | 4.823212 | 3307 | 4.676756 | 1172 | 4.649281 | 1212 | 4.744990 | 2 | 2 |
3 | 92561f21446e017dd6b68b94b23ad5b7 | 5.5 | 5.5 | 4.0 | 4.5 | 4.0 | popularity | medium | 2905 | 4.526371 | 2843 | 4.456451 | 3629 | 4.668444 | 3022 | 4.676067 | 3307 | 4.530360 | 1211 | 4.292660 | 3462 | 4.341634 | 5291 | 4.261166 | 3030 | 4.425689 | 1281 | 4.479921 | 940 | 4.355061 | 905 | 4.317927 | 3 | 3 |
4 | 030001ac2145a938b07e686a35a2d638 | 5.5 | 5.5 | 3.5 | 4.5 | 2.5 | popularity | medium | 2905 | 4.526371 | 2843 | 4.456451 | 3629 | 4.668444 | 3022 | 4.676067 | 3307 | 4.530360 | 1211 | 4.292660 | 3462 | 4.341634 | 5291 | 4.261166 | 3030 | 4.425689 | 1281 | 4.479921 | 940 | 4.355061 | 905 | 4.317927 | 2 | 3 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1829 | cff910b71f09b3120289ff6b461a9e03 | 5.5 | 3.5 | 2.5 | 4.0 | 5.5 | popularity | low | 108979 | 4.246346 | 33779 | 4.002232 | 96728 | 3.981009 | 77307 | 4.059373 | 111759 | 3.888507 | 7762 | 4.368495 | 26082 | 4.554531 | 89753 | 4.145526 | 6643 | 4.409108 | 115122 | 3.960470 | 7700 | 4.178546 | 67997 | 4.085300 | 3 | 3 |
1830 | 1ab3a4c2921d8da640854819b0f6cfce | 4.0 | 3.5 | 4.5 | 4.0 | 2.5 | serendipity | high | 93040 | 4.227140 | 1199 | 4.069527 | 83134 | 4.336292 | 5971 | 4.091813 | 68237 | 4.217929 | 745 | 4.101192 | 4973 | 4.244278 | 47099 | 4.473696 | 5618 | 4.149697 | 903 | 4.116152 | 38061 | 4.155210 | 1197 | 4.045751 | 3 | 4 |
1831 | a06386edadf3bc614dadb7044708c46c | 6.0 | 3.0 | 5.5 | 3.5 | 6.0 | serendipity | low | 106173 | 3.935297 | 126397 | 4.006561 | 109710 | 3.947412 | 99171 | 4.003978 | 119194 | 4.002666 | 69483 | 4.235115 | 45691 | 4.191760 | 58530 | 3.965657 | 26519 | 3.998642 | 89707 | 4.144870 | 2571 | 3.860041 | 108709 | 3.899857 | 3 | 4 |
1832 | bad56d9506832cd79d874a6b66b3d813 | 5.0 | 3.5 | 1.5 | 3.5 | 2.5 | serendipity | medium | 6874 | 4.241766 | 38061 | 4.350788 | 46578 | 4.399071 | 4848 | 4.749688 | 44195 | 4.493639 | 4979 | 4.397887 | 7438 | 4.207513 | 3897 | 4.212995 | 92259 | 4.819710 | 32 | 4.059369 | 3730 | 4.427336 | 3435 | 4.844386 | 4 | 4 |
1833 | 721ea658e148fc0f76ddd6e2b0e02422 | 6.5 | 6.5 | 2.5 | 6.5 | 2.0 | popularity | high | 140737 | 4.456842 | 110366 | 4.283821 | 89315 | 4.287975 | 112006 | 4.292487 | 113190 | 4.345421 | 26614 | 4.218514 | 50872 | 4.256004 | 4447 | 4.187709 | 8360 | 4.247956 | 8961 | 4.321693 | 597 | 4.278465 | 457 | 4.352895 | 2 | 2 |
1834 rows × 34 columns
下準備
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1834 entries, 0 to 1833
Data columns (total 34 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 userid 1834 non-null object
1 openness 1834 non-null float64
2 agreeableness 1834 non-null float64
3 emotional_stability 1834 non-null float64
4 conscientiousness 1834 non-null float64
5 extraversion 1834 non-null float64
6 assigned metric 1834 non-null object
7 assigned condition 1834 non-null object
8 movie_1 1834 non-null int64
9 predicted_rating_1 1834 non-null float64
10 movie_2 1834 non-null int64
11 predicted_rating_2 1834 non-null float64
12 movie_3 1834 non-null int64
13 predicted_rating_3 1834 non-null float64
14 movie_4 1834 non-null int64
15 predicted_rating_4 1834 non-null float64
16 movie_5 1834 non-null int64
17 predicted_rating_5 1834 non-null float64
18 movie_6 1834 non-null int64
19 predicted_rating_6 1834 non-null float64
20 movie_7 1834 non-null int64
21 predicted_rating_7 1834 non-null float64
22 movie_8 1834 non-null int64
23 predicted_rating_8 1834 non-null float64
24 movie_9 1834 non-null int64
25 predicted_rating_9 1834 non-null float64
26 movie_10 1834 non-null int64
27 predicted_rating_10 1834 non-null float64
28 movie_11 1834 non-null int64
29 predicted_rating_11 1834 non-null float64
30 movie_12 1834 non-null int64
31 predicted_rating_12 1834 non-null float64
32 is_personalized 1834 non-null int64
33 enjoy_watching 1834 non-null int64
dtypes: float64(17), int64(14), object(3)
memory usage: 487.3+ KB
data.isna().sum()
userid 0
openness 0
agreeableness 0
emotional_stability 0
conscientiousness 0
extraversion 0
assigned metric 0
assigned condition 0
movie_1 0
predicted_rating_1 0
movie_2 0
predicted_rating_2 0
movie_3 0
predicted_rating_3 0
movie_4 0
predicted_rating_4 0
movie_5 0
predicted_rating_5 0
movie_6 0
predicted_rating_6 0
movie_7 0
predicted_rating_7 0
movie_8 0
predicted_rating_8 0
movie_9 0
predicted_rating_9 0
movie_10 0
predicted_rating_10 0
movie_11 0
predicted_rating_11 0
movie_12 0
predicted_rating_12 0
is_personalized 0
enjoy_watching 0
dtype: int64
# Remove the user identifier and the twelve (movie, predicted rating) column
# pairs. Apart from 'userid', the column names in this CSV start with a space.
columns_to_drop = ['userid']
for slot in range(1, 13):
    columns_to_drop.append(f' movie_{slot}')
    columns_to_drop.append(f' predicted_rating_{slot}')
data = data.drop(columns_to_drop, axis=1)
エンコード
{column: list(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}
{' assigned condition': [' high', ' default', ' medium', ' low'],
' assigned metric': [' serendipity', ' all', ' popularity', ' diversity']}
condition_ordering = [' low', ' medium', ' default', ' high']
def ordinal_encode(df, column, ordering):
    """Replace the values of one column with their position in ``ordering``.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the column to encode.
        ordering: Iterable of category values from lowest to highest. A
            value's code is the index of its first occurrence.

    Returns:
        A copy of ``df`` in which ``column`` holds the integer codes.

    Raises:
        ValueError: If ``column`` contains a value that is not in ``ordering``.
    """
    df = df.copy()
    # Materialise once so that tuples and other iterables are accepted too.
    levels = list(ordering)
    try:
        df[column] = df[column].apply(levels.index)
    except ValueError as err:
        # list.index only reports "x is not in list"; add the column and the
        # expected categories so that typos (e.g. a missing leading space) are
        # easy to spot.
        raise ValueError(
            f"column {column!r} has a value outside ordering {levels!r}: {err}"
        ) from err
    return df
def onehot_encode(df, column, prefix):
    """Swap a categorical column for one indicator column per category.

    The indicator columns are named with ``prefix`` and appended after the
    remaining columns. The input frame is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=prefix)
    # concat returns a new frame, so the caller's DataFrame is not modified.
    widened = pd.concat([df, indicator_columns], axis=1)
    return widened.drop(column, axis=1)
data = ordinal_encode(data, ' assigned condition', condition_ordering)
data = onehot_encode(data, ' assigned metric', 'm')
分割とスケーリング
# Target is the ' enjoy_watching ' rating; every other remaining column is a feature.
y = data[' enjoy_watching ']
X = data.drop([' enjoy_watching '], axis=1)
# Split BEFORE scaling so the scaler never sees the test rows; fitting it on
# the full dataset would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
scaler = sp.StandardScaler()
# Learn mean and variance from the training rows only, then reuse them for the test rows.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
トレーニング
# Three classifiers to compare, all with default settings except the
# two 16-unit hidden layers of the MLP.
log_model = slm.LogisticRegression()
svm_model = svm.SVC()
ann_model = snn.MLPClassifier(hidden_layer_sizes=(16, 16))

# Fit every model on the training split first...
for estimator in (log_model, svm_model, ann_model):
    estimator.fit(X_train, y_train)

# ...then record the score of each one on the held-out split.
log_acc, svm_acc, ann_acc = [
    estimator.score(X_test, y_test)
    for estimator in (log_model, svm_model, ann_model)
]
/usr/local/lib/python3.6/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:571: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
% self.max_iter, ConvergenceWarning)
import plotly.express as px

# One bar per model; the same labels drive both the x axis and the bar colours.
model_names = ['Logistic Regression', 'SVM', 'NN']
model_scores = [log_acc, svm_acc, ann_acc]
fig = px.bar(
    x=model_names,
    y=model_scores,
    color=model_names,
    labels={'x': 'Model', 'y': 'Accuracy'},
    title='Model Accuracy',
)
fig.show()