Data Every Day: トップパーソナリティデータセット

Kaggle

Posted at 2020-12-22

tldr

KaggleのTop Personality DatasetをPredicting Movie Preferences - Data Every Day #049に沿ってやっていきます。

実行環境はGoogle Colaboratoryです。

インポート

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm
import sklearn.svm as svm
import sklearn.neural_network as snn

import tensorflow as tf

データのダウンロード

Google Driveをマウントします。

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

KaggleのAPIクライアントを初期化し、認証します。
認証情報はGoogle Drive内（/content/drive/My Drive/Colab Notebooks/Kaggle）にkaggle.jsonとして置いてあります。

import os
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

Kaggle APIを使ってデータをダウンロードします。

dataset_id = 'arslanali4343/top-personality-dataset'
dataset = api.dataset_list_files(dataset_id)
file_name = dataset.files[0].name
file_path = os.path.join(api.get_default_download_dir(), file_name)
file_path

'/content/2018-personality-data.csv'

api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)

100%|██████████| 608k/608k [00:00<00:00, 74.4MB/s]

Downloading 2018-personality-data.csv to /content









True


api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)

データの読み込み

data = pd.read_csv(file_path)

data

	userid	openness	agreeableness	emotional_stability	conscientiousness	extraversion	assigned metric	assigned condition	movie_1	predicted_rating_1	movie_2	predicted_rating_2	movie_3	predicted_rating_3	movie_4	predicted_rating_4	movie_5	predicted_rating_5	movie_6	predicted_rating_6	movie_7	predicted_rating_7	movie_8	predicted_rating_8	movie_9	predicted_rating_9	movie_10	predicted_rating_10	movie_11	predicted_rating_11	movie_12	predicted_rating_12	is_personalized	enjoy_watching
0	8e7cebf9a234c064b75016249f2ac65e	5.0	2.0	3.0	2.5	6.5	serendipity	high	77658	4.410466	95858	4.271995	115713	4.611922	26674	4.459407	93040	4.147292	117533	4.098206	108979	4.064843	112582	4.149100	120138	4.244817	121372	4.396004	127152	4.120456	95311	4.053847	4	4
1	77c7d756a093150d4377720abeaeef76	7.0	4.0	6.0	5.5	4.0	all	default	94959	4.207280	1247	4.266540	953	4.211322	2010	4.408341	1234	4.090358	5291	4.202424	106642	4.113912	1209	4.094422	56782	4.019599	5618	3.963953	969	4.174188	1232	4.334877	2	3
2	b7e8a92987a530cc368719a0e60e26a3	4.0	3.0	4.5	2.0	2.5	serendipity	medium	110501	4.868064	77658	4.710444	101895	5.029360	1260	4.698602	5971	4.660769	98491	4.962319	926	4.706864	1204	4.645191	2288	4.823212	3307	4.676756	1172	4.649281	1212	4.744990	2	2
3	92561f21446e017dd6b68b94b23ad5b7	5.5	5.5	4.0	4.5	4.0	popularity	medium	2905	4.526371	2843	4.456451	3629	4.668444	3022	4.676067	3307	4.530360	1211	4.292660	3462	4.341634	5291	4.261166	3030	4.425689	1281	4.479921	940	4.355061	905	4.317927	3	3
4	030001ac2145a938b07e686a35a2d638	5.5	5.5	3.5	4.5	2.5	popularity	medium	2905	4.526371	2843	4.456451	3629	4.668444	3022	4.676067	3307	4.530360	1211	4.292660	3462	4.341634	5291	4.261166	3030	4.425689	1281	4.479921	940	4.355061	905	4.317927	2	3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1829	cff910b71f09b3120289ff6b461a9e03	5.5	3.5	2.5	4.0	5.5	popularity	low	108979	4.246346	33779	4.002232	96728	3.981009	77307	4.059373	111759	3.888507	7762	4.368495	26082	4.554531	89753	4.145526	6643	4.409108	115122	3.960470	7700	4.178546	67997	4.085300	3	3
1830	1ab3a4c2921d8da640854819b0f6cfce	4.0	3.5	4.5	4.0	2.5	serendipity	high	93040	4.227140	1199	4.069527	83134	4.336292	5971	4.091813	68237	4.217929	745	4.101192	4973	4.244278	47099	4.473696	5618	4.149697	903	4.116152	38061	4.155210	1197	4.045751	3	4
1831	a06386edadf3bc614dadb7044708c46c	6.0	3.0	5.5	3.5	6.0	serendipity	low	106173	3.935297	126397	4.006561	109710	3.947412	99171	4.003978	119194	4.002666	69483	4.235115	45691	4.191760	58530	3.965657	26519	3.998642	89707	4.144870	2571	3.860041	108709	3.899857	3	4
1832	bad56d9506832cd79d874a6b66b3d813	5.0	3.5	1.5	3.5	2.5	serendipity	medium	6874	4.241766	38061	4.350788	46578	4.399071	4848	4.749688	44195	4.493639	4979	4.397887	7438	4.207513	3897	4.212995	92259	4.819710	32	4.059369	3730	4.427336	3435	4.844386	4	4
1833	721ea658e148fc0f76ddd6e2b0e02422	6.5	6.5	2.5	6.5	2.0	popularity	high	140737	4.456842	110366	4.283821	89315	4.287975	112006	4.292487	113190	4.345421	26614	4.218514	50872	4.256004	4447	4.187709	8360	4.247956	8961	4.321693	597	4.278465	457	4.352895	2	2

1834 rows × 34 columns

下準備

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1834 entries, 0 to 1833
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   userid                1834 non-null   object 
 1    openness             1834 non-null   float64
 2    agreeableness        1834 non-null   float64
 3    emotional_stability  1834 non-null   float64
 4    conscientiousness    1834 non-null   float64
 5    extraversion         1834 non-null   float64
 6    assigned metric      1834 non-null   object 
 7    assigned condition   1834 non-null   object 
 8    movie_1              1834 non-null   int64  
 9    predicted_rating_1   1834 non-null   float64
 10   movie_2              1834 non-null   int64  
 11   predicted_rating_2   1834 non-null   float64
 12   movie_3              1834 non-null   int64  
 13   predicted_rating_3   1834 non-null   float64
 14   movie_4              1834 non-null   int64  
 15   predicted_rating_4   1834 non-null   float64
 16   movie_5              1834 non-null   int64  
 17   predicted_rating_5   1834 non-null   float64
 18   movie_6              1834 non-null   int64  
 19   predicted_rating_6   1834 non-null   float64
 20   movie_7              1834 non-null   int64  
 21   predicted_rating_7   1834 non-null   float64
 22   movie_8              1834 non-null   int64  
 23   predicted_rating_8   1834 non-null   float64
 24   movie_9              1834 non-null   int64  
 25   predicted_rating_9   1834 non-null   float64
 26   movie_10             1834 non-null   int64  
 27   predicted_rating_10  1834 non-null   float64
 28   movie_11             1834 non-null   int64  
 29   predicted_rating_11  1834 non-null   float64
 30   movie_12             1834 non-null   int64  
 31   predicted_rating_12  1834 non-null   float64
 32   is_personalized      1834 non-null   int64  
 33   enjoy_watching       1834 non-null   int64  
dtypes: float64(17), int64(14), object(3)
memory usage: 487.3+ KB

data.isna().sum()

userid                  0
 openness               0
 agreeableness          0
 emotional_stability    0
 conscientiousness      0
 extraversion           0
 assigned metric        0
 assigned condition     0
 movie_1                0
 predicted_rating_1     0
 movie_2                0
 predicted_rating_2     0
 movie_3                0
 predicted_rating_3     0
 movie_4                0
 predicted_rating_4     0
 movie_5                0
 predicted_rating_5     0
 movie_6                0
 predicted_rating_6     0
 movie_7                0
 predicted_rating_7     0
 movie_8                0
 predicted_rating_8     0
 movie_9                0
 predicted_rating_9     0
 movie_10               0
 predicted_rating_10    0
 movie_11               0
 predicted_rating_11    0
 movie_12               0
 predicted_rating_12    0
 is_personalized        0
 enjoy_watching         0
dtype: int64

# Drop the user identifier plus the twelve (movie_N, predicted_rating_N)
# column pairs. The CSV headers carry a leading space, so the generated
# names keep it.
data = data.drop(
    columns=['userid'] + [
        f' {field}_{n}'
        for n in range(1, 13)
        for field in ('movie', 'predicted_rating')
    ],
)

エンコード

{column: list(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{' assigned condition': [' high', ' default', ' medium', ' low'],
 ' assigned metric': [' serendipity', ' all', ' popularity', ' diversity']}

condition_ordering = [' low', ' medium', ' default', ' high']

def ordinal_encode(df, column, ordering):
    """Return a copy of ``df`` with ``column`` replaced by ordinal ranks.

    Each value in ``column`` is replaced by its position in ``ordering``.
    The input frame is not modified. A value that is absent from
    ``ordering`` raises whatever ``ordering.index`` raises
    (``ValueError`` for a list).
    """
    encoded = df.copy()
    # Series.map with a callable applies it element-wise.
    encoded[column] = encoded[column].map(ordering.index)
    return encoded

def onehot_encode(df, column, prefix):
    """Return a new frame with ``column`` expanded into indicator columns.

    The indicator columns are produced by ``pd.get_dummies`` with the given
    ``prefix`` and appended after the existing columns; the source
    ``column`` is then removed. The input frame is not modified.
    """
    indicators = pd.get_dummies(df[column], prefix=prefix)
    # Concatenate first, then drop, so the column order is
    # "remaining originals, then indicators".
    widened = pd.concat([df.copy(), indicators], axis=1)
    return widened.drop(columns=column)

data = ordinal_encode(data, ' assigned condition', condition_ordering)
data = onehot_encode(data, ' assigned metric', 'm')

分割とスケーリング

# Target is the enjoyment rating; every other remaining column is a feature.
y = data[' enjoy_watching ']
X = data.drop([' enjoy_watching '], axis=1)

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance (fitting the scaler on the full data leaks test-set
# statistics into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows.
scaler = sp.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

トレーニング

# Three classifiers compared on the same split: logistic regression,
# a support-vector classifier, and an MLP with two 16-unit hidden layers.
log_model = slm.LogisticRegression()
svm_model = svm.SVC()
ann_model = snn.MLPClassifier(hidden_layer_sizes=(16, 16))

# Fit each model on the training rows.
# NOTE(review): the recorded output shows MLPClassifier reaching its
# 200-iteration limit without converging; a larger max_iter may be needed.
log_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
ann_model.fit(X_train, y_train)

# Score each model on the held-out rows; these values are plotted below
# as "Accuracy".
log_acc = log_model.score(X_test, y_test)
svm_acc = svm_model.score(X_test, y_test)
ann_acc = ann_model.score(X_test, y_test)

/usr/local/lib/python3.6/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:571: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)

import plotly.express as px

fig = px.bar(
    x=['Logistic Regression', 'SVM', 'NN'],
    y=[log_acc, svm_acc, ann_acc],
    color=['Logistic Regression', 'SVM', 'NN'],
    labels={'x': 'Model', 'y': 'Accuracy'},
    title='Model Accuracy',
)
fig.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up