tldr
KaggleのFood PreferencesをPredicting Age From Food Preferenceに沿ってやっていきます。
実行環境はGoogle Colaboratoryです。
インポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm
import tensorflow as tf
データのダウンロード
Google Driveをマウントします。
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
KaggleのAPIクライアントを初期化し、認証します。
認証情報はGoogle Drive内(/content/drive/My Drive/Colab Notebooks/Kaggle)にkaggle.jsonとして置いてあります。
import os
# Directory on the mounted Drive that holds kaggle.json.
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
# Export the directory through KAGGLE_CONFIG_DIR before the client is imported.
# NOTE(review): presumably the kaggle package reads this variable to locate
# kaggle.json -- keep this assignment ahead of the import below.
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path
from kaggle.api.kaggle_api_extended import KaggleApi
# Create the API client and authenticate it.
api = KaggleApi()
api.authenticate()
Kaggle APIを使ってデータをダウンロードします。
# Identifier of the Kaggle dataset to fetch.
dataset_id = 'vijayashreer/food-preferences'
# List the dataset's files and take the name of the first one.
dataset = api.dataset_list_files(dataset_id)
file_name = dataset.files[0].name
# Path the file will have inside the client's default download directory.
file_path = os.path.join(api.get_default_download_dir(), file_name)
# Bare expression: displays the path as the notebook cell output.
file_path
Warning: Looks like you're using an outdated API Version, please consider updating (server 1.5.10 / client 1.5.9)
'/content/Food_Preference.csv'
# Download the selected file of the dataset.
# NOTE(review): force=True presumably re-downloads even if the file already
# exists, and quiet=False presumably shows progress -- confirm against the client docs.
api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)
100%|██████████| 24.3k/24.3k [00:00<00:00, 15.1MB/s]
Downloading Food_Preference.csv to /content
True
データの読み込み
Pandasを使ってダウンロードしてきたCSVファイルを読み込みます。
# Load the downloaded CSV into a DataFrame.
data = pd.read_csv(file_path)
# Bare expression: displays the DataFrame as the notebook cell output.
data
| | Timestamp | Participant_ID | Gender | Nationality | Age | Food | Juice | Dessert |
---|---|---|---|---|---|---|---|---|
0 | 2019/05/07 2:59:13 PM GMT+8 | FPS001 | Male | Indian | 24 | Traditional food | Fresh Juice | Maybe |
1 | 2019/05/07 2:59:45 PM GMT+8 | FPS002 | Female | Indian | 22 | Western Food | Carbonated drinks | Yes |
2 | 2019/05/07 3:00:05 PM GMT+8 | FPS003 | Male | Indian | 31 | Western Food | Fresh Juice | Maybe |
3 | 2019/05/07 3:00:11 PM GMT+8 | FPS004 | Female | Indian | 25 | Traditional food | Fresh Juice | Maybe |
4 | 2019/05/07 3:02:50 PM GMT+8 | FPS005 | Male | Indian | 27 | Traditional food | Fresh Juice | Maybe |
... | ... | ... | ... | ... | ... | ... | ... | ... |
283 | 2019/05/10 9:24:00 AM GMT+8 | FPS284 | Male | Indian | 27 | Western Food | Fresh Juice | Yes |
284 | 2019/05/10 9:32:54 AM GMT+8 | FPS285 | Male | Indian | 24 | Traditional food | Fresh Juice | Yes |
285 | 2019/05/10 12:09:17 PM GMT+8 | FPS286 | Male | Indian | 25 | Traditional food | Fresh Juice | Yes |
286 | 2019/05/10 12:52:17 PM GMT+8 | FPS287 | Male | Indian | 27 | Traditional food | Fresh Juice | Yes |
287 | 2019/05/10 12:55:42 PM GMT+8 | FPS288 | Male | Indian | 27 | Traditional food | Fresh Juice | No |
288 rows × 8 columns
下準備
# Print each column's non-null count and dtype to spot missing values.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Timestamp 288 non-null object
1 Participant_ID 288 non-null object
2 Gender 284 non-null object
3 Nationality 288 non-null object
4 Age 288 non-null int64
5 Food 288 non-null object
6 Juice 288 non-null object
7 Dessert 288 non-null object
dtypes: int64(1), object(7)
memory usage: 18.1+ KB
不要な列の削除
# Timestamp and Participant_ID are not used as features, so remove them.
data = data.drop(columns=['Timestamp', 'Participant_ID'])
欠損値の処理
# Remove every row that has a missing value, then renumber the index from 0
# so that it is contiguous again.
data = data.dropna(axis=0).reset_index(drop=True)
Ageを若者と年配者のバイナリに変換します
# Bin Age into two quantile-based groups labelled 0 and 1.
age_bins = pd.qcut(data['Age'], q=2, labels=[0,1])
# Show the original ages next to their bin labels (notebook display only).
pd.concat([data['Age'], age_bins], axis=1)
| | Age | Age |
---|---|---|
0 | 24 | 0 |
1 | 22 | 0 |
2 | 31 | 1 |
3 | 25 | 0 |
4 | 27 | 0 |
... | ... | ... |
279 | 27 | 0 |
280 | 24 | 0 |
281 | 25 | 0 |
282 | 27 | 0 |
283 | 27 | 0 |
284 rows × 2 columns
# Replace the numeric Age column with its 0/1 bin label; this is the target.
data['Age'] = age_bins
エンコード
# Columns holding categorical (string) values that need encoding.
categorical_features = ['Gender', 'Nationality', 'Food', 'Juice', 'Dessert']
def get_uniques(df, columns):
    """Return the distinct values of each requested column.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names present in ``df``.

    Returns:
        Dict mapping each column name to a list of its unique values, in
        order of first appearance.
    """
    return {column: list(df[column].unique()) for column in columns}
# Display the distinct values of every categorical column.
get_uniques(data, categorical_features)
{'Dessert': ['Maybe', 'Yes', 'No'],
'Food': ['Traditional food', 'Western Food'],
'Gender': ['Male', 'Female'],
'Juice': ['Fresh Juice', 'Carbonated drinks'],
'Nationality': ['Indian',
'Pakistani ',
'Tanzanian',
'Indonesia',
'Pakistan',
'Maldivian ',
'MY',
'Malaysian',
'Malaysian ',
'Indonesian ',
'Maldivian',
'MALAYSIAN',
'Malaysia ',
'Pakistani',
'Canadian',
'Nigerian ',
'Algerian ',
'Korean ',
'Seychellois',
'Indonesain',
'Indonesian',
'Malaysia',
'Japan',
'China',
'Mauritian',
'Yemen']}
フィーチャーを特徴に従い以下の3つのタイプに分けます。
# Group the categorical columns by the encoding each one receives.
binary_features = ['Gender', 'Food', 'Juice']  # two categories -> 0/1
ordinal_features = ['Dessert']  # ordered categories -> integer rank
nominal_features = ['Nationality']  # unordered categories -> one-hot
def binary_encode(df, column, positive_label):
    """Encode a two-valued column as 0/1 integers.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to encode.
        positive_label: Value that maps to 1; every other value maps to 0.

    Returns:
        A copy of ``df`` with ``column`` replaced by its 0/1 encoding.
    """
    df = df.copy()
    # Vectorised comparison instead of a per-row lambda.
    df[column] = (df[column] == positive_label).astype(int)
    return df
def ordinal_encode(df, column, ordering):
    """Encode an ordered categorical column as integer ranks.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to encode.
        ordering: Sequence of category labels from lowest to highest; a
            label's position in this sequence becomes its encoded value.

    Returns:
        A copy of ``df`` with ``column`` replaced by the rank of each value.

    Raises:
        ValueError: If the column contains a value missing from ``ordering``.
    """
    df = df.copy()
    # Build the label -> rank table once; the first occurrence of a label
    # wins, matching what list.index would return.
    ranks = {}
    for position, label in enumerate(ordering):
        ranks.setdefault(label, position)
    unknown = [value for value in df[column].unique() if value not in ranks]
    if unknown:
        raise ValueError(
            f"column {column!r} contains values not in ordering: {unknown}"
        )
    df[column] = df[column].map(ranks)
    return df
def onehot_encode(df, column, prefix=None):
    """Replace a nominal column with one indicator column per category.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to expand.
        prefix: Optional string prepended to each indicator column name
            (passed to ``pandas.get_dummies``). The default ``None`` names
            the new columns after the raw category values.

    Returns:
        A new DataFrame holding the remaining columns of ``df`` followed by
        the indicator columns.
    """
    dummies = pd.get_dummies(df[column], prefix=prefix)
    # Drop the source column before concatenating so that an indicator that
    # happens to share its name is not removed along with it.
    return pd.concat([df.drop(column, axis=1), dummies], axis=1)
# Binary features: the given label becomes 1, any other value becomes 0.
data = binary_encode(data, 'Gender', 'Male')
data = binary_encode(data, 'Food', 'Traditional food')
data = binary_encode(data, 'Juice', 'Fresh Juice')
# Dessert is ordinal: No < Maybe < Yes is encoded as 0, 1, 2.
desert_ordering = ['No', 'Maybe', 'Yes']
data = ordinal_encode(data, 'Dessert', desert_ordering)
# Nationality was entered as free text, so the same value appears with
# trailing spaces and different casing ('Pakistani ' / 'Pakistani',
# 'MALAYSIAN' / 'Malaysian'). Normalise whitespace and case first so these
# variants do not become separate one-hot columns.
data['Nationality'] = data['Nationality'].str.strip().str.title()
data = onehot_encode(data, 'Nationality')
スケーリングとデータの分割
# Target is the binned Age; every other column is a feature.
y = data['Age']
X = data.drop('Age', axis=1)
# Split before scaling so the scaler never sees the test rows, and fix the
# seed so the split (and therefore the reported score) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)
# Fit the min-max scaler on the training rows only, then apply the same
# transformation to the test rows.
scaler = sp.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
トレーニング
# Fit a logistic regression classifier on the training split.
model = slm.LogisticRegression()
model.fit(X_train, y_train)
# Evaluate on the held-out split; the value is shown as the cell output.
model.score(X_test, y_test)
0.7209302325581395
少ないデータとしてはまずまずの精度がでました。