KaggleのFood Preferencesデータセットを使い、「Predicting Age From Food Preference」に沿ってやっていきます。

実行環境はGoogle Colaboratoryです。


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm

import tensorflow as tf


Google Driveをマウントします。

from google.colab import drive

# Mount Google Drive so that files under /content/drive (used below for the
# Kaggle credentials) are reachable from this runtime.
drive.mount('/content/drive')
認証情報はGoogle Drive内(/content/drive/My Drive/Colab Notebooks/Kaggle)にkaggle.jsonとして置いてあります。

import os

# KAGGLE_CONFIG_DIR tells the Kaggle client which directory holds kaggle.json.
# It is set before the kaggle package is imported so the client sees it.
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
# A newly constructed client has not loaded any credentials yet; authenticate
# explicitly before issuing dataset requests.
api.authenticate()

Kaggle APIを使ってデータをダウンロードします。

dataset_id = 'vijayashreer/food-preferences'

# Take the first file listed for the dataset.
dataset = api.dataset_list_files(dataset_id)
file_name = dataset.files[0].name

# Download into an explicitly chosen directory so that the location the file
# is written to and the location it is read from cannot diverge.
download_dir = api.get_default_download_dir()
file_path = os.path.join(download_dir, file_name)
api.dataset_download_file(
    dataset_id,
    file_name,
    path=download_dir,
    force=True,
    quiet=False,
)
data = pd.read_csv(file_path)
Timestamp Participant_ID Gender Nationality Age Food Juice Dessert
0 2019/05/07 2:59:13 PM GMT+8 FPS001 Male Indian 24 Traditional food Fresh Juice Maybe
1 2019/05/07 2:59:45 PM GMT+8 FPS002 Female Indian 22 Western Food Carbonated drinks Yes
2 2019/05/07 3:00:05 PM GMT+8 FPS003 Male Indian 31 Western Food Fresh Juice Maybe
3 2019/05/07 3:00:11 PM GMT+8 FPS004 Female Indian 25 Traditional food Fresh Juice Maybe
4 2019/05/07 3:02:50 PM GMT+8 FPS005 Male Indian 27 Traditional food Fresh Juice Maybe
... ... ... ... ... ... ... ... ...
283 2019/05/10 9:24:00 AM GMT+8 FPS284 Male Indian 27 Western Food Fresh Juice Yes
284 2019/05/10 9:32:54 AM GMT+8 FPS285 Male Indian 24 Traditional food Fresh Juice Yes
285 2019/05/10 12:09:17 PM GMT+8 FPS286 Male Indian 25 Traditional food Fresh Juice Yes
286 2019/05/10 12:52:17 PM GMT+8 FPS287 Male Indian 27 Traditional food Fresh Juice Yes
287 2019/05/10 12:55:42 PM GMT+8 FPS288 Male Indian 27 Traditional food Fresh Juice No

288 rows × 8 columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Timestamp       288 non-null    object
 1   Participant_ID  288 non-null    object
 2   Gender          284 non-null    object
 3   Nationality     288 non-null    object
 4   Age             288 non-null    int64 
 5   Food            288 non-null    object
 6   Juice           288 non-null    object
 7   Dessert         288 non-null    object
dtypes: int64(1), object(7)
memory usage: 18.1+ KB


# Remove the Timestamp and Participant_ID columns.
data = data.drop(columns=['Timestamp', 'Participant_ID'])

# Discard every row that contains a missing value, then renumber the index
# from 0 so it is contiguous again.
data = data.dropna(axis=0).reset_index(drop=True)


# Split Age at its median into two quantile-based classes labelled 0 and 1.
age_bins = pd.qcut(data['Age'], q=2, labels=[0, 1])
# Show the raw ages next to the class each one was assigned to.
pd.concat((data['Age'], age_bins), axis='columns')
Age Age
0 24 0
1 22 0
2 31 1
3 25 0
4 27 0
... ... ...
279 27 0
280 24 0
281 25 0
282 27 0
283 27 0

284 rows × 2 columns

# Replace the raw ages with their 0/1 class labels; this becomes the target.
data = data.assign(Age=age_bins)


# Names of the non-numeric columns that need encoding before modelling.
categorical_features = ['Gender', 'Nationality', 'Food', 'Juice', 'Dessert']
def get_uniques(df, columns):
    """Map each requested column name to the list of its distinct values.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names present in ``df``.

    Returns:
        A dict keyed by column name; each value is a list of that column's
        distinct entries as produced by ``Series.unique``.
    """
    uniques = {}
    for name in columns:
        distinct_values = df[name].unique()
        uniques[name] = list(distinct_values)
    return uniques
# Display the distinct values found in each categorical column.
get_uniques(data, categorical_features)
{'Dessert': ['Maybe', 'Yes', 'No'],
 'Food': ['Traditional food', 'Western Food'],
 'Gender': ['Male', 'Female'],
 'Juice': ['Fresh Juice', 'Carbonated drinks'],
 'Nationality': ['Indian',
  'Pakistani ',
  'Maldivian ',
  'Malaysian ',
  'Indonesian ',
  'Malaysia ',
  'Nigerian ',
  'Algerian ',
  'Korean ',


# Group the categorical columns by the encoding each one receives.
binary_features = ['Gender', 'Food', 'Juice']
# The column is named 'Dessert' in the dataset (previously misspelled 'Desert').
ordinal_features = ['Dessert']
nominal_features = ['Nationality']
def binary_encode(df, column, positive_label):
    """Return a copy of ``df`` with ``column`` recoded as a 0/1 indicator.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to encode.
        positive_label: Value that maps to 1; every other value maps to 0.

    Returns:
        A new DataFrame whose ``column`` holds int64 zeros and ones.
    """
    df = df.copy()
    # Vectorised comparison instead of a per-row Python lambda; the explicit
    # int64 cast keeps the dtype stable even for an empty column.
    df[column] = (df[column] == positive_label).astype('int64')
    return df
def ordinal_encode(df, column, ordering):
    """Return a copy of ``df`` with ``column`` replaced by ordinal ranks.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to encode.
        ordering: Sequence of the column's values from lowest to highest;
            each value is replaced by its position in this sequence.

    Returns:
        A new DataFrame with ``column`` holding the positions.

    Raises:
        ValueError: If the column contains a value absent from ``ordering``.
    """
    encoded = df.copy()
    # ``ordering.index`` looks up the position of each value in turn.
    encoded[column] = encoded[column].map(ordering.index)
    return encoded
def onehot_encode(df, column):
    """Return a new DataFrame with ``column`` expanded into indicator columns.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column to expand.

    Returns:
        A DataFrame in which ``column`` is removed and one indicator column
        per distinct value (from ``pd.get_dummies``) is appended on the right.
    """
    indicator_columns = pd.get_dummies(df[column])
    # Append the indicators first, then drop the source column.
    widened = pd.concat([df, indicator_columns], axis=1)
    return widened.drop(columns=column)
# Binary columns: the listed label becomes 1, every other value becomes 0.
for column, positive_label in (
    ('Gender', 'Male'),
    ('Food', 'Traditional food'),
    ('Juice', 'Fresh Juice'),
):
    data = binary_encode(data, column, positive_label)

# Dessert is ordered No < Maybe < Yes and encoded as 0, 1, 2.
desert_ordering = ['No', 'Maybe', 'Yes']
data = ordinal_encode(data, 'Dessert', desert_ordering)

# Nationality has no natural order, so it is expanded into indicator columns.
# NOTE(review): the unique-value listing shows trailing spaces and what look
# like near-duplicates (e.g. 'Malaysian ' vs 'Malaysia '); each spelling gets
# its own column here - confirm that is intended.
data = onehot_encode(data, 'Nationality')


# Separate the target (binned Age) from the feature columns.
y = data['Age']
X = data.drop('Age', axis=1)

# Split BEFORE scaling so that the scaler's min/max are learned from the
# training rows only; fitting on the full dataset would leak information
# about the test rows into preprocessing.
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

scaler = sp.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep X available as a scaled array, as before, using the training-set fit.
X = scaler.transform(X)


# Fit a logistic-regression classifier with default settings on the training
# split, then evaluate it with score() on the held-out split.
model = slm.LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)



