LoginSignup
1

More than 3 years have passed since last update.

Data Every Day: 学校における生徒の学力

Posted at

tldr

KggleのStudents' Academic Performance DatasetPredicting Performance in School - Data Every Day #046に沿ってやっていきます。

実行環境はGoogle Colaboratorです。

インポート

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.preprocessing as sp
from sklearn.model_selection import train_test_split
import sklearn.linear_model as slm

import tensorflow as tf

データのダウンロード

Google Driveをマウントします。

from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

KaggleのAPIクライアントを初期化し、認証します。
認証情報はGoogle Drive内(/content/drive/My Drive/Colab Notebooks/Kaggle)にkaggle.jsonとして置いてあります。

import os
kaggle_path = "/content/drive/My Drive/Colab Notebooks/Kaggle"
os.environ['KAGGLE_CONFIG_DIR'] = kaggle_path

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate() 

Kaggle APIを使ってデータをダウンロードします。

dataset_id = 'aljarah/xAPI-Edu-Data'
dataset = api.dataset_list_files(dataset_id)
file_name = dataset.files[0].name
file_path = os.path.join(api.get_default_download_dir(), file_name)
file_path
'/content/xAPI-Edu-Data.csv'
api.dataset_download_file(dataset_id, file_name, force=True, quiet=False)
100%|██████████| 37.1k/37.1k [00:00<00:00, 19.1MB/s]

Downloading xAPI-Edu-Data.csv to /content









True

データの読み込み

data = pd.read_csv(file_path)
data
gender NationalITy PlaceofBirth StageID GradeID SectionID Topic Semester Relation raisedhands VisITedResources AnnouncementsView Discussion ParentAnsweringSurvey ParentschoolSatisfaction StudentAbsenceDays Class
0 M KW KuwaIT lowerlevel G-04 A IT F Father 15 16 2 20 Yes Good Under-7 M
1 M KW KuwaIT lowerlevel G-04 A IT F Father 20 20 3 25 Yes Good Under-7 M
2 M KW KuwaIT lowerlevel G-04 A IT F Father 10 7 0 30 No Bad Above-7 L
3 M KW KuwaIT lowerlevel G-04 A IT F Father 30 25 5 35 No Bad Above-7 L
4 M KW KuwaIT lowerlevel G-04 A IT F Father 40 50 12 50 No Bad Above-7 M
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
475 F Jordan Jordan MiddleSchool G-08 A Chemistry S Father 5 4 5 8 No Bad Above-7 L
476 F Jordan Jordan MiddleSchool G-08 A Geology F Father 50 77 14 28 No Bad Under-7 M
477 F Jordan Jordan MiddleSchool G-08 A Geology S Father 55 74 25 29 No Bad Under-7 M
478 F Jordan Jordan MiddleSchool G-08 A History F Father 30 17 14 57 No Bad Above-7 L
479 F Jordan Jordan MiddleSchool G-08 A History S Father 35 14 23 62 No Bad Above-7 L

480 rows × 17 columns

下準備

data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    480 non-null    object
 1   NationalITy               480 non-null    object
 2   PlaceofBirth              480 non-null    object
 3   StageID                   480 non-null    object
 4   GradeID                   480 non-null    object
 5   SectionID                 480 non-null    object
 6   Topic                     480 non-null    object
 7   Semester                  480 non-null    object
 8   Relation                  480 non-null    object
 9   raisedhands               480 non-null    int64 
 10  VisITedResources          480 non-null    int64 
 11  AnnouncementsView         480 non-null    int64 
 12  Discussion                480 non-null    int64 
 13  ParentAnsweringSurvey     480 non-null    object
 14  ParentschoolSatisfaction  480 non-null    object
 15  StudentAbsenceDays        480 non-null    object
 16  Class                     480 non-null    object
dtypes: int64(4), object(13)
memory usage: 63.9+ KB
def get_uniques(df, columns):
    return {column: list(df[column].unique()) for column in columns}

def get_categorical_columns(df):
    return [column for column in df.columns if df.dtypes[column] == 'object']
get_uniques(data, get_categorical_columns(data))
{'Class': ['M', 'L', 'H'],
 'GradeID': ['G-04',
  'G-07',
  'G-08',
  'G-06',
  'G-05',
  'G-09',
  'G-12',
  'G-11',
  'G-10',
  'G-02'],
 'NationalITy': ['KW',
  'lebanon',
  'Egypt',
  'SaudiArabia',
  'USA',
  'Jordan',
  'venzuela',
  'Iran',
  'Tunis',
  'Morocco',
  'Syria',
  'Palestine',
  'Iraq',
  'Lybia'],
 'ParentAnsweringSurvey': ['Yes', 'No'],
 'ParentschoolSatisfaction': ['Good', 'Bad'],
 'PlaceofBirth': ['KuwaIT',
  'lebanon',
  'Egypt',
  'SaudiArabia',
  'USA',
  'Jordan',
  'venzuela',
  'Iran',
  'Tunis',
  'Morocco',
  'Syria',
  'Iraq',
  'Palestine',
  'Lybia'],
 'Relation': ['Father', 'Mum'],
 'SectionID': ['A', 'B', 'C'],
 'Semester': ['F', 'S'],
 'StageID': ['lowerlevel', 'MiddleSchool', 'HighSchool'],
 'StudentAbsenceDays': ['Under-7', 'Above-7'],
 'Topic': ['IT',
  'Math',
  'Arabic',
  'Science',
  'English',
  'Quran',
  'Spanish',
  'French',
  'History',
  'Biology',
  'Chemistry',
  'Geology'],
 'gender': ['M', 'F']}
binary_feayures = ['gender', 'Semester', 'Relation', 'ParentAnsweringSurvey', 'ParentschoolSatisfaction', 'StudentAbsenceDays']
ordianl_features = ['StageID', 'GradeID']
nominal_features = ['NationalITy', 'PlaceofBirth', 'SectionID', 'Topic']

target_column = 'Class'
binary_positive = ['M', 'S', 'Father', 'Yes', 'Good', 'Above-7']
stage_ordering = ['lowerlevel', 'MiddleSchool', 'HighSchool']
grade_ordering = [
    'G-02',
    'G-04',
    'G-05',
    'G-06',
    'G-07',
    'G-08',
    'G-09',
    'G-10',
    'G-11',
    'G-12',
]

nominal_prefixes = ['N', 'B', 'S', 'T']
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df
for feature, positive_value in zip(binary_feayures, binary_positive):
    data = binary_encode(data, feature, positive_value)
data = ordinal_encode(data, 'StageID', stage_ordering)
data = ordinal_encode(data, 'GradeID', grade_ordering)
for feature, prefix in zip(nominal_features, nominal_prefixes):
    data = onehot_encode(data, feature, prefix)
target_ordering = ['L', 'M', 'H']
data = ordinal_encode(data, target_column, target_ordering)

分割とスケーリング

y = data[target_column]
X = data.drop([target_column], axis=1)
scaler = sp.StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

トレーニング

X.shape
(480, 55)
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(55,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.summary()

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

batch_size = 64
epochs = 100

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0,
)
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_12 (Dense)             (None, 64)                3584      
_________________________________________________________________
dense_13 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 195       
=================================================================
Total params: 7,939
Trainable params: 7,939
Non-trainable params: 0
_________________________________________________________________
plt.figure(figsize=(14, 10))

epochs_range = range(1, epochs+1)
train_loss = history.history['loss']
val_loss = history.history['val_loss']

plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')

plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

Students'_Academic_Performance_Dataset_31_0.png

np.argmin(val_loss)
25

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1