こちらと同じことを、Titanic のデータで行いました。
scikit-learn の使い方
次のページを参考にしました。
KaggleのTitanicでモデルを選別する(kaggle④)
結果が分かっているトレーニングデータ [train.csv] を、学習用データとテスト用データに分割して評価しています。
フォルダー構造
$ tree
.
├── data_prepare.py
├── logistic01.py
├── svc01.py
└── train.csv
データを用意する関数
data_prepare.py
#
# data_prepare.py
#
# Sep/04/2020
#
# --------------------------------------------------------------------------
import sys
import numpy
import pandas
#
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# --------------------------------------------------------------------------
def data_prepare_proc(file_csv, *, test_size=0.3, random_state=1):
    """Load a CSV and return a shuffled train/test split of three features.

    The columns 'Survived', 'Pclass', 'Sex' and 'Fare' are read from
    *file_csv*. 'Sex' is label-encoded, 'Pclass' and 'Fare' are
    standardized, and 'Survived' is used as the target.

    Args:
        file_csv: Path (or file-like object) accepted by pandas.read_csv.
        test_size: Passed to train_test_split; defaults to 0.3.
        random_state: Seed passed to train_test_split; defaults to 1.

    Returns:
        A tuple (x_train, x_test, y_train, y_test). The x values are the
        feature frames produced by train_test_split; the y values are
        flattened to 1-D with numpy.ravel.
    """
    df = pandas.read_csv(file_csv, header=0)

    # Take an explicit copy of the column subset: assigning into a bare
    # selection makes pandas emit SettingWithCopyWarning.
    df = df[['Survived', 'Pclass', 'Sex', 'Fare']].copy()

    # Replace the 'Sex' labels with integer codes.
    df['Sex'] = LabelEncoder().fit_transform(df['Sex'].values)

    # Standardize the numeric columns and write them back by position, so
    # the result does not depend on the frame's index labels.
    # NOTE: the scaler is fitted on all rows before the split below, so the
    # test rows contribute to the scaling statistics. This is kept as-is to
    # preserve the existing results.
    scaled = numpy.asarray(
        StandardScaler().fit_transform(df[['Pclass', 'Fare']])
    )
    df['Pclass'] = scaled[:, 0]
    df['Fare'] = scaled[:, 1]

    x = df.drop(columns='Survived')
    y = df[['Survived']]
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )

    # Estimators expect a 1-D target, so flatten the single-column frames.
    y_train = numpy.ravel(y_train)
    y_test = numpy.ravel(y_test)

    return x_train, x_test, y_train, y_test
# --------------------------------------------------------------------------
SVM
svc01.py
#! /usr/bin/python
#
# svc01.py
#
# Sep/04/2020
#
# --------------------------------------------------------------------------
import sys
#
from sklearn.svm import SVC
# --------------------------------------------------------------------------
from data_prepare import data_prepare_proc
# --------------------------------------------------------------------------
#
# Announce start on stderr so stdout carries only the score line.
sys.stderr.write("*** 開始 ***\n")

# Fetch the prepared train/test split from train.csv.
train_x, test_x, train_y, test_y = data_prepare_proc("train.csv")

# Fit the support-vector classifier on the training part, then score it
# on the held-out part.
classifier = SVC(random_state=1, max_iter=5000)
accuracy = classifier.fit(train_x, train_y).score(test_x, test_y)
print("score = %f" % (accuracy,))

# Announce completion on stderr.
sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------
実行結果
$ ./svc01.py
*** 開始 ***
score = 0.779851
*** 終了 ***
ロジスティック回帰
logistic01.py
#! /usr/bin/python
#
# logistic01.py
#
# Sep/04/2020
#
# --------------------------------------------------------------------------
import sys
#
from sklearn.linear_model import LogisticRegression
# --------------------------------------------------------------------------
from data_prepare import data_prepare_proc
# --------------------------------------------------------------------------
#
# Announce start on stderr so stdout carries only the score line.
sys.stderr.write("*** 開始 ***\n")

# Fetch the prepared train/test split from train.csv.
train_x, test_x, train_y, test_y = data_prepare_proc("train.csv")

# Fit the logistic-regression classifier on the training part, then score
# it on the held-out part.
classifier = LogisticRegression(random_state=1, max_iter=5000)
accuracy = classifier.fit(train_x, train_y).score(test_x, test_y)
print("score = %f" % (accuracy,))

# Announce completion on stderr.
sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------
実行結果
$ ./logistic01.py
*** 開始 ***
score = 0.753731
*** 終了 ***
確認したバージョン
$ python --version
Python 3.11.3