次のページを参考にしました。
Kaggle事始め
Arch Linux でのライブラリーのインストール
```bash
sudo pacman -S python-pandas
sudo pacman -S python-scikit-learn
```
Ubuntu 19.10 でのライブラリーのインストール
```bash
sudo apt install python3-pandas
sudo apt install python3-sklearn
```
titanic01.py
#! /usr/bin/python
#
# titanic01.py
#
# Feb/19/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
#import csv
from sklearn.ensemble import RandomForestClassifier
# --------------------------------------------------------------------------
# [2]:
def read_train_proc(path="train.csv"):
    """Load the training CSV and reduce it to the columns used by the model.

    The "Sex" column is encoded into a numeric "Gender" column, missing
    "Age" values are filled with the median age, and the columns the model
    does not use are dropped.

    Args:
        path: Location of the training CSV. Defaults to "train.csv" in the
            current working directory.

    Returns:
        The preprocessed DataFrame. The caller slices column 0 as the label,
        so the first remaining column is presumably "Survived" -- verify
        against the actual CSV layout.

    Raises:
        ValueError: If "Sex" holds a missing value or anything other than
            "female" / "male".
        KeyError: If an expected column is absent from the CSV.
    """
    train_df = pd.read_csv(path, header=0)

    # Encode "Sex" as a dummy variable (female = 0, male = 1).
    gender = train_df["Sex"].map({"female": 0, "male": 1})
    if gender.isnull().any():
        # Fail with a readable message instead of the opaque NaN-to-int
        # conversion error that astype(int) would raise.
        raise ValueError('"Sex" must be "female" or "male" on every row')
    train_df["Gender"] = gender.astype(int)

    # Fill the missing "Age" values with the median (not the mean) of the
    # known ages; median() ignores missing entries by default.
    train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())

    # Remove the columns that are not used as features.
    unused = ["Name", "Ticket", "Sex", "SibSp", "Parch", "Fare", "Cabin",
              "Embarked", "PassengerId"]
    return train_df.drop(unused, axis=1)
# --------------------------------------------------------------------------
# [4]:
def read_test_proc(path="test.csv"):
    """Load the test CSV and reduce it to the columns used by the model.

    Applies the same preprocessing as the training data: "Sex" is encoded
    into a numeric "Gender" column, missing "Age" values are filled with the
    median age, and the unused columns are dropped. The "PassengerId" values
    are captured before that column is removed.

    Args:
        path: Location of the test CSV. Defaults to "test.csv" in the
            current working directory.

    Returns:
        A tuple ``(ids, test_df)``: the "PassengerId" values as an array and
        the preprocessed feature DataFrame.

    Raises:
        ValueError: If "Sex" holds a missing value or anything other than
            "female" / "male".
        KeyError: If an expected column is absent from the CSV.
    """
    test_df = pd.read_csv(path, header=0)

    # Encode "Sex" as a dummy variable (female = 0, male = 1).
    gender = test_df["Sex"].map({"female": 0, "male": 1})
    if gender.isnull().any():
        # Fail with a readable message instead of the opaque NaN-to-int
        # conversion error that astype(int) would raise.
        raise ValueError('"Sex" must be "female" or "male" on every row')
    test_df["Gender"] = gender.astype(int)

    # Fill the missing "Age" values with the median (not the mean) of the
    # known ages in this file.
    # NOTE(review): this is the test set's own median, not the training
    # median -- confirm that is intended.
    test_df["Age"] = test_df["Age"].fillna(test_df["Age"].median())

    # Keep the passenger ids for the submission before dropping the column.
    ids = test_df["PassengerId"].values

    # Remove the columns that are not used as features.
    unused = ["Name", "Ticket", "Sex", "SibSp", "Parch", "Fare", "Cabin",
              "Embarked", "PassengerId"]
    test_df = test_df.drop(unused, axis=1)

    return ids, test_df
# --------------------------------------------------------------------------
# [8]:
def submit_proc(ids, output, file_submit="titanic_submit.csv"):
    """Write the predictions to a CSV file in submission format.

    The file has a header row and two columns, "PassengerId" and
    "Survived", with one row per passenger and no index column.

    Args:
        ids: Passenger ids, one per prediction.
        output: Predicted "Survived" values, aligned with ``ids``.
        file_submit: Destination path. Defaults to "titanic_submit.csv" in
            the current working directory.
    """
    dft = pd.DataFrame({'PassengerId': ids, 'Survived': output})
    # index=False keeps the DataFrame's row index out of the file.
    dft.to_csv(file_submit, index=False)
# --------------------------------------------------------------------------
# [6]:
def predict_proc(train_data, test_data, n_estimators=100, random_state=None):
    """Train a random forest on the training array and predict the test rows.

    Args:
        train_data: 2-D array whose column 0 is the label and whose
            remaining columns are the features.
        test_data: 2-D array of features only; presumably in the same column
            order as the training features -- verify against the caller.
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed passed to the classifier. The default ``None``
            leaves the forest unseeded, so results can differ between runs;
            pass an int for reproducible predictions.

    Returns:
        The predicted labels for ``test_data``, cast to int.
    """
    # Split the training array into label (column 0) and features (the rest).
    labels = train_data[:, 0]
    features = train_data[:, 1:]

    # Predict with "Random Forest".
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
    model.fit(features, labels)

    # Cast so the submission holds integer labels.
    return model.predict(test_data).astype(int)
# --------------------------------------------------------------------------
sys.stderr.write("*** 開始 ***\n")

# Load and preprocess both data sets.
train_df = read_train_proc()
ids, test_df = read_test_proc()

# Save the preprocessed frames to disk.
train_df.to_csv("train01.csv")
test_df.to_csv("test01.csv")

# Hand the underlying arrays to the model, then write the submission file.
train_data, test_data = train_df.values, test_df.values
output = predict_proc(train_data, test_data)
submit_proc(ids, output)

sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------
Submit した結果です。 0.622 です。