More than 3 years have passed since last update.

Kaggle Titanic 0.62200

Last updated at 2020-03-23 / Posted at 2020-02-19

次のページを参考にしました。
Kaggle事始め

Arch Linux でのライブラリーのインストール

sudo pacman -S python-pandas
sudo pacman -S python-scikit-learn


Ubuntu 19.10 でのライブラリーのインストール

```bash
sudo apt install python3-pandas
sudo apt install python3-sklearn
```

titanic01.py

#! /usr/bin/python
#
#	titanic01.py
#
#					Feb/19/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
#import csv
from sklearn.ensemble import RandomForestClassifier

# --------------------------------------------------------------------------
# [2]:
def read_train_proc():
	"""Load "train.csv" and reduce it to the columns kept for training.

	Returns:
		The DataFrame read from "train.csv" with an integer "Gender" column
		added (female = 0, male = 1), missing "Age" values replaced by the
		median of the known ages, and the unused columns removed.
	"""
	frame = pd.read_csv("train.csv", header=0)

	# Encode "Sex" as a dummy variable: female -> 0, male -> 1.
	sex_codes = {"female": 0, "male": 1}
	frame["Gender"] = frame["Sex"].map(sex_codes).astype(int)

	# Fill the gaps in "Age" with the median (not the mean) of the known ages.
	known_age_median = frame["Age"].dropna().median()
	age_missing = frame["Age"].isnull()
	if age_missing.any():
		frame.loc[age_missing, "Age"] = known_age_median

	# Drop every column that is not used afterwards.
	unused = [
		"Name", "Ticket", "Sex", "SibSp", "Parch",
		"Fare", "Cabin", "Embarked", "PassengerId",
	]
	return frame.drop(unused, axis=1)

# --------------------------------------------------------------------------
# [4]:
def read_test_proc():
	"""Load "test.csv" and reduce it to the columns kept for prediction.

	Returns:
		A pair ``(ids, frame)``. ``ids`` holds the values of the
		"PassengerId" column, taken before that column is dropped.
		``frame`` is the data read from "test.csv" with an integer "Gender"
		column added (female = 0, male = 1), missing "Age" values replaced
		by the median of the known ages in this file, and the unused
		columns removed.
	"""
	table = pd.read_csv("test.csv", header=0)

	# Encode "Sex" as a dummy variable: female -> 0, male -> 1.
	gender = table["Sex"].map({"female": 0, "male": 1})
	table["Gender"] = gender.astype(int)

	# Fill the gaps in "Age" with the median (not the mean) of the known ages.
	fill_value = table["Age"].dropna().median()
	no_age = table.Age.isnull()
	if no_age.any():
		table.loc[no_age, "Age"] = fill_value

	# Remember the passenger ids before the column is discarded below.
	ids = table["PassengerId"].values

	dropped = ["Name", "Ticket", "Sex", "SibSp", "Parch", "Fare", "Cabin", "Embarked", "PassengerId"]
	table = table.drop(dropped, axis=1)
	return ids, table
# --------------------------------------------------------------------------
# [8]:
def submit_proc(ids,output):
	"""Write the predictions to "titanic_submit.csv".

	Args:
		ids: values for the "PassengerId" column.
		output: values for the "Survived" column, aligned with ``ids``.

	The file is written without the DataFrame index, so it contains only
	the two named columns.
	"""
	columns = {'PassengerId': ids, 'Survived': output}
	pd.DataFrame(columns).to_csv("titanic_submit.csv", index=False)
# --------------------------------------------------------------------------
# [6]:
def predict_proc(train_data,test_data,n_estimators=100,random_state=None):
	"""Fit a random forest on the training array and predict the test rows.

	Args:
		train_data: two-dimensional array; column 0 is the target and the
			remaining columns are the features.
		test_data: two-dimensional array of features passed to ``predict``.
		n_estimators: number of trees in the forest. Defaults to 100, the
			value that used to be hard-coded.
		random_state: seed forwarded to RandomForestClassifier. The default
			None keeps the previous unseeded behaviour; pass an int to get
			the same predictions on every run.

	Returns:
		The predicted labels, cast to int.
	"""
	model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

	# Column 0 holds the target; everything to its right is a feature.
	features = train_data[:, 1:]
	target = train_data[:, 0]
	model.fit(features, target)

	return model.predict(test_data).astype(int)
# --------------------------------------------------------------------------
# Script body: load both data sets, train, predict and write the submission.
sys.stderr.write("*** 開始 ***\n")

train_df = read_train_proc()
ids,test_df = read_test_proc()

# Save the pre-processed tables so they can be inspected afterwards.
train_df.to_csv("train01.csv")
test_df.to_csv("test01.csv")

# predict_proc slices plain arrays, so hand it the underlying values.
train_data = train_df.values
test_data = test_df.values
output = predict_proc(train_data,test_data)

submit_proc(ids,output)

sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------

Submit した結果です。　0.622 です。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up