1
2

More than 3 years have passed since last update.

Kaggle Titanic 0.78468

Last updated at Posted at 2020-03-22

次のプログラムを改造しました。

Kaggle Titanic 0.62200

そして、次の結果を得ました。
kaggle_mar2201.png

次のページを参考にしました。
【Kaggle超初心者向け】Titanicにチャレンジしてみた

titanic03.py
#! /usr/bin/python
#
#   titanic03.py
#
#                   Mar/22/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  train_test_split
from sklearn.metrics import (roc_curve , auc ,accuracy_score)
# --------------------------------------------------------------------------
# [4-2]:
def convert_proc(df):
    """Turn a raw Titanic passenger table into an all-numeric feature table.

    The work is done on a copy, so the DataFrame handed in by the caller is
    left exactly as it was (previously its columns were overwritten in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Fare', 'Age', 'Embarked', 'Sex', 'Cabin',
        'Name', 'PassengerId' and 'Ticket'. Any other columns pass through
        unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * missing 'Fare' / 'Age' values are replaced by that column's median
          (computed from the frame passed in),
        * missing 'Embarked' values are replaced by 'S',
        * 'Sex' is 1 for 'male' and 0 for anything else,
        * 'Embarked' is encoded as S=0, C=1, Q=2,
        * 'Cabin', 'Name', 'PassengerId' and 'Ticket' are removed.
    """
    # Copy first: every assignment below would otherwise modify the caller's
    # DataFrame as a hidden side effect.
    df = df.copy()

    # Fill the gaps before encoding.
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')

    # Encode the categorical columns as integers.
    df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'male' else 0)
    # A port code other than S/C/Q maps to NaN, which cannot be cast to int,
    # so unexpected values fail here instead of slipping through.
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # These columns are not used as features.
    return df.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1)
# --------------------------------------------------------------------------
# [4]:
def read_train_proc():
    """Load train.csv from the working directory and return it preprocessed.

    The first row of the file is taken as the header, and the raw table is
    passed through convert_proc before being returned.
    """
    return convert_proc(pd.read_csv("train.csv", header=0))

# --------------------------------------------------------------------------
# [6]:
def read_test_proc():
    """Load test.csv from the working directory.

    Returns a pair (ids, features): the values of the PassengerId column as
    read from the file, and the table after it has been run through
    convert_proc.
    """
    raw = pd.read_csv("test.csv", header=0)
    # Take the ids before preprocessing: convert_proc drops PassengerId.
    passenger_ids = raw["PassengerId"].values
    return passenger_ids, convert_proc(raw)
# --------------------------------------------------------------------------
# [10]:
def submit_proc(ids,output):
    """Write the submission file titanic_submit.csv.

    The file has two columns, PassengerId (from ids) and Survived (from
    output), and no index column. It is created in the working directory.
    """
    columns = {'PassengerId': ids, 'Survived': output}
    pd.DataFrame(columns).to_csv("titanic_submit.csv", index=False)
# --------------------------------------------------------------------------
# [8]:
def predict_proc(train_df,test_df):
    """Fit a random forest on the training table and predict for test_df.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Preprocessed training table; must contain the label column
        'Survived'. All remaining columns are used as features.
    test_df : pandas.DataFrame
        Preprocessed table to predict for, with the same feature columns.

    Returns
    -------
    Array of predicted 'Survived' labels, one per row of test_df.

    Side effects
    ------------
    Prints the hold-out accuracy to stdout and writes the hold-out AUC to
    stderr.
    """
    features = train_df.drop('Survived', axis=1)
    labels = train_df['Survived']

    # Keep 30 % of the labelled rows aside for a quick sanity check.  The
    # pieces are called fit_* / valid_* so the hold-out set is not confused
    # with test_df, the table that actually gets predicted.
    fit_X, valid_X, fit_y, valid_y = train_test_split(
        features, labels, test_size=0.3, random_state=0)

    clf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
    clf.fit(fit_X, fit_y)

    valid_pred = clf.predict(valid_X)

    # accuracy_score expects (y_true, y_pred); the value is the same either
    # way, but the conventional order keeps the call readable.
    print(accuracy_score(valid_y, valid_pred))

    # The AUC used to be computed and thrown away.  Report it on stderr so
    # stdout still carries only the accuracy line.  The curve is built from
    # the hard 0/1 predictions, exactly as before.
    fpr, tpr, _ = roc_curve(valid_y, valid_pred, pos_label=1)
    sys.stderr.write("AUC: %f\n" % auc(fpr, tpr))

    # The model fitted on the 70 % split is the one used for the final
    # predictions (unchanged from the original behaviour).
    return clf.predict(test_df)
# --------------------------------------------------------------------------
def main():
    """Run the pipeline: load both CSVs, fit and predict, write the result.

    Start and end markers go to stderr.
    """
    sys.stderr.write("*** 開始 ***\n")

    train_df = read_train_proc()
    ids, test_df = read_test_proc()
    submit_proc(ids, predict_proc(train_df, test_df))

    sys.stderr.write("*** 終了 ***\n")


# Invoked unconditionally so the file behaves exactly as the flat script did,
# whether it is executed directly or imported.
main()
# --------------------------------------------------------------------------

titanic_submit.csv を kaggle.com に Submit して次の結果を得ました。

You advanced 11,478 places on the leaderboard!
Your submission scored 0.78468, which is an improvement of your previous score of 0.62200. Great job!
1
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
2