0
0

More than 3 years have passed since last update.

Kaggle Titanic (欠損値補完)

Last updated at Posted at 2020-03-23

次のプログラムの欠損値の補完を、train と test をマージしたもので行ってみました。結果に変化はありませんでした。

Kaggle Titanic 0.78468

titanic04.py
#! /usr/bin/python
#
#   titanic04.py
#
#                   Mar/23/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  train_test_split
from sklearn.metrics import (roc_curve , auc ,accuracy_score)
# --------------------------------------------------------------------------
# [4-4]:
def convert_proc(df):
    """Impute missing values and encode the categorical columns as integers.

    The work is done on a copy, so the frame passed in by the caller is
    left unmodified.

    Args:
        df: Passenger table containing the columns ``Fare``, ``Age``,
            ``Embarked``, ``Sex``, ``Survived``, ``Cabin``, ``Name``,
            ``PassengerId`` and ``Ticket``.

    Returns:
        A new DataFrame in which
        * missing ``Fare`` / ``Age`` are replaced by the column median,
        * missing ``Embarked`` is replaced by ``'S'`` and the column is
          encoded as S=0, C=1, Q=2,
        * ``Sex`` is 1 for ``'male'`` and 0 for anything else,
        * ``Survived`` is 1 where it equals 1 and 0 otherwise (missing
          values therefore become 0),
        * ``Cabin``, ``Name``, ``PassengerId`` and ``Ticket`` are removed.

    Side effects:
        Prints the resulting row/column counts and whether any null value
        is left in the frame.
    """
    # Copy first: the column assignments below would otherwise write into
    # the caller's DataFrame.
    df = df.copy()

    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')

    # Vectorised comparisons; a missing value compares unequal and so
    # encodes as 0, exactly like the other non-matching values.
    df['Sex'] = (df['Sex'] == 'male').astype(int)
    # An Embarked code outside S/C/Q maps to NaN and makes astype(int) raise.
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    df['Survived'] = (df['Survived'] == 1.0).astype(int)
#
    df = df.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1)
#
    print(len(df.index), len(df.columns))
    print("df", df.isnull().values.any())
#
    return df
# --------------------------------------------------------------------------
# [4]:
def read_proc(file_train,file_test):
    """Load the train and test CSV files and preprocess them together.

    Both files are stacked into a single frame (train rows first) before
    ``convert_proc`` is applied, so the medians used for imputation are
    computed over the combined data. The result is then split back by
    position.

    Args:
        file_train: Path (or file-like object) of the training CSV.
        file_test: Path (or file-like object) of the test CSV; it must
            contain a ``PassengerId`` column.

    Returns:
        A tuple ``(new_train_df, new_test_df, ids)`` where ``new_train_df``
        holds the preprocessed training rows, ``new_test_df`` holds the
        preprocessed test rows without the ``Survived`` column, and ``ids``
        is the array of ``PassengerId`` values read from the test file.

    Side effects:
        Writes the number of training rows to stderr.
    """
    train_df = pd.read_csv(file_train, header=0)

    nnx = len(train_df.index)
    sys.stderr.write("len(train_df.index) = %d\n" % nnx)
    test_df = pd.read_csv(file_test, header=0)
    # Keep the ids now: convert_proc drops the PassengerId column.
    ids = test_df["PassengerId"].values

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent call and also works on older pandas versions.
    all_df = pd.concat([train_df, test_df], sort=False)
    all_df = convert_proc(all_df)

    # Split by position: the first nnx rows came from the training file.
    new_train_df = all_df.iloc[:nnx]
    new_test_df = all_df.iloc[nnx:]
    new_test_df = new_test_df.drop(['Survived'], axis=1)
#
    return new_train_df, new_test_df, ids
#
# --------------------------------------------------------------------------
# [8-6]:
def predict_check_proc(clf,test_X,test_y):
    """Print the accuracy and ROC AUC of a fitted classifier on held-out data.

    Args:
        clf: Fitted estimator exposing ``predict``.
        test_X: Feature rows to predict on.
        test_y: True labels for ``test_X``; label 1 is treated as the
            positive class for the ROC curve.

    Side effects:
        Prints the accuracy on one line, then ``auc`` followed by the area
        under the ROC curve on the next line.
    """
    pred = clf.predict(test_X)
    # accuracy_score is documented as (y_true, y_pred).
    score = accuracy_score(test_y, pred)
    print(score)
    # The AUC used to be computed and then thrown away; report it instead.
    # It is derived from the hard class predictions, not from probabilities.
    fpr, tpr, _ = roc_curve(test_y, pred, pos_label=1)
    print("auc", auc(fpr, tpr))
#
# --------------------------------------------------------------------------
# [8]:
def predict_proc(train_df_in,test_df_in):
    """Fit a random forest on the training frame and predict the test frame.

    The training frame is split 70/30 with a fixed seed. The forest is
    fitted on the 70% part only, checked against the 30% part via
    ``predict_check_proc``, and then used to predict ``test_df_in``.

    Args:
        train_df_in: Preprocessed training rows including ``Survived``.
        test_df_in: Preprocessed test rows with the same feature columns
            and no ``Survived`` column.

    Returns:
        The array of predicted labels for ``test_df_in``.
    """
    labels = train_df_in['Survived']
    features = train_df_in.drop(columns=['Survived'])

    # Fixed random_state keeps the split, and therefore the model, repeatable.
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        features, labels, test_size=0.3, random_state=0
    )

    forest = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
    forest.fit(fit_X, fit_y)

    # Report hold-out metrics before producing the submission predictions.
    predict_check_proc(forest, holdout_X, holdout_y)

    return forest.predict(test_df_in)
#
# --------------------------------------------------------------------------
# [10]:
def submit_proc(file_submit,ids,output):
    """Write the submission file as a two-column CSV without an index.

    Args:
        file_submit: Destination path or writable text buffer.
        ids: Passenger ids, written to the ``PassengerId`` column.
        output: Predicted labels, written to the ``Survived`` column;
            must have the same length as ``ids``.
    """
    columns = dict(PassengerId=ids, Survived=output)
    pd.DataFrame(columns).to_csv(file_submit, index=False)
#
# --------------------------------------------------------------------------
sys.stderr.write("*** 開始 ***\n")

# Input and output files, resolved relative to the working directory.
file_train, file_test, file_submit = "train.csv", "test.csv", "titanic_submit.csv"

# Pipeline: load and preprocess -> fit and predict -> write the submission.
train_df, test_df, ids = read_proc(file_train, file_test)
predictions = predict_proc(train_df, test_df)
submit_proc(file_submit, ids, predictions)

sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0