次のプログラムの欠損値の補完を、train と test をマージしたもので行ってみました。結果に変化はありませんでした。
titanic04.py
#! /usr/bin/python
#
# titanic04.py
#
# Mar/23/2020
# --------------------------------------------------------------------------
import sys
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve , auc ,accuracy_score)
# --------------------------------------------------------------------------
# [4-4]:
def convert_proc(df):
    """Fill missing values and encode categorical columns as integers.

    The input frame is left untouched; all work happens on a copy.

    Steps performed:
      * 'Fare' and 'Age': missing values are replaced by the column median.
      * 'Embarked': missing values become 'S', then S/C/Q are mapped to 0/1/2.
      * 'Sex': 'male' becomes 1, anything else becomes 0.
      * 'Survived': 1.0 becomes 1, anything else (including NaN) becomes 0.
      * 'Cabin', 'Name', 'PassengerId' and 'Ticket' are dropped.

    Two diagnostic lines are printed to stdout: the row and column counts of
    the result, and whether any null value is still present.

    Args:
        df: DataFrame holding at least the columns named above.

    Returns:
        A new DataFrame with the cleaned and encoded columns.

    Raises:
        KeyError: if one of the required columns is absent.
        ValueError: if 'Embarked' holds a value other than S, C or Q
            (it maps to NaN, which cannot be cast to int).
    """
    # Copy first so the caller's frame is not modified as a side effect.
    work = df.copy()
    #
    work['Fare'] = work['Fare'].fillna(work['Fare'].median())
    work['Age'] = work['Age'].fillna(work['Age'].median())
    work['Embarked'] = work['Embarked'].fillna('S')
    #
    # Vectorized comparisons give the same 0/1 result as a per-row lambda.
    work['Sex'] = (work['Sex'] == 'male').astype(int)
    work['Embarked'] = work['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    # NaN never compares equal to 1.0, so rows without a label become 0.
    work['Survived'] = (work['Survived'] == 1.0).astype(int)
    #
    work = work.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1)
    #
    print(len(work.index), len(work.columns))
    print("df", work.isnull().values.any())
    #
    return work
# --------------------------------------------------------------------------
# [4]:
def read_proc(file_train, file_test):
    """Load both CSV files and preprocess them as one combined frame.

    The training and test rows are stacked so that convert_proc computes its
    fill values (medians) over both sets, then split back apart by position.

    Args:
        file_train: path of the training CSV (first row is the header).
        file_test: path of the test CSV (first row is the header).

    Returns:
        A tuple (new_train_df, new_test_df, ids):
          * new_train_df: preprocessed training rows, 'Survived' included.
          * new_test_df: preprocessed test rows with 'Survived' removed.
          * ids: the 'PassengerId' values of the test file, taken before
            preprocessing drops that column.
    """
    train_df = pd.read_csv(file_train, header=0)
    nnx = len(train_df.index)
    sys.stderr.write("len(train_df.index) = %d\n" % nnx)
    test_df = pd.read_csv(file_test, header=0)
    ids = test_df["PassengerId"].values
    #
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # in the same order and keeps the original index labels.
    all_df = pd.concat([train_df, test_df], sort=False)
    all_df = convert_proc(all_df)
    #
    # The first nnx rows are the training rows, the remainder the test rows.
    new_train_df = all_df.iloc[:nnx]
    new_test_df = all_df.iloc[nnx:].drop(['Survived'], axis=1)
    #
    return new_train_df, new_test_df, ids
#
# --------------------------------------------------------------------------
# [8-6]:
def predict_check_proc(clf, test_X, test_y):
    """Evaluate a fitted classifier on held-out data and print its accuracy.

    Args:
        clf: fitted estimator exposing predict().
        test_X: feature rows to predict on.
        test_y: true labels for test_X.

    Returns:
        A tuple (score, auc_value): the accuracy that is printed, and the
        area under the ROC curve built from the hard predictions. Callers
        that ignore the return value are unaffected.
    """
    pred = clf.predict(test_X)
    fpr, tpr, _thresholds = roc_curve(test_y, pred, pos_label=1)
    # Keep the AUC instead of computing it and throwing it away.
    auc_value = auc(fpr, tpr)
    # Documented argument order is (y_true, y_pred).
    score = accuracy_score(test_y, pred)
    print(score)
    #
    return score, auc_value
#
# --------------------------------------------------------------------------
# [8]:
def predict_proc(train_df_in, test_df_in):
    """Fit a random forest on the training frame and predict the test frame.

    The training frame is split 70/30; the model is fitted on the 70% part,
    checked against the 30% part via predict_check_proc, and that same model
    is then used to predict test_df_in.

    Args:
        train_df_in: preprocessed training frame containing 'Survived'.
        test_df_in: preprocessed test frame without 'Survived'.

    Returns:
        The predictions produced by clf.predict(test_df_in).
    """
    features = train_df_in.drop('Survived', axis=1)
    labels = train_df_in.Survived
    #
    # Fixed random_state keeps the split and the forest reproducible.
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        features, labels, test_size=0.3, random_state=0)
    #
    clf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
    clf = clf.fit(fit_X, fit_y)
    #
    predict_check_proc(clf, holdout_X, holdout_y)
    #
    predictions = clf.predict(test_df_in)
    #
    return predictions
#
# --------------------------------------------------------------------------
# [10]:
def submit_proc(file_submit, ids, output):
    """Write the predictions as a two-column CSV file.

    Args:
        file_submit: path of the CSV file to create or overwrite.
        ids: values for the 'PassengerId' column.
        output: values for the 'Survived' column, in the same order as ids.
    """
    dft = pd.DataFrame({'PassengerId': ids, 'Survived': output})
    # index=False keeps the row index out of the file.
    dft.to_csv(file_submit, index=False)
#
# --------------------------------------------------------------------------
# Driver: load and preprocess the data, fit and predict, write the submission.
# Progress markers go to stderr.
sys.stderr.write("*** 開始 ***\n")
#
# Input and output files, resolved against the current working directory.
file_train, file_test, file_submit = (
    "train.csv",
    "test.csv",
    "titanic_submit.csv",
)
#
(train_df, test_df, ids) = read_proc(file_train, file_test)
predictions = predict_proc(train_df, test_df)
submit_proc(file_submit, ids, predictions)
#
sys.stderr.write("*** 終了 ***\n")
# --------------------------------------------------------------------------