ふと思ったのです、昔は欠損値の補完やらずに78%出していたなと。
そこで欠損値のある変数をすべて削除して予測してみようと。
import pandas as pd
# Load the Kaggle Titanic splits; both CSVs are expected in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
では欠損のある項目と数値でない項目(性別除く)を削除します。
# Re-index both frames by PassengerId, then drop the ID column together with
# every feature that is non-numeric or has missing values (Sex is kept and
# one-hot encoded further below).
unused_cols = ["PassengerId", "Name", "Age", "Ticket", "Fare", "Cabin", "Embarked"]
train.index = train["PassengerId"]
test.index = test["PassengerId"]
train = train.drop(unused_cols, axis=1)
test = test.drop(unused_cols, axis=1)
性別を片方削除します(多重共線性回避)
# One-hot encode Sex and keep only the female indicator: for a binary category
# the second dummy column is fully redundant (perfect multicollinearity).
train = pd.get_dummies(train).drop(["Sex_male"], axis=1)
test = pd.get_dummies(test).drop(["Sex_male"], axis=1)
ここから予測に行きます
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from lightgbm import LGBMClassifier as LGBMC
from sklearn.model_selection import train_test_split as tts

# Collected (fitted model, validation accuracy) pairs across many random splits.
models = []

# Separate the target from the features; the unlabeled test rows go through
# the very same scaling as the training rows.
y = train["Survived"]
x = train.drop("Survived", axis=1)
x_test = test

# Standardize using statistics of train + test combined.
# NOTE(review): fitting the scaler on the test rows is mild data leakage —
# confirm this is intended.
df = pd.concat([x, x_test])
ss = StandardScaler().fit(df)
x = ss.transform(x)
x_test = ss.transform(x_test)
# Train SVC / random forest / LightGBM on 200 different random 80/20 splits
# and keep every model together with its validation accuracy.
for seed in range(200):
    x_tr, x_va, y_tr, y_va = tts(x, y, random_state=seed, test_size=0.2)
    for make_model in (SVC, RFC, LGBMC):
        clf = make_model()
        clf.fit(x_tr, y_tr)
        models.append([clf, clf.score(x_va, y_va)])
# Rank by validation accuracy (best first) and predict with the top three.
models.sort(key=lambda entry: entry[1], reverse=True)
pred1 = models[0][0].predict(x_test)
pred2 = models[1][0].predict(x_test)
pred3 = models[2][0].predict(x_test)
from scipy.stats import mode
# Element-wise majority vote over the three best models' predictions.
y_pred = [mode([a, b, c])[0] for a, b, c in zip(pred1, pred2, pred3)]
提出用データ
# Build the submission frame indexed by PassengerId and write it out sorted.
pred = pd.DataFrame(y_pred, index=test.index)
pred.columns = ["Survived"]
pred.sort_index().to_csv("submit_del2.csv")
え?Ageなくしたのに何で?と思ってカーネル密度推定(KDE)のグラフを作ってみました。
# NOTE(review): this cell does not run as written — `sns` (seaborn), `plt`
# (matplotlib.pyplot) and `df_train` are never defined in this file. Presumably
# `df_train` is the raw train.csv re-read BEFORE "Age" was dropped — confirm.
# Kernel density estimate of Age, split by survival outcome (0 vs 1).
sns.kdeplot(df_train[df_train["Survived"]==0]["Age"], label="0")
sns.kdeplot(df_train[df_train["Survived"]==1]["Age"], label="1")
plt.legend()