Kaggleのタイタニック問題
今回,ここでKaggleのタイタニックの問題に挑戦してみました.
挑戦といっても初心者なので普通にLightGBMでハイパーパラメータもチューニングせずにやってます.
コード
learn.py
でデータを学習し,テストデータで予測精度を測定します.
また,できた予測モデルをmodel.pkl
に保存します.
import lightgbm
import numpy
import pandas
import pickle
import re
# Load the data and preprocess it.
# Reads the three CSV files, turns the Cabin, Embarked, Name, Sex and Ticket
# columns into numbers (or NaN), then splits the frames into model inputs
# and outputs.
gender_submission_data_frame = pandas.read_csv('gender_submission.csv')
test_data_frame = pandas.read_csv('test.csv')
train_data_frame = pandas.read_csv('train.csv')
# Encode each distinct Embarked value as its position in sorted order.
# NOTE(review): both operands of the union are the test frame, so the union
# is a no-op and train.csv contributes nothing to the mapping -- confirm
# whether train_data_frame was intended on one side.
embarked_marks = set(test_data_frame['Embarked']) | set(test_data_frame['Embarked'])
embarked_marks = sorted(list(embarked_marks))
embarked_mark_to_number = {embarked_marks[number] : number for number in range(len(embarked_marks))}
# Same encoding scheme for Sex (again built from the test frame only).
sex_marks = set(test_data_frame['Sex']) | set(test_data_frame['Sex'])
sex_marks = sorted(list(sex_marks))
sex_mark_to_number = {sex_marks[number] : number for number in range(len(sex_marks))}
# Cabin -> number of whitespace-separated tokens; non-string cells become NaN.
test_data_frame['Cabin'] = test_data_frame['Cabin'].map(lambda cabin : len(cabin.split()) if isinstance(cabin, str) else numpy.nan)
train_data_frame['Cabin'] = train_data_frame['Cabin'].map(lambda cabin : len(cabin.split()) if isinstance(cabin, str) else numpy.nan)
# Embarked -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Embarked'] = test_data_frame['Embarked'].map(lambda embarked_mark : embarked_mark_to_number[embarked_mark] if isinstance(embarked_mark, str) else numpy.nan)
train_data_frame['Embarked'] = train_data_frame['Embarked'].map(lambda embarked_mark : embarked_mark_to_number[embarked_mark] if isinstance(embarked_mark, str) else numpy.nan)
# Name -> number of whitespace-separated tokens.
test_data_frame['Name'] = test_data_frame['Name'].map(lambda name : len(name.split()))
train_data_frame['Name'] = train_data_frame['Name'].map(lambda name : len(name.split()))
# Sex -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Sex'] = test_data_frame['Sex'].map(lambda sex_mark : sex_mark_to_number[sex_mark] if isinstance(sex_mark, str) else numpy.nan)
train_data_frame['Sex'] = train_data_frame['Sex'].map(lambda sex_mark : sex_mark_to_number[sex_mark] if isinstance(sex_mark, str) else numpy.nan)
# Ticket -> the text after the last non-digit character, as an int;
# NaN when that trailing piece is not made up purely of digits (e.g. empty).
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : re.split('[^0-9]', ticket)[-1])
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : int(ticket) if re.match('^[0-9]+$', ticket) else numpy.nan)
train_data_frame['Ticket'] = train_data_frame['Ticket'].map(lambda ticket : re.split('[^0-9]', ticket)[-1])
train_data_frame['Ticket'] = train_data_frame['Ticket'].map(lambda ticket : int(ticket) if re.match('^[0-9]+$', ticket) else numpy.nan)
# Output columns = columns of gender_submission.csv that test.csv lacks
# (presumably just the label column -- verify against the CSV headers);
# input columns = every column of test.csv.
gender_submission_column_names = set(gender_submission_data_frame.columns)
test_data_column_names = set(test_data_frame.columns)
output_column_names = gender_submission_column_names - test_data_column_names
input_column_names = test_data_column_names
# The names are held in sets, so list(...) yields them in set iteration
# order, not in CSV column order.
test_input_data_frame = test_data_frame[list(input_column_names)]
test_output_data_frame = gender_submission_data_frame[list(output_column_names)]
train_input_data_frame = train_data_frame[list(input_column_names)]
train_output_data_frame = train_data_frame[list(output_column_names)]
# Train: fit a LightGBM classifier with its default hyperparameters.
model = lightgbm.LGBMClassifier()
model.fit(train_input_data_frame, train_output_data_frame)
# Measure prediction accuracy on the test inputs.
# The expected labels are the ones read from gender_submission.csv.
# NOTE(review): gender_submission.csv looks like Kaggle's sample submission
# rather than ground truth -- confirm this is the intended reference.
score = model.score(test_input_data_frame, test_output_data_frame)
print('score = {}'.format(score))
# Save the trained model to model.pkl.
# The file is opened in binary mode because pickle writes bytes.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
また,evaluate.py
でmodel.pkl
を読み込み,テストデータを使用して予測精度を測定します.
import lightgbm
import numpy
import pandas
import pickle
import re
# Load the test data and preprocess it.
# Reads gender_submission.csv and test.csv, turns the Cabin, Embarked, Name,
# Sex and Ticket columns into numbers (or NaN), then builds the model input
# and expected-output frames.
gender_submission_data_frame = pandas.read_csv('gender_submission.csv')
test_data_frame = pandas.read_csv('test.csv')
# Encode each distinct Embarked value as its position in sorted order.
# NOTE(review): the union of the test frame with itself is a no-op; the
# mapping is derived from test.csv alone.
embarked_marks = set(test_data_frame['Embarked']) | set(test_data_frame['Embarked'])
embarked_marks = sorted(list(embarked_marks))
embarked_mark_to_number = {embarked_marks[number] : number for number in range(len(embarked_marks))}
# Same encoding scheme for Sex.
sex_marks = set(test_data_frame['Sex']) | set(test_data_frame['Sex'])
sex_marks = sorted(list(sex_marks))
sex_mark_to_number = {sex_marks[number] : number for number in range(len(sex_marks))}
# Cabin -> number of whitespace-separated tokens; non-string cells become NaN.
test_data_frame['Cabin'] = test_data_frame['Cabin'].map(lambda cabin : len(cabin.split()) if isinstance(cabin, str) else numpy.nan)
# Embarked -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Embarked'] = test_data_frame['Embarked'].map(lambda embarked_mark : embarked_mark_to_number[embarked_mark] if isinstance(embarked_mark, str) else numpy.nan)
# Name -> number of whitespace-separated tokens.
test_data_frame['Name'] = test_data_frame['Name'].map(lambda name : len(name.split()))
# Sex -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Sex'] = test_data_frame['Sex'].map(lambda sex_mark : sex_mark_to_number[sex_mark] if isinstance(sex_mark, str) else numpy.nan)
# Ticket -> the text after the last non-digit character, as an int;
# NaN when that trailing piece is not made up purely of digits (e.g. empty).
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : re.split('[^0-9]', ticket)[-1])
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : int(ticket) if re.match('^[0-9]+$', ticket) else numpy.nan)
# Output columns = columns of gender_submission.csv that test.csv lacks;
# input columns = every column of test.csv.
gender_submission_column_names = set(gender_submission_data_frame.columns)
test_data_column_names = set(test_data_frame.columns)
output_column_names = gender_submission_column_names - test_data_column_names
input_column_names = test_data_column_names
# The names are held in sets, so list(...) yields them in set iteration
# order, not in CSV column order.
test_input_data_frame = test_data_frame[list(input_column_names)]
test_output_data_frame = gender_submission_data_frame[list(output_column_names)]
# Load the trained model from model.pkl.
# NOTE: unpickling can execute arbitrary code, so only load a model.pkl
# that you produced yourself.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# Measure prediction accuracy of the loaded model on the test inputs,
# using the labels read from gender_submission.csv as the expected output.
score = model.score(test_input_data_frame, test_output_data_frame)
print('score = {}'.format(score))
また,データの取得から学習,モデルの保存と読み込み,予測精度測定までの一連の流れを自動化するためにMakefile
を作っています.
# The competition name is the last path component of the current directory.
COMPETITION=$(shell pwd | awk -F / '{print $$NF}')
CSVS=gender_submission.csv test.csv train.csv
MODEL=model.pkl
# Stamp file that records when the archive was last extracted.
UNZIPPED=.unzipped
ZIP=$(COMPETITION).zip

# These targets name commands, not files.
.PHONY: all clean rebuild

# Default target: run evaluate.py ($< is the first prerequisite) once the
# model file is up to date.
all: evaluate.py $(MODEL)
	python $<

# Run learn.py to produce the model whenever the script or a CSV is newer.
$(MODEL): learn.py $(CSVS)
	python $<

# The leading "-" lets make carry on when some of the files are already
# missing, so "make rebuild" also works from a clean directory.
clean:
	-rm $(CSVS) $(MODEL) $(ZIP)

# Use $(MAKE) rather than a literal "make" so the same make program and its
# flags are used for the recursive run.
rebuild: clean
	$(MAKE)

$(ZIP):
	kaggle competitions download -c $(COMPETITION)

$(UNZIPPED): $(ZIP)
	unzip $^
	touch $@

# The CSV files are created by the unzip step above; the recipe itself is
# the shell no-op ":".
%.csv: $(UNZIPPED)
	:
実行結果
# make rebuild
rm gender_submission.csv test.csv train.csv model.pkl titanic.zip
make
make[1]: Entering directory '/root/Kaggle/competitions/titanic'
kaggle competitions download -c titanic
Downloading titanic.zip to /root/Kaggle/competitions/titanic
0%| | 0.00/34.1k [00:00<?, ?B/s]
100%|███████████████████████████████████████████████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 529kB/s]
unzip titanic.zip
Archive: titanic.zip
inflating: gender_submission.csv
inflating: test.csv
inflating: train.csv
touch .unzipped
:
:
:
python learn.py
/root/Kaggle/kagglenv/lib/python3.10/site-packages/sklearn/preprocessing/_label.py:99: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
/root/Kaggle/kagglenv/lib/python3.10/site-packages/sklearn/preprocessing/_label.py:134: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
score = 0.84688995215311 <- learn.pyが算出した予測精度
python evaluate.py
score = 0.5885167464114832 <- evaluate.pyが算出した予測精度
make[1]: Leaving directory '/root/Kaggle/competitions/titanic'
あれ???learn.py
とevaluate.py
で予測精度だいぶ違ってませんか?
同じ予測モデルを使っているのに,こんなに予測精度が変わることってあるでしょうか?
原因
原因は学習データやテストデータを取り扱うpandas.DataFrame
の列の順番が実行プロセスごとに違っていたことでした.
入力列をlist(set(...))で選んでいるため列の順番はsetの反復順になりますが,Pythonは文字列のハッシュ値をプロセスごとにランダム化するので,この順番は実行のたびに変わり得ます.
pandas.DataFrame
は各列の列名を保持しているのでいいのですが,lightgbm.LGBMClassifier
クラスのfit
メソッドで学習を行う際に,この列名が捨てられてしまうのです.
learn.py
では同じ実行プロセス内で学習と予測精度測定を行うので予測精度測定時にテストデータが正しい順番で入力されますが,evaluate.py
ではテストデータの列の順番が変わってしまうため,予測精度が著しく落ちます.
つまりこれは,実行プロセスをまたいで予測モデルを使用したときにのみ発生する不具合です.
しかも予測モデルへの入力は全て数値であり順番を入れ替えても例外が発生したりはしないので,気づきにくいです.
対策
以下のように,学習や予測モデルへの入力の直前にpandas.DataFrame
の列の順番を列名でソートしましょう.
import lightgbm
import numpy
import pandas
import pickle
import re
# Load the data and preprocess it.
# Reads the three CSV files, turns the Cabin, Embarked, Name, Sex and Ticket
# columns into numbers (or NaN), then splits the frames into model inputs
# and outputs with their columns sorted by name.
gender_submission_data_frame = pandas.read_csv('gender_submission.csv')
test_data_frame = pandas.read_csv('test.csv')
train_data_frame = pandas.read_csv('train.csv')
# Encode each distinct Embarked value as its position in sorted order.
# NOTE(review): both operands of the union are the test frame, so the union
# is a no-op and train.csv contributes nothing to the mapping -- confirm
# whether train_data_frame was intended on one side.
embarked_marks = set(test_data_frame['Embarked']) | set(test_data_frame['Embarked'])
embarked_marks = sorted(list(embarked_marks))
embarked_mark_to_number = {embarked_marks[number] : number for number in range(len(embarked_marks))}
# Same encoding scheme for Sex (again built from the test frame only).
sex_marks = set(test_data_frame['Sex']) | set(test_data_frame['Sex'])
sex_marks = sorted(list(sex_marks))
sex_mark_to_number = {sex_marks[number] : number for number in range(len(sex_marks))}
# Cabin -> number of whitespace-separated tokens; non-string cells become NaN.
test_data_frame['Cabin'] = test_data_frame['Cabin'].map(lambda cabin : len(cabin.split()) if isinstance(cabin, str) else numpy.nan)
train_data_frame['Cabin'] = train_data_frame['Cabin'].map(lambda cabin : len(cabin.split()) if isinstance(cabin, str) else numpy.nan)
# Embarked -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Embarked'] = test_data_frame['Embarked'].map(lambda embarked_mark : embarked_mark_to_number[embarked_mark] if isinstance(embarked_mark, str) else numpy.nan)
train_data_frame['Embarked'] = train_data_frame['Embarked'].map(lambda embarked_mark : embarked_mark_to_number[embarked_mark] if isinstance(embarked_mark, str) else numpy.nan)
# Name -> number of whitespace-separated tokens.
test_data_frame['Name'] = test_data_frame['Name'].map(lambda name : len(name.split()))
train_data_frame['Name'] = train_data_frame['Name'].map(lambda name : len(name.split()))
# Sex -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Sex'] = test_data_frame['Sex'].map(lambda sex_mark : sex_mark_to_number[sex_mark] if isinstance(sex_mark, str) else numpy.nan)
train_data_frame['Sex'] = train_data_frame['Sex'].map(lambda sex_mark : sex_mark_to_number[sex_mark] if isinstance(sex_mark, str) else numpy.nan)
# Ticket -> the text after the last non-digit character, as an int;
# NaN when that trailing piece is not made up purely of digits (e.g. empty).
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : re.split('[^0-9]', ticket)[-1])
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : int(ticket) if re.match('^[0-9]+$', ticket) else numpy.nan)
train_data_frame['Ticket'] = train_data_frame['Ticket'].map(lambda ticket : re.split('[^0-9]', ticket)[-1])
train_data_frame['Ticket'] = train_data_frame['Ticket'].map(lambda ticket : int(ticket) if re.match('^[0-9]+$', ticket) else numpy.nan)
# Output columns = columns of gender_submission.csv that test.csv lacks
# (presumably just the label column -- verify against the CSV headers);
# input columns = every column of test.csv.
gender_submission_column_names = set(gender_submission_data_frame.columns)
test_data_column_names = set(test_data_frame.columns)
output_column_names = gender_submission_column_names - test_data_column_names
input_column_names = test_data_column_names
# The DataFrame columns are sorted here.
# list(set) yields the names in set iteration order; sort_index(axis = 1)
# reorders the columns by name so the layout fed to the model no longer
# depends on that order.
test_input_data_frame = test_data_frame[list(input_column_names)].sort_index(axis = 1, ascending = True)
test_output_data_frame = gender_submission_data_frame[list(output_column_names)].sort_index(axis = 1, ascending = True)
train_input_data_frame = train_data_frame[list(input_column_names)].sort_index(axis = 1, ascending = True)
train_output_data_frame = train_data_frame[list(output_column_names)].sort_index(axis = 1, ascending = True)
# Train: fit a LightGBM classifier with its default hyperparameters.
# The labels are passed as a DataFrame rather than a 1-d array
# (presumably what triggers scikit-learn's DataConversionWarning -- confirm).
model = lightgbm.LGBMClassifier()
model.fit(train_input_data_frame, train_output_data_frame)
# Measure prediction accuracy on the test inputs.
# The expected labels are the ones read from gender_submission.csv.
# NOTE(review): gender_submission.csv looks like Kaggle's sample submission
# rather than ground truth -- confirm this is the intended reference.
score = model.score(test_input_data_frame, test_output_data_frame)
print('score = {}'.format(score))
# Save the trained model to model.pkl.
# The file is opened in binary mode because pickle writes bytes.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
evaluate.py
でも同様に予測精度測定の直前にpandas.DataFrame
の列の順番を列名でソートします.
import lightgbm
import numpy
import pandas
import pickle
import re
# Load the test data and preprocess it.
# Reads gender_submission.csv and test.csv, turns the Cabin, Embarked, Name,
# Sex and Ticket columns into numbers (or NaN), then builds the model input
# and expected-output frames with their columns sorted by name.
gender_submission_data_frame = pandas.read_csv('gender_submission.csv')
test_data_frame = pandas.read_csv('test.csv')
# Encode each distinct Embarked value as its position in sorted order.
# NOTE(review): the union of the test frame with itself is a no-op; the
# mapping is derived from test.csv alone.
embarked_marks = set(test_data_frame['Embarked']) | set(test_data_frame['Embarked'])
embarked_marks = sorted(list(embarked_marks))
embarked_mark_to_number = {embarked_marks[number] : number for number in range(len(embarked_marks))}
# Same encoding scheme for Sex.
sex_marks = set(test_data_frame['Sex']) | set(test_data_frame['Sex'])
sex_marks = sorted(list(sex_marks))
sex_mark_to_number = {sex_marks[number] : number for number in range(len(sex_marks))}
# Cabin -> number of whitespace-separated tokens; non-string cells become NaN.
test_data_frame['Cabin'] = test_data_frame['Cabin'].map(lambda cabin : len(cabin.split()) if isinstance(cabin, str) else numpy.nan)
# Embarked -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Embarked'] = test_data_frame['Embarked'].map(lambda embarked_mark : embarked_mark_to_number[embarked_mark] if isinstance(embarked_mark, str) else numpy.nan)
# Name -> number of whitespace-separated tokens.
test_data_frame['Name'] = test_data_frame['Name'].map(lambda name : len(name.split()))
# Sex -> integer code from the mapping above; non-string cells become NaN.
test_data_frame['Sex'] = test_data_frame['Sex'].map(lambda sex_mark : sex_mark_to_number[sex_mark] if isinstance(sex_mark, str) else numpy.nan)
# Ticket -> the text after the last non-digit character, as an int;
# NaN when that trailing piece is not made up purely of digits (e.g. empty).
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : re.split('[^0-9]', ticket)[-1])
test_data_frame['Ticket'] = test_data_frame['Ticket'].map(lambda ticket : int(ticket) if re.match('^[0-9]+$', ticket) else numpy.nan)
# Output columns = columns of gender_submission.csv that test.csv lacks;
# input columns = every column of test.csv.
gender_submission_column_names = set(gender_submission_data_frame.columns)
test_data_column_names = set(test_data_frame.columns)
output_column_names = gender_submission_column_names - test_data_column_names
input_column_names = test_data_column_names
# The DataFrame columns are sorted here.
# list(set) yields the names in set iteration order; sort_index(axis = 1)
# reorders the columns by name so the layout fed to the model no longer
# depends on that order.
test_input_data_frame = test_data_frame[list(input_column_names)].sort_index(axis = 1, ascending = True)
test_output_data_frame = gender_submission_data_frame[list(output_column_names)].sort_index(axis = 1, ascending = True)
# Measure prediction accuracy.
# First load the trained model from model.pkl.
# NOTE: unpickling can execute arbitrary code, so only load a model.pkl
# that you produced yourself.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# Score the loaded model on the test inputs, using the labels read from
# gender_submission.csv as the expected output.
score = model.score(test_input_data_frame, test_output_data_frame)
print('score = {}'.format(score))
修正結果
# make rebuild
rm gender_submission.csv test.csv train.csv model.pkl titanic.zip
make
make[1]: Entering directory '/root/Kaggle/competitions/titanic'
kaggle competitions download -c titanic
Downloading titanic.zip to /root/Kaggle/competitions/titanic
0%| | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████████████████████████████████████████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 5.59MB/s]
unzip titanic.zip
Archive: titanic.zip
inflating: gender_submission.csv
inflating: test.csv
inflating: train.csv
touch .unzipped
:
:
:
python learn.py
/root/Kaggle/kagglenv/lib/python3.10/site-packages/sklearn/preprocessing/_label.py:99: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
/root/Kaggle/kagglenv/lib/python3.10/site-packages/sklearn/preprocessing/_label.py:134: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
score = 0.8492822966507177 <- learn.pyが算出した予測精度
python evaluate.py
score = 0.8492822966507177 <- evaluate.pyが算出した予測精度
make[1]: Leaving directory '/root/Kaggle/competitions/titanic'
pandas.DataFrame
の列の順番を,学習モデルへの入力の直前に列名でソートすることにより,別プロセス間で学習モデルへの入力の整合性が保たれ,全く同じ予測精度を出せています.