現在作成中です...もう少々お待ち下さい...
タスク
- https://www.kaggle.com/c/quora-question-pairs
- 質問の重複一致判定コンペ
開発環境
- GPU: GTX-1080 Ti(多分,メモリが10GB以上必要)
- TPU: 利用してない
- 動作しない場合は
train_batch_size
の値を小さくしてください. - BERT Model: (Base版でLarge版ではないです)
前準備
- コンペのデータセットのダウンロード
- bertのコードと学習済みモデルのダウンロード
mkdir input
kaggle competitions download -c quora-question-pairs -p input
unzip "input/*.zip" -d input
git clone https://github.com/google-research/bert.git
wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
unzip uncased_L-12_H-768_A-12.zip
コード
# coding: utf-8
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
sys.path.append("./bert")
import modeling
import optimization
import run_classifier
import tokenization
DEBUG_MODE = False
config = {
"qqp_model_directory": "/tmp/bert_model",
"bert_config_filepath": "./uncased_L-12_H-768_A-12/bert_config.json",
"vocab_filepth": "./uncased_L-12_H-768_A-12/vocab.txt",
"input_directory": "./input",
"data_directory": "./data",
"output_directory": "./output",
"init_checkpoint": None,
"do_lower_case": True,
"max_seq_length": 128,
"train_batch_size": 32,
"eval_batch_size": 8,
"predict_batch_size": 8,
"learning_rate": 5e-5,
"num_train_epochs": 3.0,
"warmup_proportion": 0.1,
"save_checkpoints_steps": 1000,
"nrows": (2000 if DEBUG_MODE else None),
}
for directory in [config["data_directory"], config["output_directory"], config["qqp_model_directory"]]:
if not os.path.exists(directory):
os.makedirs(directory)
# 壊れたデータの修正
train_df = pd.read_csv(os.path.join(config["input_directory"], "train.csv"), encoding="utf-8")
test_df = pd.read_csv(os.path.join(config["input_directory"], "test.csv"), encoding="utf-8")
def remove(df, on):
# cf. Test data: https://www.kaggle.com/c/quora-question-pairs/discussion/59035
valid_ids =[type(x)==int for x in df[on]]
return df[valid_ids].drop_duplicates()
def fillna(df, value=""):
df["question1"].fillna(value, inplace=True)
df["question2"].fillna(value, inplace=True)
return df
train_df = fillna(remove(train_df, on="id"), value="__NA__")
test_df = fillna(remove(test_df, on="test_id"), value="__NA__")
train_df.to_csv(os.path.join(config["data_directory"], "train.csv"), encoding="utf-8", index=False)
test_df.to_csv(os.path.join(config["data_directory"], "test.csv"), encoding="utf-8", index=False)
# Bert用に変換
class QqpProcessor(object): # QQP: Quora Question Pairs
def __init__(self, data_directory, nrows=None, random_state=777):
self._all_train_df = pd.read_csv(os.path.join(data_directory, "train.csv"), nrows=nrows, encoding="utf-8")
self._train_df, self._dev_df = train_test_split(self._all_train_df, random_state=777)
self._test_df = pd.read_csv(os.path.join(data_directory, "test.csv"), nrows=nrows, encoding="utf-8")
self._test_df["id"] = self._test_df["test_id"]
def get_train_examples(self, data_dir):
return self._create_examples(self._train_df, "train")
def get_dev_examples(self, data_dir):
return self._create_examples(self._dev_df, "dev")
def get_test_examples(self, data_dir):
return self._create_examples(self._test_df, "test")
def get_labels(self):
return ["0", "1"]
def _create_examples(self, df, set_type):
def create_example(row):
guid = row["id"]
text_a = tokenization.convert_to_unicode(row["question1"])
text_b = tokenization.convert_to_unicode(row["question2"])
if set_type == "test":
label = "0"
else: # train, dev
label = tokenization.convert_to_unicode(str(row["is_duplicate"]))
return run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
return df.apply(create_example, axis=1).values.tolist()
processor = QqpProcessor(data_directory=config["data_directory"], nrows=config["nrows"])
tokenization.validate_case_matches_checkpoint(do_lower_case=config["do_lower_case"], init_checkpoint=None)
bert_config = modeling.BertConfig.from_json_file(config["bert_config_filepath"])
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file=config["vocab_filepth"], do_lower_case=config["do_lower_case"])
run_config = tf.contrib.tpu.RunConfig(
model_dir=config["qqp_model_directory"],
save_checkpoints_steps=1000,
)
train_examples = processor.get_train_examples(config["data_directory"])
num_train_steps = int(len(train_examples) / config["train_batch_size"] * config["num_train_epochs"])
num_warmup_steps = int(num_train_steps * config["warmup_proportion"])
model_fn = run_classifier.model_fn_builder(
bert_config=bert_config,
num_labels=len(label_list),
init_checkpoint=config["init_checkpoint"],
learning_rate=config["learning_rate"],
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
use_tpu=False,
use_one_hot_embeddings=False
)
estimator = tf.contrib.tpu.TPUEstimator( # TPU使わない場合自動的にGPUかCPUに切り替わる
use_tpu=False,
model_fn=model_fn,
config=run_config,
train_batch_size=config["train_batch_size"],
eval_batch_size=config["eval_batch_size"],
predict_batch_size=config["predict_batch_size"]
)
# Training
train_file = os.path.join(config["output_directory"], "train.tf_record")
run_classifier.file_based_convert_examples_to_features(train_examples, label_list, config["max_seq_length"], tokenizer, train_file)
train_input_fn = run_classifier.file_based_input_fn_builder(
input_file=train_file,
seq_length=config["max_seq_length"],
is_training=True,
drop_remainder=True
)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
# Prediction
predict_examples = processor.get_test_examples(config["data_directory"])
num_actual_predict_examples = len(predict_examples)
predict_file = os.path.join(config["output_directory"], "predict.tf_record")
run_classifier.file_based_convert_examples_to_features(
predict_examples,
label_list,
config["max_seq_length"],
tokenizer,
predict_file
)
predict_drop_remainder = False
predict_input_fn = run_classifier.file_based_input_fn_builder(
input_file=predict_file,
seq_length=config["max_seq_length"],
is_training=False,
drop_remainder=predict_drop_remainder
)
result = estimator.predict(input_fn=predict_input_fn)
probabilities = np.array([prediction["probabilities"][1] for (i, prediction) in enumerate(result)])
processor._test_df["is_duplicate"] = probabilities
processor._test_df[["is_duplicate", "test_id"]].to_csv("submission.csv", index=False)
pd.read_csv("submission.csv").head()
結果
このコンペは同じ質問が何度も使われており,使われる頻度が重要なヒントになっています.
そのため,BERT単体では,
- 学習時間: 3〜4時間
- 予測時間: 3時間
- Private: 0.33466
- Public: 0.32676
となり,あまり良い性能は出ない状況です.
見た感じ,Private: 0.12988となっている24位の解法がシンプルで,LSTM部分をBERTに置き換えてグラフ構造的な特徴量を組み込んだネットワークを作れば良さそう
まとめ
- 質問の利用頻度などグラフ構造から得られる特徴量が重要なので,BERT単体では性能は出せない模様
-
extract_features.py
形式で特徴量が出せるので,特徴を抽出してネットワークを構築する必要あり.(もう少々お待ち下さい...) - 24位の解法がシンプルなので,これベースに作っていったら良さそうです.