LoginSignup
9
6

More than 5 years have passed since last update.

BERTでKaggleの過去問Quora Question Pairsを解いてみる

Last updated at Posted at 2018-12-21

現在作成中です...もう少々お待ち下さい...

タスク

開発環境

  • GPU: GTX-1080 Ti(多分,メモリが10GB以上必要)
  • TPU: 利用してない
  • 動作しない場合はtrain_batch_sizeの値を小さくしてください.
  • BERT Model: (Base版でLarge版ではないです)

前準備

  • コンペのデータセットのダウンロード
  • bertのコードと学習済みモデルのダウンロード
mkdir input
kaggle competitions download -c quora-question-pairs -p input
unzip "input/*.zip" -d input
git clone https://github.com/google-research/bert.git
wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
unzip uncased_L-12_H-768_A-12.zip

コード

# coding: utf-8
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

sys.path.append("./bert")
import modeling
import optimization
import run_classifier
import tokenization

DEBUG_MODE = False
config = {
    "qqp_model_directory": "/tmp/bert_model",
    "bert_config_filepath": "./uncased_L-12_H-768_A-12/bert_config.json",
    "vocab_filepth": "./uncased_L-12_H-768_A-12/vocab.txt",
    "input_directory": "./input",
    "data_directory": "./data",
    "output_directory": "./output",
    "init_checkpoint": None,
    "do_lower_case": True,
    "max_seq_length": 128,
    "train_batch_size": 32,
    "eval_batch_size": 8,
    "predict_batch_size": 8,
    "learning_rate": 5e-5,
    "num_train_epochs": 3.0,
    "warmup_proportion": 0.1,
    "save_checkpoints_steps": 1000,
    "nrows": (2000 if DEBUG_MODE else None),
}

for directory in [config["data_directory"], config["output_directory"], config["qqp_model_directory"]]:
    if not os.path.exists(directory):
        os.makedirs(directory)

# 壊れたデータの修正

train_df = pd.read_csv(os.path.join(config["input_directory"], "train.csv"), encoding="utf-8")
test_df = pd.read_csv(os.path.join(config["input_directory"], "test.csv"), encoding="utf-8")

def remove(df, on):
    # cf. Test data: https://www.kaggle.com/c/quora-question-pairs/discussion/59035
    valid_ids =[type(x)==int for x in df[on]]
    return df[valid_ids].drop_duplicates()

def fillna(df, value=""):
    df["question1"].fillna(value, inplace=True)
    df["question2"].fillna(value, inplace=True)
    return df

train_df = fillna(remove(train_df, on="id"), value="__NA__")
test_df = fillna(remove(test_df, on="test_id"), value="__NA__")

train_df.to_csv(os.path.join(config["data_directory"], "train.csv"), encoding="utf-8", index=False)
test_df.to_csv(os.path.join(config["data_directory"], "test.csv"), encoding="utf-8", index=False)

# Bert用に変換

class QqpProcessor(object): # QQP: Quora Question Pairs
    def __init__(self, data_directory, nrows=None, random_state=777):
        self._all_train_df = pd.read_csv(os.path.join(data_directory, "train.csv"), nrows=nrows, encoding="utf-8")
        self._train_df, self._dev_df = train_test_split(self._all_train_df, random_state=777)
        self._test_df = pd.read_csv(os.path.join(data_directory, "test.csv"), nrows=nrows, encoding="utf-8")
        self._test_df["id"] = self._test_df["test_id"]

    def get_train_examples(self, data_dir):
        return self._create_examples(self._train_df, "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(self._dev_df, "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(self._test_df, "test")

    def get_labels(self):
        return ["0", "1"]

    def _create_examples(self, df, set_type):
        def create_example(row):
            guid = row["id"]
            text_a = tokenization.convert_to_unicode(row["question1"])
            text_b = tokenization.convert_to_unicode(row["question2"])
            if set_type == "test":
                label = "0"
            else: # train, dev
                label = tokenization.convert_to_unicode(str(row["is_duplicate"]))
            return run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
        return df.apply(create_example, axis=1).values.tolist()

processor = QqpProcessor(data_directory=config["data_directory"], nrows=config["nrows"])
tokenization.validate_case_matches_checkpoint(do_lower_case=config["do_lower_case"], init_checkpoint=None)
bert_config = modeling.BertConfig.from_json_file(config["bert_config_filepath"])
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(vocab_file=config["vocab_filepth"], do_lower_case=config["do_lower_case"])
run_config = tf.contrib.tpu.RunConfig(
    model_dir=config["qqp_model_directory"],
    save_checkpoints_steps=1000,
)
train_examples = processor.get_train_examples(config["data_directory"])
num_train_steps = int(len(train_examples) / config["train_batch_size"] * config["num_train_epochs"])
num_warmup_steps = int(num_train_steps * config["warmup_proportion"])

model_fn = run_classifier.model_fn_builder(
    bert_config=bert_config,
    num_labels=len(label_list),
    init_checkpoint=config["init_checkpoint"],
    learning_rate=config["learning_rate"],
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=False
)

estimator = tf.contrib.tpu.TPUEstimator( # TPU使わない場合自動的にGPUかCPUに切り替わる
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=config["train_batch_size"],
    eval_batch_size=config["eval_batch_size"],
    predict_batch_size=config["predict_batch_size"]
)

# Training

train_file = os.path.join(config["output_directory"], "train.tf_record")
run_classifier.file_based_convert_examples_to_features(train_examples, label_list, config["max_seq_length"], tokenizer, train_file)
train_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=train_file,
    seq_length=config["max_seq_length"],
    is_training=True,
    drop_remainder=True
)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

# Prediction

predict_examples = processor.get_test_examples(config["data_directory"])
num_actual_predict_examples = len(predict_examples)
predict_file = os.path.join(config["output_directory"], "predict.tf_record")
run_classifier.file_based_convert_examples_to_features(
    predict_examples, 
    label_list,                                    
    config["max_seq_length"], 
    tokenizer,                                    
    predict_file
)
predict_drop_remainder = False
predict_input_fn = run_classifier.file_based_input_fn_builder(
    input_file=predict_file,
    seq_length=config["max_seq_length"],
    is_training=False,
    drop_remainder=predict_drop_remainder
)

result = estimator.predict(input_fn=predict_input_fn)
probabilities = np.array([prediction["probabilities"][1] for (i, prediction) in enumerate(result)])
processor._test_df["is_duplicate"] = probabilities
processor._test_df[["is_duplicate", "test_id"]].to_csv("submission.csv", index=False)
pd.read_csv("submission.csv").head()

結果

このコンペは同じ質問が何度も使われており,使われる頻度が重要なヒントになっています.
そのため,BERT単体では,

  • 学習時間: 3〜4時間
  • 予測時間: 3時間
  • Private: 0.33466
  • Public: 0.32676

となり,あまり良い性能は出ない状況です.

見た感じ,Private: 0.12988となっている24位の解法がシンプルで,LSTM部分をBERTに置き換えてグラフ構造的な特徴量を組み込んだネットワークを作れば良さそう

まとめ

  • 質問の利用頻度などグラフ構造から得られる特徴量が重要なので,BERT単体では性能は出せない模様
  • extract_features.py形式で特徴量が出せるので,特徴を抽出してネットワークを構築する必要あり.(もう少々お待ち下さい...)
  • 24位の解法がシンプルなので,これベースに作っていったら良さそうです.
9
6
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
9
6