More than 3 years have passed since last update.

LibtorchでBERTモデルを使う

Last updated at 2020-11-23Posted at 2020-02-25

Pythonモジュールのインストール(CPU Only)

$ pip3 install -U pip
$ pip install -U virtualenv
$ pip install tensorflow-cpu==2.2.0 -f https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow_cpu-2.2.0-cp36-cp36m-manylinux2010_x86_64.whl
$ pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
$ pip install -U transformers

確認

$ pip list |grep -E "tensorflow|torch|transformers"
tensorflow-cpu       2.1.0
tensorflow-estimator 2.1.0
torch                1.4.0+cpu
torchvision          0.5.0+cpu
transformers         2.5.0

BERTモデルをTensorflowからPyTorchフォーマットへ変換

$ wget ***/Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ unzip ./Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ mv Japanese_L-12_H-768_A-12_E-30_BPE_WWM bert

$ transformers-cli convert \
    --model_type bert \
    --tf_checkpoint /opt/model/bert/model.ckpt \
    --config /opt/model/bert/bert_config.json \
    --pytorch_dump_output /opt/model/bert/pytorch_model.bin

pytorch_model.bin のファイルは予め用意してくれています。
上記処理は備忘録なので実行の必要はありません。

Torchスクリプトから読み込めるようにモデルを変換

bert_config.json

{
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": true,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 32006
}

torchscript を true にします。

trace.py

# -*- coding: utf-8 -*-

from transformers import BertModel, BertForMaskedLM, BertTokenizer, BertConfig
import torch

# Converts the PyTorch BERT checkpoint under /opt/model/bert into a TorchScript
# file (traced_bert.pt) by tracing BertForMaskedLM with one example input.

# Tokenizer over the model vocabulary. Lower-casing and basic tokenization are
# switched off; the sample text below is already separated by spaces.
enc = BertTokenizer(
    "/opt/model/bert/vocab.txt",
    do_lower_case=False,
    do_basic_tokenize=False
)

# Tokenizing input text
text = "[CLS] 吾輩 は 猫 である 。 [SEP] 名前 は まだ 無い 。 [SEP]"
tokenized_text = enc.tokenize(text)

# Masking one of the input tokens
masked_index = 7
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)

# Creating a dummy input.
# The traced module is called positionally, and the second positional argument
# of BertForMaskedLM.forward is the attention mask (main.cc passes its
# input_mask in that slot). The mask is derived from the token tensor so both
# tensors always have the same shape, however the text was tokenized.
tokens_tensor = torch.tensor([indexed_tokens])
attention_mask = torch.ones_like(tokens_tensor)
dummy_input = (tokens_tensor, attention_mask)

# Loading the configuration; bert_config.json sets "torchscript": true,
# which is what makes the model traceable.
config = BertConfig.from_json_file(
    "/opt/model/bert/bert_config.json"
)

# Loading the pretrained weights into the masked-LM model that will be traced
model = BertForMaskedLM.from_pretrained(
    "/opt/model/bert/pytorch_model.bin",
    config=config
)

# The model needs to be in evaluation mode
model.eval()

# Creating the trace and writing it to the current directory
traced_model = torch.jit.trace(model, dummy_input)
torch.jit.save(traced_model, "traced_bert.pt")

穴埋めをするので BertForMaskedLM を追加してモデルをマスクモードにしています。

変換用スクリプトを実行してTorch Script形式へ変換

$ python ./trace.py
$ ls -lh 
519M traced_bert.pt

穴埋め問題にチャレンジ

bert-sample/
    CMakeLists.txt
    main.cc
    unilib/
        CMakeLists.txt
            :
            :
            :

$ mkdir bert-sample/
$ git clone https://github.com/kamalkraj/BERT-NER.git
$ cd ./BERT-NER/cpp-app
$ cp -r unilib ../../bert-sample/
$ cd ../../bert-sample/

CMakeLists.txt

# Build script for the bert-sample demo.
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)

project(bert-sample)

# Version of this sample: 0.1
set(bert-sample_VERSION_MAJOR 0)
set(bert-sample_VERSION_MINOR 1)

# Require C++14 with compiler extensions disabled.
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Locate libtorch; TORCH_LIBRARIES is used by the targets below.
find_package(Torch REQUIRED)

# Static library defined in the unilib/ subdirectory (target: unilib-static).
add_subdirectory(unilib)

# Demo executable built from main.cc; the produced binary is named "sample".
add_executable(sample-bin main.cc)
set_target_properties(sample-bin PROPERTIES OUTPUT_NAME sample)
add_dependencies(sample-bin unilib-static)
target_link_libraries(sample-bin ${TORCH_LIBRARIES} unilib-static)

unilib/CMakeLists.txt

# Static library built from the two sources in this directory.
add_library(unilib-static STATIC unicode.cpp uninorms.cpp)

# Headers are looked up in <top-level source dir>/unilib while compiling it.
target_include_directories(unilib-static PRIVATE ${CMAKE_SOURCE_DIR}/unilib)

# Link against the libraries reported by find_package(Torch).
target_link_libraries(unilib-static ${TORCH_LIBRARIES})

main.cc

# include <algorithm>
# include <cstdint>
# include <exception>
# include <iostream>
# include <string>
# include <vector>

# include <torch/script.h>
# include "unilib/tokenizer.h"

/**
 * @brief Fill-in-the-blank demo: predicts the token hidden behind [MASK].
 *
 * Loads the traced BERT module and the vocabulary from /opt/model/bert,
 * tokenizes a fixed sentence, runs one forward pass and prints the five
 * highest-scoring vocabulary entries for the masked position, one per line,
 * as "id : token".
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return 0 on success, 1 if the masked position lies outside the tokenized
 *         input or if any step throws.
 */
int main(int argc, char **argv)
{
    try {
        auto module = torch::jit::load("/opt/model/bert/traced_bert.pt");
        module.eval();

        BertTokenizer tokenizer;
        tokenizer.add_vocab("/opt/model/bert/vocab.txt");

        std::vector<std::string> tokens = tokenizer.tokenize("[CLS] 私 が 飼って いる [MASK] の 名前 は ちょむスケ です [SEP]");

        // Position of the token to predict. Prefer the real position of
        // "[MASK]" in the tokenizer output, because sub-word splitting can
        // shift it; keep the position in the space-separated sentence (5) as
        // the fallback when the literal token is not present in the output.
        int64_t masked_index = 5;
        const auto mask_pos = std::find(tokens.begin(), tokens.end(), "[MASK]");
        if (mask_pos != tokens.end()) {
            masked_index = static_cast<int64_t>(mask_pos - tokens.begin());
        }

        std::vector<float> input_ids = tokenizer.convert_tokens_to_ids(tokens);

        // Size everything from the ids actually produced, so from_blob never
        // reads past the buffer and never drops trailing tokens.
        const int64_t length = static_cast<int64_t>(input_ids.size());
        if (masked_index >= length) {
            std::cerr << "error: masked position " << masked_index
                      << " is outside the tokenized input (" << length << " tokens)\n";
            return 1;
        }
        std::vector<float> input_mask(input_ids.size(), 1.0f);

        // from_blob only wraps the vectors' memory; .to(kInt64) then produces
        // the integer tensors handed to the module.
        std::vector<torch::jit::IValue> inputs;
        inputs.push_back(torch::from_blob(input_ids.data(), {1, length}).to(torch::kInt64));
        inputs.push_back(torch::from_blob(input_mask.data(), {1, length}).to(torch::kInt64));

        // First element of the output tuple, indexed as [batch][position].
        auto elms = module.forward(inputs).toTuple()->elements();
        auto tensor = elms[0].toTensor();

        // Indices of the k largest values along the last dimension, sorted.
        const int64_t k = 5;
        auto result = std::get<1>(tensor[0][masked_index].topk(k, -1, true, true));

        for (int64_t idx = 0; idx < k; ++idx) {
            int id = result[idx].item<int>();
            std::cout << id << " : " << tokenizer.ids_to_tokens.at(id) << "\n";
        }
    } catch (const std::exception &e) {
        // Covers failures thrown while loading the model, running it, or
        // looking up an id in ids_to_tokens.
        std::cerr << "error: " << e.what() << "\n";
        return 1;
    }

    return 0;
}

$ mkdir build
$ cd build
$ cmake ..
$ make -j
$ ./sample 
4817 : 猫
2099 : 犬
15534 : ウサギ
7589 : ペット
1017 : 動物

付録

Juman++のインストール　辞書モデル付き

$ wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
$ tar Jxvf jumanpp-2.0.0-rc3.tar.xz
$ cd jumanpp-2.0.0-rc3
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install

Juman++ライブラリ

$ sudo apt-get install -y libeigen3-dev pegtl-dev

$ wget https://github.com/Quintus/pathie-cpp/archive/v0.1.1.tar.gz
$ tar xzvf v0.1.1.tar.gz
$ cd pathie-cpp-0.1.1
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install

$ git clone https://github.com/YujiroTakahashi/libjumanpp.git
$ cd libjumanpp
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j
$ sudo make install

特徴ベクトルの抽出