Using a BERT Model with Libtorch

Posted at 2020-02-25

Installing the Python modules (CPU only)

$ pip3 install -U pip
$ pip install -U virtualenv
$ pip install tensorflow-cpu==2.1.0 -f https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow_cpu-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl
$ pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
$ pip install -U transformers

Verify the installation

$ pip list |grep -E "tensorflow|torch|transformers"
tensorflow-cpu       2.1.0
tensorflow-estimator 2.1.0
torch                1.4.0+cpu
torchvision          0.5.0+cpu
transformers         2.5.0
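
You can also confirm the versions from inside Python. A minimal check:

# Print the installed versions as a quick sanity check.
import tensorflow as tf
import torch
import transformers

print("tensorflow  :", tf.__version__)
print("torch       :", torch.__version__)
print("transformers:", transformers.__version__)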

Converting the BERT model from TensorFlow to PyTorch format

$ wget ***/Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ unzip ./Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ mv Japanese_L-12_H-768_A-12_E-30_BPE_WWM bert

$ transformers-cli convert \
    --model_type bert \
    --tf_checkpoint /opt/model/bert/model.ckpt \
    --config /opt/model/bert/bert_config.json \
    --pytorch_dump_output /opt/model/bert/pytorch_model.bin

A converted pytorch_model.bin is already provided with the model, so the conversion above is recorded only as a memo; you do not need to run it.
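
If you prefer to do the conversion from Python rather than the CLI, transformers exposes the same conversion routine. A minimal sketch using the paths from the command above:

# Convert a TensorFlow BERT checkpoint to a PyTorch state dict
# (equivalent to the transformers-cli invocation above).
from transformers.convert_bert_original_tf_checkpoint_to_pytorch import (
    convert_tf_checkpoint_to_pytorch,
)

convert_tf_checkpoint_to_pytorch(
    "/opt/model/bert/model.ckpt",         # --tf_checkpoint
    "/opt/model/bert/bert_config.json",   # --config
    "/opt/model/bert/pytorch_model.bin",  # --pytorch_dump_output
)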

Converting the model so it can be loaded from TorchScript

bert_config.json
{
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": true,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 32006
}

Set torchscript to true in bert_config.json, as shown above.
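
Instead of editing the JSON by hand, the flag can also be set in code when the config is loaded. A minimal sketch:

# Load the config and enable TorchScript-compatible outputs in code.
from transformers import BertConfig

config = BertConfig.from_json_file("/opt/model/bert/bert_config.json")
config.torchscript = True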

trace.py
# -*- coding: utf-8 -*-

from transformers import BertForMaskedLM, BertTokenizer, BertConfig
import torch

enc = BertTokenizer(
    "/opt/model/bert/vocab.txt", 
    do_lower_case=False, 
    do_basic_tokenize=False
)

# Tokenizing input text
text = "[CLS] 吾輩 は 猫 である 。 [SEP] 名前 は まだ 無い 。 [SEP]"
tokenized_text = enc.tokenize(text)

# Masking one of the input tokens
masked_index = 7
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# Creating a dummy input
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
dummy_input = [tokens_tensor, segments_tensors]

# Loading the config (torchscript is set to true in bert_config.json above)
config = BertConfig.from_json_file(
    "/opt/model/bert/bert_config.json"
)

# Instantiating the model with the pretrained weights
model = BertForMaskedLM.from_pretrained(
    "/opt/model/bert/pytorch_model.bin",
    config=config
)

# The model needs to be in evaluation mode before tracing
model.eval()

# Creating the trace
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
torch.jit.save(traced_model, "traced_bert.pt")

Because this is a fill-in-the-blank task, the model is loaded as BertForMaskedLM so that the traced model includes the masked-LM head.

Run the conversion script to produce the TorchScript model

$ python ./trace.py
$ ls -lh 
519M traced_bert.pt
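
Before moving to C++, it is worth sanity-checking the traced model from Python. A minimal sketch that mirrors trace.py (note the traced module takes the same two tensor inputs it was traced with):

# Load the traced model and predict the masked token.
import torch
from transformers import BertTokenizer

enc = BertTokenizer(
    "/opt/model/bert/vocab.txt",
    do_lower_case=False,
    do_basic_tokenize=False
)

text = "[CLS] 吾輩 は 猫 である 。 [SEP] 名前 は まだ 無い 。 [SEP]"
tokenized_text = enc.tokenize(text)
masked_index = 7
tokenized_text[masked_index] = '[MASK]'

tokens_tensor = torch.tensor([enc.convert_tokens_to_ids(tokenized_text)])
segments_tensors = torch.tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])

traced = torch.jit.load("traced_bert.pt")
traced.eval()

with torch.no_grad():
    # First output: masked-LM scores with shape [1, seq_len, vocab_size]
    predictions = traced(tokens_tensor, segments_tensors)[0]

_, top5 = predictions[0, masked_index].topk(5)
print(enc.convert_ids_to_tokens(top5.tolist()))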

Trying a fill-in-the-blank task

bert-sample/
    CMakeLists.txt
    main.cc
    unilib/
        CMakeLists.txt
            :
            :
            :
$ mkdir bert-sample/
$ git clone https://github.com/kamalkraj/BERT-NER.git
$ cd ./BERT-NER/cpp-app
$ cp -r unilib ../../bert-sample/
$ cd ../../bert-sample/
CMakeLists.txt
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)

project(bert-sample)

set (bert-sample_VERSION_MAJOR 0)
set (bert-sample_VERSION_MINOR 1)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

find_package(Torch REQUIRED)

add_executable(sample-bin ${CMAKE_CURRENT_SOURCE_DIR}/main.cc)
add_dependencies(sample-bin unilib-static)
target_link_libraries(sample-bin ${TORCH_LIBRARIES} unilib-static)
set_target_properties(sample-bin PROPERTIES OUTPUT_NAME sample)

add_subdirectory(unilib)
unilib/CMakeLists.txt
add_library(unilib-static STATIC 
    ${CMAKE_CURRENT_SOURCE_DIR}/unicode.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/uninorms.cpp)

target_include_directories(unilib-static PRIVATE ${CMAKE_SOURCE_DIR}/unilib)
target_link_libraries(unilib-static ${TORCH_LIBRARIES})

main.cc
#include <iostream>
#include <vector>
#include <string>

#include <torch/script.h>
#include "unilib/tokenizer.h"

int main(int argc, char **argv)
{
    // Load the traced model and switch to inference mode (no autograd needed)
    auto module = torch::jit::load("/opt/model/bert/traced_bert.pt");
    torch::NoGradGuard no_grad;
    module.eval();

    BertTokenizer tokenizer;
    tokenizer.add_vocab("/opt/model/bert/vocab.txt");

    std::vector<std::string> tokens = tokenizer.tokenize("[CLS] 私 が 飼って いる [MASK] の 名前 は ちょむスケ です [SEP]");
    int masked_index = 5;

    std::vector<float> input_ids = tokenizer.convert_tokens_to_ids(tokens);
    std::vector<float> input_mask = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
    int length = 12;

    // Two inputs, matching the traced signature: token ids and the input mask
    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(torch::from_blob(input_ids.data(), {1, length}).to(torch::kInt64));
    inputs.push_back(torch::from_blob(input_mask.data(), {1, length}).to(torch::kInt64));

    auto elms = module.forward(inputs).toTuple()->elements();
    // First tuple element: masked-LM scores with shape [1, seq_len, vocab_size]
    auto tensor = elms[0].toTensor();
    int64_t k = 5;
    // topk returns (values, indices); keep the indices of the k best token ids
    auto result = std::get<1>(tensor[0][masked_index].topk(k, -1, true, true));

    for (int64_t idx = 0; idx < k; idx++) {
        int id = result[idx].item<int>();
        std::cout << id << " : " << tokenizer.ids_to_tokens.at(id) << "\n";
    }

    return 0;
}
Point CMAKE_PREFIX_PATH at your libtorch (or installed PyTorch) so that find_package(Torch) succeeds:

$ mkdir build
$ cd build
$ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch ..
$ make -j
$ ./sample
4817 : 猫
2099 : 犬
15534 : ウサギ
7589 : ペット
1017 : 動物

Appendix

Installing Juman++ (with the dictionary model)

$ wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
$ tar Jxvf jumanpp-2.0.0-rc3.tar.xz
$ cd jumanpp-2.0.0-rc3
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install

Juman++ libraries

$ sudo apt-get install -y libeigen3-dev pegtl-dev

$ wget https://github.com/Quintus/pathie-cpp/archive/v0.1.1.tar.gz
$ tar xzvf v0.1.1.tar.gz
$ cd pathie-cpp-0.1.1
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install

$ git clone https://github.com/YujiroTakahashi/libjumanpp.git
$ cd libjumanpp
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j
$ sudo make install
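
The BERT model used here expects its input pre-segmented into morphemes separated by spaces (as in the sample sentences above), which is what Juman++ produces. A minimal sketch of calling it from Python via subprocess (assuming jumanpp is on PATH and your build supports the --segment option):

# Segment a sentence into space-separated morphemes with Juman++.
import subprocess

def segment(text):
    # --segment is assumed to emit wakati-style (space-separated) output
    result = subprocess.run(
        ["jumanpp", "--segment"],
        input=text,
        stdout=subprocess.PIPE,
        universal_newlines=True,
        check=True,
    )
    return result.stdout.strip()

print(segment("私が飼っている犬の名前はアンシーです"))
# -> e.g. 私 が 飼って いる 犬 の 名前 は アンシー です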

Extracting feature vectors

For feature extraction, trace BertModel rather than BertForMaskedLM, so that the first output of the traced module is the final hidden states instead of vocabulary logits.

main.cc
#include <iostream>
#include <vector>
#include <string>

#include <torch/script.h>
#include "unilib/tokenizer.h"

int main(int argc, char **argv)
{
    torch::jit::script::Module module = torch::jit::load("/opt/model/bert/traced_bert.pt");
    torch::NoGradGuard no_grad_guard{};
    module.eval();

    BertTokenizer tokenizer;
    tokenizer.add_vocab("/opt/model/bert/vocab.txt");

    std::vector<std::string> tokens = tokenizer.tokenize("[CLS] 私 が 飼って いる 犬 の 名前 は アンシー です [SEP]");

    std::vector<float> input_ids = tokenizer.convert_tokens_to_ids(tokens);
    std::vector<float> input_mask = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
    int length = 12;

    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(torch::from_blob(input_ids.data(), {1, length}).to(torch::kInt64));
    inputs.push_back(torch::from_blob(input_mask.data(), {1, length}).to(torch::kInt64));

    // First output of the traced BertModel: final hidden states [1, seq_len, hidden_size]
    auto all_encoder_layers = module.forward(inputs).toTuple()->elements()[0];
    // Mean-pool over the token axis to obtain a single 768-dim sentence vector
    auto embedding = all_encoder_layers.toTensor()[0].mean(-2);

    // Copy the 768-dim vector out of the (contiguous) result tensor
    float *x = embedding.data_ptr<float>();
    std::vector<float> encode(x, x + 768);
    for (auto &val : encode) {
        std::cout << val << ", ";
    }
    std::cout << std::endl;

    return 0;
}

The equivalent vector extraction in Python

with torch.no_grad():
    all_encoder_layers, _ = model(tokens_tensor)

embedding = all_encoder_layers[0].mean(-2)
vals = embedding.numpy()

for val in vals:
    print(val, end=', ')
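
As a usage sketch, two sentence vectors extracted this way can be compared with cosine similarity (vec_a and vec_b below are hypothetical 768-dim vectors obtained by the extraction code above):

# Cosine similarity between two mean-pooled BERT sentence vectors.
import numpy as np

def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# vec_a, vec_b: 768-dim numpy vectors from the extraction above (hypothetical)
# print(cosine_similarity(vec_a, vec_b))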

Reference: PyTorch: torch.mean
