Using a BERT Model with Libtorch

Installing the Python modules (CPU only)

$ pip3 install -U pip
$ pip install -U virtualenv
$ pip install tensorflow-cpu==2.2.0 -f https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow_cpu-2.2.0-cp36-cp36m-manylinux2010_x86_64.whl
$ pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
$ pip install transformers==2.5.0

Verify the installation

$ pip list |grep -E "tensorflow|torch|transformers"
tensorflow-cpu       2.2.0
tensorflow-estimator 2.2.0
torch                1.5.1+cpu
torchvision          0.6.1+cpu
transformers         2.5.0
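
As an extra check (a minimal sketch, not part of the original setup), the packages can be imported to confirm the CPU builds work:

import torch
import transformers

print(torch.__version__)          # e.g. 1.5.1+cpu
print(transformers.__version__)   # e.g. 2.5.0
print(torch.cuda.is_available())  # False is expected for the +cpu wheel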

Converting the BERT model from TensorFlow to the PyTorch format

$ wget ***/Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ unzip ./Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ mv Japanese_L-12_H-768_A-12_E-30_BPE_WWM bert

$ transformers-cli convert \
    --model_type bert \
    --tf_checkpoint /opt/model/bert/model.ckpt \
    --config /opt/model/bert/bert_config.json \
    --pytorch_dump_output /opt/model/bert/pytorch_model.bin

The pytorch_model.bin file is provided with the model, so the conversion above is recorded only as a memo; you do not need to run it.
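
If you do run the conversion yourself, a quick sanity check (a sketch, using the same path as above) is to load the resulting checkpoint and inspect its contents:

import torch

# pytorch_model.bin is a plain state dict, so it can be inspected directly
state_dict = torch.load("/opt/model/bert/pytorch_model.bin", map_location="cpu")
print(len(state_dict))         # number of weight tensors
print(next(iter(state_dict)))  # name of the first parameter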

Converting the model so it can be loaded from TorchScript

bert_config.json
{
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": true,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 32006
}

Set torchscript to true.
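
The flag can also be set from Python instead of editing bert_config.json by hand (a small sketch using the same path as below):

from transformers import BertConfig

config = BertConfig.from_json_file("/opt/model/bert/bert_config.json")
config.torchscript = True  # makes the model traceable: outputs become tuples and tied weights are cloned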

trace.py
# -*- coding: utf-8 -*-

from transformers import BertForMaskedLM, BertTokenizer, BertConfig
import torch

enc = BertTokenizer(
    "/opt/model/bert/vocab.txt", 
    do_lower_case=False, 
    do_basic_tokenize=False
)

# Tokenizing input text
text = "[CLS] 吾輩 は 猫 である 。 [SEP] 名前 は まだ 無い 。 [SEP]"
tokenized_text = enc.tokenize(text)

# Masking one of the input tokens
masked_index = 7
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# Creating a dummy input
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
dummy_input = [tokens_tensor, segments_tensors]

# Loading the config; the torchscript flag was set to true above
config = BertConfig.from_json_file(
    "/opt/model/bert/bert_config.json"
)

# Instantiating the model from the converted checkpoint
model = BertForMaskedLM.from_pretrained(
    "/opt/model/bert/pytorch_model.bin",
    config=config
)

# The model needs to be in evaluation mode for tracing
model.eval()

# Creating the trace
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
torch.jit.save(traced_model, "traced_bert.pt")

Because we are going to fill in a masked token, the model is loaded as BertForMaskedLM so that it has a masked-language-model head.

Run the conversion script to produce the TorchScript model

$ python ./trace.py
$ ls -lh 
519M traced_bert.pt
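
Before moving on to C++, the traced model can be loaded back in Python as a quick check (a sketch that reuses enc, tokens_tensor, segments_tensors and masked_index from trace.py above):

import torch

loaded = torch.jit.load("traced_bert.pt")
loaded.eval()

with torch.no_grad():
    # the traced module takes the same two tensors used at trace time
    predictions = loaded(tokens_tensor, segments_tensors)[0]

# top-5 candidates for the masked position
top5 = torch.topk(predictions[0, masked_index], k=5).indices.tolist()
print(enc.convert_ids_to_tokens(top5))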

Trying a fill-in-the-blank task

bert-sample/
    CMakeLists.txt
    main.cc
    unilib/
        CMakeLists.txt
            :
            :
            :
$ mkdir bert-sample/
$ git clone https://github.com/kamalkraj/BERT-NER.git
$ cd ./BERT-NER/cpp-app
$ cp -r unilib ../../bert-sample/
$ cd ../../bert-sample/
CMakeLists.txt
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)

project(bert-sample)

set(bert-sample_VERSION_MAJOR 0)
set(bert-sample_VERSION_MINOR 1)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

find_package(Torch REQUIRED)

# unilib must be added before it is named as a dependency below
add_subdirectory(unilib)

add_executable(sample-bin ${CMAKE_CURRENT_SOURCE_DIR}/main.cc)
add_dependencies(sample-bin unilib-static)
target_link_libraries(sample-bin ${TORCH_LIBRARIES} unilib-static)
set_target_properties(sample-bin PROPERTIES OUTPUT_NAME sample)
unilib/CMakeLists.txt
add_library(unilib-static STATIC 
    ${CMAKE_CURRENT_SOURCE_DIR}/unicode.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/uninorms.cpp)

target_include_directories(unilib-static PRIVATE ${CMAKE_SOURCE_DIR}/unilib)
target_link_libraries(unilib-static ${TORCH_LIBRARIES})

main.cc
#include <iostream>
#include <vector>
#include <string>

#include <torch/script.h>
#include "unilib/tokenizer.h"

int main(int argc, char **argv)
{
    // Load the traced model and switch to evaluation mode
    auto module = torch::jit::load("/opt/model/bert/traced_bert.pt");
    module.eval();

    BertTokenizer tokenizer;
    tokenizer.add_vocab("/opt/model/bert/vocab.txt");

    // The input is already segmented; token 5 is the [MASK] to fill in
    std::vector<std::string> tokens = tokenizer.tokenize("[CLS] 私 が 飼って いる [MASK] の 名前 は ちょむスケ です [SEP]");
    int masked_index = 5;

    std::vector<float> input_ids = tokenizer.convert_tokens_to_ids(tokens);
    std::vector<float> input_mask = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
    int length = 12;

    // Build the (1, length) int64 tensors the traced model expects
    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(torch::from_blob(input_ids.data(), {1, length}).to(torch::kInt64));
    inputs.push_back(torch::from_blob(input_mask.data(), {1, length}).to(torch::kInt64));

    // Forward pass; element 0 of the output tuple holds the prediction scores
    auto elms = module.forward(inputs).toTuple()->elements();
    auto tensor = elms[0].toTensor();

    // Top-5 vocabulary ids at the masked position
    int64_t k = 5;
    auto result = std::get<1>(tensor[0][masked_index].topk(k, -1, true, true));

    for (int64_t idx = 0; idx < k; idx++) {
        int id = result[idx].item<int>();
        std::cout << id << " : " << tokenizer.ids_to_tokens.at(id) << "\n";
    }

    return 0;
}

find_package(Torch REQUIRED) needs to be told where libtorch lives, so pass CMAKE_PREFIX_PATH when configuring (for the pip-installed torch, the CMake files are under <site-packages>/torch/share/cmake):

$ mkdir build
$ cd build
$ cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch ..
$ make -j
$ ./sample
4817 : 猫
2099 : 犬
15534 : ウサギ
7589 : ペット
1017 : 動物

Appendix

Installing Juman++ (with the dictionary model)

$ wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
$ tar Jxvf jumanpp-2.0.0-rc3.tar.xz
$ cd jumanpp-2.0.0-rc3
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install
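
This BERT model assumes its input has already been segmented with Juman++, which is why the example sentences in this article contain spaces between words. A minimal sketch of producing that segmentation from Python with the jumanpp command installed above (the helper name is ours):

import subprocess

def jumanpp_wakati(text: str) -> str:
    """Segment `text` into space-separated morphemes using the jumanpp CLI."""
    out = subprocess.run(["jumanpp"], input=text, capture_output=True, text=True)
    surfaces = []
    for line in out.stdout.splitlines():
        if line == "EOS" or line.startswith("@"):  # skip sentence end and ambiguity lines
            continue
        surfaces.append(line.split(" ")[0])        # first field is the surface form
    return " ".join(surfaces)

print(jumanpp_wakati("私が飼っている犬の名前はアンシーです"))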

Juman++ library

$ sudo apt-get install -y libeigen3-dev pegtl-dev

$ wget https://github.com/Quintus/pathie-cpp/archive/v0.1.1.tar.gz
$ tar xzvf v0.1.1.tar.gz
$ cd pathie-cpp-0.1.1
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install

$ git clone https://github.com/YujiroTakahashi/libjumanpp.git
$ cd libjumanpp
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j
$ sudo make install

Extracting feature vectors

main.cc
#include <iostream>
#include <vector>
#include <string>

#include <torch/script.h>
#include "unilib/tokenizer.h"

int main(int argc, char **argv)
{
    // Load the traced model; disable gradients for inference
    torch::jit::script::Module module = torch::jit::load("/opt/model/bert/traced_bert.pt");
    torch::NoGradGuard no_grad_guard{};
    module.eval();

    BertTokenizer tokenizer;
    tokenizer.add_vocab("/opt/model/bert/vocab.txt");

    std::vector<std::string> tokens = tokenizer.tokenize("[CLS] 私 が 飼って いる 犬 の 名前 は アンシー です [SEP]");

    std::vector<float> input_ids = tokenizer.convert_tokens_to_ids(tokens);
    std::vector<float> input_mask = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
    int length = 12;

    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(torch::from_blob(input_ids.data(), {1, length}).to(torch::kInt64));
    inputs.push_back(torch::from_blob(input_mask.data(), {1, length}).to(torch::kInt64));

    // Element 0 of the output tuple is the (1, length, 768) sequence output;
    // averaging over the token axis gives one 768-dim sentence vector
    auto all_encoder_layers = module.forward(inputs).toTuple()->elements()[0];
    auto embedding = all_encoder_layers.toTensor()[0].mean(-2);

    float *x = embedding.data_ptr<float>();
    std::vector<float> encode(x, x + 768);
    for (auto &val : encode) {
        std::cout << val << ", ";
    }
    std::cout << std::endl;

    return 0;
}

The equivalent vector extraction in Python

# gradients are not needed, and numpy() would fail on a tensor that requires grad
with torch.no_grad():
    all_encoder_layers, _ = model(tokens_tensor)

embedding = all_encoder_layers[0].mean(-2)
vals = embedding.numpy()

for val in vals:
    print(val, end=', ')
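
As one example of using the extracted vectors (a sketch; other_vals stands for a hypothetical second sentence processed the same way), sentence similarity can be computed with cosine similarity:

import numpy as np

def cosine_similarity(a, b):
    # cos(a, b) = a.b / (|a| |b|)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# `vals` is the 768-dim mean-pooled vector computed above;
# `other_vals` (hypothetical) would come from a second sentence.
# print(cosine_similarity(vals, other_vals))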

Reference: PyTorch torch.mean
