Pythonモジュールのインストール(CPU Only)
$ pip3 install -U pip
$ pip install -U virtualenv
$ pip install tensorflow-cpu==2.2.0 -f https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow_cpu-2.2.0-cp36-cp36m-manylinux2010_x86_64.whl
$ pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
$ pip install -U transformers
確認(以下は出力例です。表示されるバージョンはインストールした版によって異なります)
$ pip list |grep -E "tensorflow|torch|transformers"
tensorflow-cpu 2.1.0
tensorflow-estimator 2.1.0
torch 1.4.0+cpu
torchvision 0.5.0+cpu
transformers 2.5.0
BERTモデルをTensorflowからPyTorchフォーマットへ変換
$ wget ***/Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ unzip ./Japanese_L-12_H-768_A-12_E-30_BPE_WWM.zip
$ mv Japanese_L-12_H-768_A-12_E-30_BPE_WWM bert
$ transformers-cli convert \
--model_type bert \
--tf_checkpoint /opt/model/bert/model.ckpt \
--config /opt/model/bert/bert_config.json \
--pytorch_dump_output /opt/model/bert/pytorch_model.bin
pytorch_model.bin は配布アーカイブにあらかじめ同梱されています。
上記処理は備忘録なので実行の必要はありません。
Torchスクリプトから読み込めるようにモデルを変換
bert_config.json
{
"architectures": null,
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"do_sample": false,
"eos_token_ids": 0,
"finetuning_task": null,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_range": 0.02,
"intermediate_size": 3072,
"is_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-12,
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_beams": 1,
"num_hidden_layers": 12,
"num_labels": 2,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_past": true,
"pad_token_id": 0,
"pruned_heads": {},
"repetition_penalty": 1.0,
"temperature": 1.0,
"top_k": 50,
"top_p": 1.0,
"torchscript": true,
"type_vocab_size": 2,
"use_bfloat16": false,
"vocab_size": 32006
}
torchscript を true にします。
trace.py
# -*- coding: utf-8 -*-
"""Trace the BERT masked-LM model with example inputs and save it as TorchScript.

Reads the vocabulary, config and PyTorch weights from /opt/model/bert and
writes the traced module to ./traced_bert.pt.
"""
from transformers import BertModel, BertForMaskedLM, BertTokenizer, BertConfig
import torch

enc = BertTokenizer(
    "/opt/model/bert/vocab.txt",
    do_lower_case=False,
    do_basic_tokenize=False
)

# Tokenizing input text (already split into words, separated by spaces)
text = "[CLS] 吾輩 は 猫 である 。 [SEP] 名前 は まだ 無い 。 [SEP]"
tokenized_text = enc.tokenize(text)

# Masking one of the input tokens
masked_index = 7
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# The segment ids are hard-coded, so fail early with a clear message if the
# tokenizer produced a different number of tokens than expected.
if len(segments_ids) != len(indexed_tokens):
    raise ValueError(
        "segments_ids has %d entries but the text was tokenized into %d tokens"
        % (len(segments_ids), len(indexed_tokens))
    )

# Creating the example inputs used for tracing
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
# NOTE(review): the inputs are passed positionally. In transformers 2.x the
# second positional parameter of BertForMaskedLM.forward appears to be
# attention_mask rather than token_type_ids -- confirm which one is intended.
example_inputs = (tokens_tensor, segments_tensors)

# The torchscript flag is read from bert_config.json ("torchscript": true).
config = BertConfig.from_json_file(
    "/opt/model/bert/bert_config.json"
)

# Load the pretrained weights into the masked-LM model. This is the model
# that gets traced, so no separate randomly initialised BertModel is built.
model = BertForMaskedLM.from_pretrained(
    "/opt/model/bert/pytorch_model.bin",
    config=config
)

# The model needs to be in evaluation mode before tracing
model.eval()

# Creating the trace and saving it
traced_model = torch.jit.trace(model, example_inputs)
torch.jit.save(traced_model, "traced_bert.pt")
穴埋め(マスクされたトークンの予測)を行うため、BertForMaskedLM を使ってモデルを読み込んでいます。
変換用スクリプトを実行してTorchScript形式へ変換
$ python ./trace.py
$ ls -lh
519M traced_bert.pt
穴埋め問題にチャレンジ
bert-sample/
CMakeLists.txt
main.cc
unilib/
CMakeLists.txt
:
:
:
$ mkdir bert-sample/
$ cd ..
$ git clone https://github.com/kamalkraj/BERT-NER.git
$ cd ./BERT-NER/cpp-app
$ cp -r unilib ../../bert-sample/
$ cd ../../bert-sample/
CMakeLists.txt
# Top-level build script: builds the `sample` executable from main.cc and
# links it against libtorch and the unilib static library.
cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(bert-sample)

set(bert-sample_VERSION_MAJOR 0)
set(bert-sample_VERSION_MINOR 1)

# Plain C++14 without compiler-specific extensions.
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

find_package(Torch REQUIRED)
# Apply the compiler flags exported by the Torch package (as the libtorch CMake
# instructions recommend) so every target is built with settings compatible
# with the libtorch binaries. Set before add_subdirectory so unilib inherits it.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

# Define unilib-static before the executable refers to it.
add_subdirectory(unilib)

add_executable(sample-bin ${CMAKE_CURRENT_SOURCE_DIR}/main.cc)
add_dependencies(sample-bin unilib-static)
target_link_libraries(sample-bin ${TORCH_LIBRARIES} unilib-static)
# The target is called sample-bin but the produced binary is named `sample`.
set_target_properties(sample-bin PROPERTIES OUTPUT_NAME sample)
unilib/CMakeLists.txt
# Static library built from the unilib sources in this directory.
# NOTE(review): main.cc uses BertTokenizer from unilib/tokenizer.h; if that
# class has its own .cpp implementation file it must be listed here too -- confirm.
add_library(unilib-static STATIC
    ${CMAKE_CURRENT_SOURCE_DIR}/unicode.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/uninorms.cpp)
# Use this directory itself as the include path; unlike CMAKE_SOURCE_DIR this
# stays correct when the project is added as a subdirectory of another build.
target_include_directories(unilib-static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(unilib-static ${TORCH_LIBRARIES})
main.cc
#include <iostream>
#include <vector>
#include <string>
#include <torch/script.h>
#include "unilib/tokenizer.h"
int main(int argc, char **argv)
{
auto module = torch::jit::load("/opt/model/bert/traced_bert.pt");
module.eval();
BertTokenizer tokenizer;
tokenizer.add_vocab("/opt/model/bert/vocab.txt");
std::vector<std::string> tokens = tokenizer.tokenize("[CLS] 私 が 飼って いる [MASK] の 名前 は ちょむスケ です [SEP]");
int masked_index = 5;
std::vector<float> input_ids = tokenizer.convert_tokens_to_ids(tokens);
std::vector<float> input_mask = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
int length = 12;
std::vector<torch::jit::IValue> inputs;
inputs.push_back(torch::from_blob(input_ids.data(), {1, length}).to(torch::kInt64));
inputs.push_back(torch::from_blob(input_mask.data(), {1, length}).to(torch::kInt64));
auto elms = module.forward(inputs).toTuple()->elements();
auto tensor = elms[0].toTensor();
int64_t k = 5;
auto result = std::get<1>(tensor[0][masked_index].topk(k, -1, true, true));
for (int64_t idx = 0; idx < k; idx++) {
int id = result[idx].item<int>();
std::cout << id << " : " << tokenizer.ids_to_tokens.at(id) << "\n";
}
return 0;
}
$ mkdir build
$ cd build
$ cmake ..
$ make -j
$ ./sample
4817 : 猫
2099 : 犬
15534 : ウサギ
7589 : ペット
1017 : 動物
付録
Juman++のインストール 辞書モデル付き
$ wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz
$ tar Jxvf jumanpp-2.0.0-rc3.tar.xz
$ cd jumanpp-2.0.0-rc3
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install
Juman++ライブラリ
$ sudo apt-get install -y libeigen3-dev pegtl-dev
$ wget https://github.com/Quintus/pathie-cpp/archive/v0.1.1.tar.gz
$ tar xzvf v0.1.1.tar.gz
$ cd pathie-cpp-0.1.1
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j $(nproc)
$ sudo make install
$ git clone https://github.com/YujiroTakahashi/libjumanpp.git
$ cd libjumanpp
$ mkdir build
$ cd build
$ cmake -DCMAKE_BUILD_TYPE=Release ..
$ make -j
$ sudo make install
特徴ベクトルの抽出
main.cc
#include <iostream>
#include <vector>
#include <string>
#include <torch/script.h>
#include "unilib/tokenizer.h"
int main(int argc, char **argv)
{
torch::jit::script::Module module = torch::jit::load("/opt/model/bert/traced_bert.pt");
torch::NoGradGuard no_grad_guard{};
module.eval();
BertTokenizer tokenizer;
tokenizer.add_vocab("/opt/model/bert/vocab.txt");
std::vector<std::string> tokens = tokenizer.tokenize("[CLS] 私 が 飼って いる 犬 の 名前 は アンシー です [SEP]");
std::vector<float> input_ids = tokenizer.convert_tokens_to_ids(tokens);
std::vector<float> input_mask = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
int length = 12;
std::vector<torch::jit::IValue> inputs;
inputs.push_back(torch::from_blob(input_ids.data(), {1, length}).to(torch::kInt64));
inputs.push_back(torch::from_blob(input_mask.data(), {1, length}).to(torch::kInt64));
auto all_encoder_layers = module.forward(inputs).toTuple()->elements()[0];
auto embedding = all_encoder_layers.toTensor()[0].mean(-2);
float *x = static_cast<float*>(embedding.storage().data());
std::vector<float> encode(x, x+768);
for (auto &val : encode) {
std::cout << val << ", ";
}
std::cout << std::endl;
return 0;
}
Pythonのベクトル抽出部分
# Average the first model output over the token dimension to get one vector
# for the sentence, then print its values separated by commas.
# `model` and `tokens_tensor` are expected to be defined by the surrounding script.
all_encoder_layers, _ = model(tokens_tensor)
embedding = all_encoder_layers[0].mean(-2)
# detach() first: numpy() refuses tensors that still require grad.
vals = embedding.detach().numpy()
for val in vals:
    print(val, end=', ')