More than 3 years have passed since last update.

追い詰められたWEBエンジニアのAIことはじめ。colabでとりあえず動かす文章分類問題(BERT)(2020/09/14現在)

Last updated at 2020-09-14 / Posted at 2020-09-14

AIをやれと言われて困ったエンジニアの奮闘記録です。とりあえず文章分類をcolabでやるよ！

書いた人

Python:わからん。虚無
AI:わからん。虚無 kerasとか知らんし。。
colab:わからん、虚無
情報源:Web、Stack Overflow、GitHub

1. これをやる

https://tech-blog.cloud-config.jp/2020-02-06-category-classification-using-bert/
本記事は↑をとりあえず動かそうとして動かなかったのでわからんまま無理くり動かした補完記事です。

入力

とりあえず動かしたいので最小行数のデータ

train

feature
ちなみに弓道のリーグ戦は四人一組のグループが交互に引く形である
番号変えたいと思う人間もいるのではないのでしょうか？

label
スポーツ
携帯電話

test

feature
ちなみにグループが交互に引く形である
番号がほしい

label
スポーツ
携帯電話

colabでの実行をバチバチ全部書いていくよ

# Mount Google Drive at /content/drive so the files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

# Install the required libraries
!pip install sentencepiece
!pip install keras_bert
# This one turned out to be necessary as well (not sure why).
!pip install np_utils
# Would like to use a GPU or TPU here... but not sure how.
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # Notebook line magic requesting TensorFlow 2.x; any failure is ignored.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
import os
import tensorflow_datasets as tfds
# Show which TensorFlow version was actually imported
print(tf.__version__)

# Loading the BERT config file and model
# Program that obtains the max value (the maximum token count)
import pandas as pd
import sentencepiece as spm

# Set this to the path of the features.csv file prepared above
train_features_df = pd.read_csv('/content/drive/My Drive/bert/data/trains/features.csv')

def _get_indice(feature):
    """Return the token count of *feature*, including '[CLS]' and '[SEP]'.

    The text is split into pieces with the module-level SentencePiece
    processor ``sp``; the leading '[CLS]' and trailing '[SEP]' markers are
    counted as one token each.
    """
    pieces = ['[CLS]', *sp.encode_as_pieces(feature), '[SEP]']
    return len(pieces)

# Load the pretrained SentencePiece model
# (set the path to the pretrained model you downloaded)
sp = spm.SentencePieceProcessor()
sp.Load('/content/drive/My Drive/bert/bert-wiki-ja/wiki-ja.model')

# Token count of every training sentence
numbers = [_get_indice(text) for text in train_features_df['feature']]

# Maximum token count
max_token_num = max(numbers)
print(f"max_token_number: {max_token_num}")

ここで出力された値を、次のセルの SEQ_LEN に書きます。今回は 18 が出力されました。

# Imports for loading the pretrained BERT model
import sys
sys.path.append('modules')
from keras_bert import load_trained_model_from_checkpoint
from keras import utils

# Load BERT
config_path = '/content/drive/My Drive/bert/bert-wiki-ja/bert_finetuning_config_v1.json'
# Do not include the file extension
checkpoint_path = '/content/drive/My Drive/bert/bert-wiki-ja/model.ckpt-1400000'

# Maximum number of tokens (the value printed by the max-token cell)
SEQ_LEN = 18
BATCH_SIZE = 16
BERT_DIM = 768
LR = 1e-4
# Number of training epochs
EPOCH = 1 # 20

bert = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=True,  trainable=True, seq_len=SEQ_LEN)
bert.summary()

学習回数は1回(時間かけたくない)

# Functions for loading the training data
from keras import utils
import numpy as np # this import turned out to be necessary

maxlen = SEQ_LEN # the reference site used maxlen without ever declaring it...
sp = spm.SentencePieceProcessor()
sp.Load('/content/drive/My Drive/bert/bert-wiki-ja/wiki-ja.model')

def _get_indice(feature):
    """Convert *feature* into a fixed-length array of SentencePiece ids.

    The text is tokenized with the module-level processor ``sp``, wrapped in
    '[CLS]' / '[SEP]', and written into an int32 array of length ``maxlen``.
    Tokens beyond ``maxlen`` are dropped; unused trailing slots stay 0.

    Args:
        feature: Text to encode.

    Returns:
        numpy.ndarray of shape (maxlen,) and dtype int32.
    """
    # Imported locally because the cell defining this function never imports
    # logging; without it the fallback branch below raises NameError.
    import logging

    indices = np.zeros((maxlen), dtype=np.int32)
    tokens = ['[CLS]', *sp.encode_as_pieces(feature), '[SEP]']

    # Slicing to maxlen keeps long inputs from indexing past the array.
    for t, token in enumerate(tokens[:maxlen]):
        try:
            indices[t] = sp.piece_to_id(token)
        except Exception:
            # Fall back to the unknown-token id instead of aborting. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
            logging.warning(f'{token} is unknown.')
            indices[t] = sp.piece_to_id('<unk>')

    return indices

def _load_labeldata(train_dir, test_dir):
    """Read the train/test CSV files and build the arrays used for fine-tuning.

    Each directory is expected to contain ``features.csv`` (column
    ``feature``) and ``labels.csv`` (column ``label``). Class indices are
    assigned by enumerating the unique values of the training labels.

    Args:
        train_dir: Directory holding the training CSV files.
        test_dir: Directory holding the test CSV files.

    Returns:
        dict with the class count, the label<->index mappings, the one-hot
        label arrays, the test label indices, the token-id arrays, the
        all-zero segment arrays and the input length (``maxlen``).
    """
    train_features_df = pd.read_csv(f'{train_dir}/features.csv')
    train_label_df = pd.read_csv(f'{train_dir}/labels.csv')
    test_features_df = pd.read_csv(f'{test_dir}/features.csv')
    test_label_df = pd.read_csv(f'{test_dir}/labels.csv')

    # Build both mappings from a single pass over the unique training labels.
    index2label = dict(enumerate(train_label_df['label'].unique()))
    label2index = {name: idx for idx, name in index2label.items()}
    class_count = len(label2index)

    # One-hot encode the labels; test labels reuse the training mapping.
    train_label_indices = [label2index[name] for name in train_label_df['label']]
    train_labels = utils.np_utils.to_categorical(train_label_indices, num_classes=class_count)
    test_label_indices = [label2index[name] for name in test_label_df['label']]
    test_labels = utils.np_utils.to_categorical(test_label_indices, num_classes=class_count)

    # Token ids per sentence; the segment input is all zeros.
    train_features = [_get_indice(text) for text in train_features_df['feature']]
    train_segments = np.zeros((len(train_features), maxlen), dtype=np.float32)
    print("maxlen")
    print(maxlen)
    test_features = [_get_indice(text) for text in test_features_df['feature']]
    test_segments = np.zeros((len(test_features), maxlen), dtype=np.float32)

    print(f'Trainデータ数: {len(train_features_df)}, Testデータ数: {len(test_features_df)}, ラベル数: {class_count}')

    result = {
        'class_count': class_count,
        'label2index': label2index,
        'index2label': index2label,
        'train_labels': train_labels,
        'test_labels': test_labels,
        'test_label_indices': test_label_indices,
        'train_features': np.array(train_features),
        'train_segments': train_segments,
        'test_features': np.array(test_features),
        'test_segments': test_segments,
        'input_len': maxlen,
    }
    return result

# モデル作成関数
from keras.layers import Dense, Dropout, LSTM, Bidirectional, Flatten, GlobalMaxPooling1D
from keras_bert.layers import MaskedGlobalMaxPool1D
from keras import Input, Model
from keras_bert import AdamWarmup, calc_train_steps

def _create_model(input_shape, class_count):
    """Build and compile the classifier on top of the loaded ``bert`` model.

    Args:
        input_shape: Shape of the training feature array; only its first
            element is used (passed to ``calc_train_steps``).
        class_count: Number of units of the softmax output layer.

    Returns:
        The compiled keras ``Model``.
    """
    example_count = input_shape[0]
    decay_steps, warmup_steps = calc_train_steps(
        example_count, batch_size=BATCH_SIZE, epochs=EPOCH)

    # Classification head: a softmax Dense layer fed by BERT's 'NSP-Dense' output.
    nsp_dense_output = bert.get_layer(name='NSP-Dense').output
    output_tensor = Dense(class_count, activation='softmax')(nsp_dense_output)

    # When BERT is loaded as trainable, the masked-input layer becomes the
    # third input; it is not needed for fine-tuning, so only the first and
    # second inputs are used. If BERT is not trainable, keras-bert's
    # Model.input can be used as is.
    model_inputs = [bert.input[0], bert.input[1]]
    model = Model(model_inputs, output_tensor)

    optimizer = AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['mae', 'mse', 'acc'],
    )
    return model

# Load the training data and prepare the model
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.utils.np_utils import to_categorical
import numpy as np
from keras import utils

# Directories containing features.csv / labels.csv for training and testing
trains_dir = '/content/drive/My Drive/bert/data/trains'
tests_dir = '/content/drive/My Drive/bert/data/tests'

data = _load_labeldata(trains_dir, tests_dir)
# Path of the fine-tuned model file (the evaluation and prediction cells load it)
model_filename = '/content/drive/My Drive/bert/models/knbc_finetuning.model'
model = _create_model(data['train_features'].shape, data['class_count'])

model.summary()

# Run the training
# NOTE(review): the ModelCheckpoint callback below is commented out, so this
# call itself never writes model_filename, although later cells load the model
# from that path - confirm how the file is supposed to be created.
history = model.fit([data['train_features'], data['train_segments']],
          data['train_labels'],
          epochs = EPOCH,
          batch_size = BATCH_SIZE,
          validation_data=([data['test_features'], data['test_segments']], data['test_labels']),
          shuffle=False,
          verbose = 1,
          callbacks = [
#              ModelCheckpoint(monitor='val_acc', mode='max', filepath=model_filename, save_best_only=True)
          ])

1分ぐらいかかる

# Evaluate the model
import os

from sklearn.metrics import classification_report, confusion_matrix
from keras.models import load_model
from keras_bert import get_custom_objects

# Reload the saved model only when the file exists. The ModelCheckpoint
# callback in the training cell is commented out, so on a fresh run nothing
# has been written to model_filename; in that case keep evaluating the
# in-memory `model` instead of failing in load_model.
if os.path.exists(model_filename):
    model = load_model(model_filename, custom_objects=get_custom_objects())
else:
    print(f'{model_filename} not found; evaluating the in-memory model instead.')

predicted_test_labels = model.predict([data['test_features'], data['test_segments']]).argmax(axis=1)
numeric_test_labels = np.array(data['test_labels']).argmax(axis=1)

# Take the row names from the mapping built while loading the data, so each
# row is labelled with the class its index really stands for. A hard-coded
# list mislabels the rows whenever its order differs from label2index.
class_indices = list(range(data['class_count']))
target_names = [data['index2label'][i] for i in class_indices]

# Passing `labels` keeps the report aligned with target_names even when a
# class is absent from both the test labels and the predictions.
report = classification_report(
        numeric_test_labels, predicted_test_labels,
        labels=class_indices, target_names=target_names, output_dict=True)
display(pd.DataFrame(report).T)

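結果は↓（※この実行では target_names を ['携帯電話', 'スポーツ'] の順で固定していましたが、label2index は学習ラベルの出現順（スポーツ=0, 携帯電話=1）で作られるため、下の表の行ラベルは実際のクラスと入れ替わっています）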

	precision	recall	f1-score	support
携帯電話	0.50	1.0	0.666667	1.0
スポーツ	0.00	0.0	0.000000	1.0
accuracy	0.50	0.5	0.500000	0.5
macro avg	0.25	0.5	0.333333	2.0
weighted avg	0.25	0.5	0.333333	2.0

入力は訓練2行、テスト2行、学習1回なので学習結果としてはだめだめですが、とりあえず動くっぽい

# 予測
import sys
import pandas as pd
import sentencepiece as spm
import logging
import numpy as np

from keras import utils
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras_bert import load_trained_model_from_checkpoint
from keras_bert import get_custom_objects
from sklearn.metrics import classification_report, confusion_matrix

sys.path.append('modules')

# Load the SentencePieceProcessor model
spp = spm.SentencePieceProcessor()
spp.Load('/content/drive/My Drive/bert/bert-wiki-ja/wiki-ja.model')
# Load the fine-tuned BERT model
model_filename = '/content/drive/My Drive/bert/models/knbc_finetuning.model'
model = load_model(model_filename, custom_objects=get_custom_objects())

# Same value as SEQ_LEN in the training cell
SEQ_LEN = 18
maxlen = SEQ_LEN

def _get_indice(feature):
    """Convert *feature* into a fixed-length array of SentencePiece ids.

    The text is tokenized with the module-level processor ``spp``, wrapped in
    '[CLS]' / '[SEP]', and written into an int32 array of length ``maxlen``.
    Tokens beyond ``maxlen`` are dropped; unused trailing slots stay 0.

    Args:
        feature: Text to encode.

    Returns:
        numpy.ndarray of shape (maxlen,) and dtype int32.
    """
    indices = np.zeros((maxlen), dtype=np.int32)
    tokens = ['[CLS]', *spp.encode_as_pieces(feature), '[SEP]']

    # Slicing to maxlen keeps long inputs from indexing past the array.
    for t, token in enumerate(tokens[:maxlen]):
        try:
            indices[t] = spp.piece_to_id(token)
        except Exception:
            # Fall back to the unknown-token id instead of aborting. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit,
            # and logging.warn is a deprecated alias of logging.warning.
            logging.warning('unknown')
            indices[t] = spp.piece_to_id('<unk>')
    return indices

# Run a prediction for a single sentence
feature = "運動するのは楽しい"

test_features = [_get_indice(feature)]
test_segments = np.zeros((len(test_features), maxlen), dtype=np.float32)

# The plain list was not accepted as-is; it had to be converted with np.array.
predicted_test_labels = model.predict([np.array(test_features), test_segments]).argmax(axis=1)

# Look up the label name of the predicted class id in the id/label table.
label_data = pd.read_csv('/content/drive/My Drive/bert/label_id/id_category.csv')
label = label_data.query(f'id == {predicted_test_labels[0]}').iloc[0]
label_name = label['label']
print(label_name)

スポーツ

感想

データ増やせばいけそう
試行錯誤していたらcolabのメモリ限界にすぐ到達するので辛い
業務でやるなら金でスペックぶっ叩いた環境が必要そう
EC2なら1.5$/1Hのマシンがある
tpu使いたい

追い詰められたWEBエンジニアですが、インターネッツの恩恵にあずかりAI活動を始めることができそうです。高速道路はできている感じはあり、フレームワークも優秀(というかデータ用意できたら何もすることない)です。
ただ、調べた感じではAIって作るのに金かかるから経営陣はかんたんにAIとか言わないほうが良いかもしれません。のるかそるか全くわからない状態で数百万の投資+人件費の覚悟はありますか？という問題となりそうです。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up