While reading Takuro Fujii, Koki Shibata, Atsuki Yamaguchi, Terufumi Morishita and Yasuhiro Sogawa, "How do different tokenizers perform on downstream tasks in scriptio continua languages?: A case study in Japanese" (Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (July 2023), Volume 4: Student Research Workshop, pp. 39-49), I started wondering whether the "accuracy" of these models' tokenizers correlates with the reported results. The trouble is that the models published at https://huggingface.co/hitachi-nlp do not bundle their tokenizers (they are, so to speak, "outsourced"), so the method from my April 12 article cannot be applied to them. With no better option, I assembled the 15 tokenizers by hand and measured their "accuracy" against NINJAL Short Unit Words (国語研短単位). On Google Colaboratory, it looks something like this.
!apt install mecab-ipadic-utf8
!test -d /usr/local/etc || mkdir -p /usr/local/etc
!test -f /usr/local/etc/mecabrc || ln -s /etc/mecabrc /usr/local/etc/mecabrc
!test -d jumanpp-2.0.0-rc4 || curl -L https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc4/jumanpp-2.0.0-rc4.tar.xz | tar xJf -
!test -x /usr/local/bin/jumanpp || ( mkdir jumanpp-2.0.0-rc4/build && cd jumanpp-2.0.0-rc4/build && cmake .. -DCMAKE_BUILD_TYPE=Release && make install )
!test -d bccwj-suw+unidic+tag || curl -L https://github.com/daac-tools/vaporetto/releases/download/v0.5.0/bccwj-suw+unidic+tag.tar.xz | tar xJvf -
!test -d compare-ja-tokenizer || git clone --depth=1 https://github.com/hitachi-nlp/compare-ja-tokenizer
!test -d UD_Japanese-GSD || git clone --depth=1 https://github.com/UniversalDependencies/UD_Japanese-GSD
!cp UD_Japanese-GSD/*-test.conllu test.conllu
!pip install transformers sentencepiece spacy-alignments pytextspan mecab-python3 fugashi ipadic unidic-lite pyknp mojimoji sudachipy sudachidict_core vaporetto==0.2.1
from tokenizers import Tokenizer
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast,AutoTokenizer
from spacy_alignments import get_alignments
class MecabPreTokenizer:
  def __init__(self):
    from MeCab import Tagger
    self.mecab=Tagger("-Owakati")
    self.tokenize=lambda x:self.mecab.parse(x).strip().split(" ")
  def custom_split(self,i,normalized_string):
    # map the analyzer's tokens back to character spans in the original string
    import textspan
    text=str(normalized_string)
    tokens=self.tokenize(text)
    tokens_spans=textspan.get_original_spans(tokens,text)
    return [normalized_string[s:e] for t in tokens_spans for s,e in t]
  def pre_tokenize(self,pretok):
    pretok.split(self.custom_split)
class JumanPreTokenizer(MecabPreTokenizer):
  def __init__(self):
    from pyknp import Juman
    self.juman=Juman("jumanpp",multithreading=True)
  def tokenize(self,sentence):
    # Juman++ expects zenkaku input; fall back to an empty string on failure
    import mojimoji,traceback
    text=mojimoji.han_to_zen(sentence).rstrip()
    try:
      result=self.juman.analysis(text)
    except:
      traceback.print_exc()
      text=""
      result=self.juman.analysis(text)
    return [m.midasi for m in result.mrph_list()]
class SudachiPreTokenizer(MecabPreTokenizer):
  def __init__(self):
    from sudachipy.dictionary import Dictionary
    self.sudachi=Dictionary().create()
    self.tokenize=lambda x:[t.surface() for t in self.sudachi.tokenize(x)]
class VaporettoPreTokenizer(MecabPreTokenizer):
  def __init__(self,dic="bccwj-suw+unidic+tag/bccwj-suw+unidic+tag.model.zst"):
    import vaporetto
    with open(dic,"rb") as r:
      m=r.read()
    self.vaporetto=vaporetto.Vaporetto(m,predict_tags=False)
    self.tokenize=lambda x:[t.surface() for t in self.vaporetto.tokenize(x)]
pre_tokenizers={"mecab":MecabPreTokenizer,"jumanpp":JumanPreTokenizer,"sudachi":SudachiPreTokenizer,"vaporetto":VaporettoPreTokenizer,"nothing":None}
def prf(tokenizer,model_name):
  # compare tokenizer output against Short Unit Word spans taken from test.conllu
  gold=system=correct=0
  with open("test.conllu","r",encoding="utf-8") as r:
    for k in r:
      if k.startswith("# text ="):
        txt=k[8:].strip()
        frm=[]
      elif k.strip()=="":
        # gold spans from the CoNLL-U word forms, system spans from the tokenizer
        g=[(t[0],t[-1]+1) for t in get_alignments(list(txt),frm)[1]]
        try:
          s=[t for t in tokenizer(txt,return_offsets_mapping=True)["offset_mapping"] if t[0]<t[1]]
        except:
          s=[(t[0],t[-1]+1) if t>[] else (0,0) for t in get_alignments(list(txt),tokenizer.tokenize(txt))[1]]
        gold+=len(g)
        system+=len(s)
        # count spans whose start and end offsets both coincide
        i=j=0
        while i<len(g) and j<len(s):
          if s[j][0]<g[i][0]:
            j+=1
          elif g[i][0]<s[j][0]:
            i+=1
          else:
            correct+=g[i][1]==s[j][1]
            i+=1
            j+=1
      else:
        t=k.split("\t")
        if len(t)==10 and t[0].isdecimal():
          frm.append(t[1])
  print("\n***",model_name)
  print("Precision",correct/system if system else 0.0)
  print("Recall   ",correct/gold)
  print("F1 Score ",2*correct/(system+gold))  # harmonic mean of Precision and Recall
for w in ["bpe","wordpiece","unigram"]:
for p in pre_tokenizers:
t=Tokenizer.from_file(f"compare-ja-tokenizer/data/dict/{p}_{w}.json")
t.post_processor=BertProcessing(cls=("[CLS]",t.token_to_id("[CLS]")),sep=("[SEP]",t.token_to_id("[SEP]")))
tkz=PreTrainedTokenizerFast(tokenizer_object=t,unk_token="[UNK]",pad_token="[PAD]",cls_token="[CLS]",sep_token="[SEP]",mask_token="[MASK]")
if pre_tokenizers[p]:
tkz._tokenizer.pre_tokenizer=PreTokenizer.custom(pre_tokenizers[p]())
prf(tkz,f"hitachi-nlp/bert-base-japanese_{p}-{w}")
for b in ["tohoku-nlp/bert-base-japanese","tohoku-nlp/bert-base-japanese-v2","tohoku-nlp/bert-base-japanese-v3"]:
prf(AutoTokenizer.from_pretrained(b),b)
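Incidentally, the pre-tokenizers defined above can also be run on their own. The snippet below is just a quick sketch of mine (assuming the same Colab session; the sample sentence is arbitrary), showing the word-level split each analyzer produces before any BPE/WordPiece/Unigram subwording.
# quick sketch: print each analyzer's word-level split of one sample sentence
sample="国境の長いトンネルを抜けると雪国であった。"
for name,cls in pre_tokenizers.items():
  if cls:  # skip the "nothing" entry, which has no pre-tokenizer
    print(name,cls().tokenize(sample))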
Glancing over at the "outsourced" GitHub repository, I used ipadic as the MeCab dictionary and version 0.5.0 of bccwj-suw+unidic+tag.model.zst as the Vaporetto dictionary, and on my machine I (Koichi Yasuoka) obtained the following results.
*** hitachi-nlp/bert-base-japanese_mecab-bpe
Precision 0.8305061082024433
Recall 0.9127666104035599
F1 Score 0.869695529807376
*** hitachi-nlp/bert-base-japanese_jumanpp-bpe
Precision 0.7408021712907117
Recall 0.7538744821236766
F1 Score 0.7472811620655563
*** hitachi-nlp/bert-base-japanese_sudachi-bpe
Precision 0.8196292103028587
Recall 0.8886757710603038
F1 Score 0.8527571228741809
*** hitachi-nlp/bert-base-japanese_vaporetto-bpe
Precision 0.8230683867387987
Recall 0.9104649378548412
F1 Score 0.8645636019233571
*** hitachi-nlp/bert-base-japanese_nothing-bpe
Precision 0.4829113395170051
Recall 0.4433788553015191
F1 Score 0.4623015079396824
*** hitachi-nlp/bert-base-japanese_mecab-wordpiece
Precision 0.6765020446681347
Recall 0.8249961638790855
F1 Score 0.7434062705243873
*** hitachi-nlp/bert-base-japanese_jumanpp-wordpiece
Precision 0.6228225367446925
Recall 0.7022402946140862
F1 Score 0.6601514605120807
*** hitachi-nlp/bert-base-japanese_sudachi-wordpiece
Precision 0.6839216186889641
Recall 0.8220807119840418
F1 Score 0.7466638793073412
*** hitachi-nlp/bert-base-japanese_vaporetto-wordpiece
Precision 0.7039549468833994
Recall 0.8439466011968697
F1 Score 0.7676203768318214
*** hitachi-nlp/bert-base-japanese_nothing-wordpiece
Precision 0.3320056468690128
Recall 0.5052171244437624
F1 Score 0.40069368382621395
*** hitachi-nlp/bert-base-japanese_mecab-unigram
Precision 0.5373932244755651
Recall 0.7095289243516956
F1 Score 0.6115795390668916
*** hitachi-nlp/bert-base-japanese_jumanpp-unigram
Precision 0.586027865404837
Recall 0.6841338038974989
F1 Score 0.6312920353982301
*** hitachi-nlp/bert-base-japanese_sudachi-unigram
Precision 0.6123429416112343
Recall 0.7627742826453889
F1 Score 0.6793303723949436
*** hitachi-nlp/bert-base-japanese_vaporetto-unigram
Precision 0.6639132353868465
Recall 0.8078103421819856
F1 Score 0.7288270515349738
*** hitachi-nlp/bert-base-japanese_nothing-unigram
Precision 0.5332243071096574
Recall 0.47529538131041893
F1 Score 0.5025961382443616
*** tohoku-nlp/bert-base-japanese
Precision 0.8397322741982937
Recall 0.8759398496240601
F1 Score 0.8574539992489674
*** tohoku-nlp/bert-base-japanese-v2
Precision 0.8708618721461188
Recall 0.9364738376553629
F1 Score 0.9024768946395564
*** tohoku-nlp/bert-base-japanese-v3
Precision 0.866405693950178
Recall 0.9339419978517723
F1 Score 0.8989071038251366
Sure enough, the tokenizer of hitachi-nlp/bert-base-japanese_mecab-bpe shows the highest F1 score of the fifteen and surpasses bert-base-japanese, but it still does not reach bert-base-japanese-v2. ipadic has bugs of its own (see, for example, my diary entry of October 26, 2021) and calls for caution when used in a tokenizer, yet the paper examines neither the quirks of ipadic nor those of the other dictionaries, and offers no comparison with bert-base-japanese-v2. Still, hmm, redoing the experiments with unidic-lite at this point would simply take too much effort.
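If one only wants to see where the dictionaries disagree, though, a spot check is cheap. The following is a rough sketch of mine, assuming the pip-installed ipadic and unidic-lite packages from the cell above (the sample string is merely a placeholder, not the specific bug from the 2021 entry): it segments the same string with MeCab under ipadic and with fugashi under unidic-lite, so dictionary-dependent splits show up side by side.
# rough sketch: compare MeCab+ipadic with fugashi+unidic-lite on one string
import MeCab,ipadic
from fugashi import Tagger
text="外国人参政権"  # placeholder; any string suspected of dictionary quirks will do
print("ipadic     :",MeCab.Tagger(ipadic.MECAB_ARGS+" -Owakati").parse(text).strip())
print("unidic-lite:"," ".join(w.surface for w in Tagger("-Owakati")(text)))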