10月8日の記事の手法をgemma-3-270m-fixedに適用して、UD_English-EWTのUPOS品詞付与を試してみた。Google Colaboratory (GPU版)だと、こんな感じ。
# Install training/eval dependencies, then fetch the (community-fixed)
# Gemma-3 270M checkpoint from the Hugging Face hub as a local git clone.
!pip install -U transformers accelerate datasets evaluate seqeval
!test -d gemma-3-270m-fixed || git clone --depth=1 https://huggingface.co/PJMixers-Dev/gemma-3-270m-fixed
import json

# Write a tiny remote-code module defining a token-classification head on top
# of the Gemma-3 text backbone, using transformers' generic mixin.
# NOTE: the class-body line inside the generated file must be indented —
# the pasted original had lost this indentation, which would have produced
# an invalid gemma3.py.
with open("gemma-3-270m-fixed/gemma3.py", "w", encoding="utf-8") as w:
    print("""from transformers import Gemma3PreTrainedModel,Gemma3TextConfig
from transformers.modeling_layers import GenericForTokenClassification
class Gemma3ForTokenClassification(GenericForTokenClassification,Gemma3PreTrainedModel):
    config_class=Gemma3TextConfig""", file=w)

# Register the new class in config.json via auto_map so that
# AutoModelForTokenClassification.from_pretrained(..., trust_remote_code=True)
# can locate it in gemma3.py.
with open("gemma-3-270m-fixed/config.json", "r", encoding="utf-8") as r:
    d = json.load(r)
d.setdefault("auto_map", {})["AutoModelForTokenClassification"] = "gemma3.Gemma3ForTokenClassification"
with open("gemma-3-270m-fixed/config.json", "w", encoding="utf-8") as w:
    json.dump(d, w, indent=2)
# awk program that prints "-b vX.Y.Z" for the installed transformers version.
# In the IPython `!` line below, {s} is expanded to this Python variable's
# value, so the clone checks out the tag matching the installed package
# (needed so run_ner.py matches the installed library).
s='$1=="transformers"{printf("-b v%s",$2)}'
!test -d transformers || git clone `pip list | awk '{s}'` https://github.com/huggingface/transformers
# Universal Dependencies English-EWT treebank (train/dev/test .conllu files).
!test -d UD_English-EWT || git clone --depth=1 https://github.com/UniversalDependencies/UD_English-EWT
def makejson(conllu_file, json_file):
    """Convert a CoNLL-U treebank into JSON-Lines usable by run_ner.py.

    Each output line is ``{"tokens": [...], "tags": [...]}``. ``tokens[0]``
    is a literal ``"<bos>"`` (tagged ``SYM``); every later token gets a
    leading space unless the previous token's MISC field said
    ``SpaceAfter=No``, so that joining the tokens reproduces the surface
    text. Tags come from the UPOS column.
    """
    with open(conllu_file, "r", encoding="utf-8") as r, open(json_file, "w", encoding="utf-8") as w:
        d = {"tokens": ["<bos>"], "tags": ["SYM"]}
        f = False  # True when the next token should carry a leading space
        for s in r:
            if s.strip() == "":
                # Blank line = sentence boundary; flush if we collected tokens.
                if len(d["tokens"]) > 1:
                    print(json.dumps(d), file=w)
                d = {"tokens": ["<bos>"], "tags": ["SYM"]}
                f = False
            else:
                t = s.split("\t")
                # Plain word lines only: 10 fields and an integer ID. This
                # skips comment lines, multiword ranges ("1-2") and empty
                # nodes ("1.1").
                if len(t) == 10 and t[0].isdecimal():
                    d["tokens"].append(" " + t[1] if f else t[1])
                    d["tags"].append(t[3])  # UPOS column
                    f = t[9].find("SpaceAfter=No") < 0
        # Bug fix: flush the final sentence when the file has no trailing
        # blank line — the original silently dropped it.
        if len(d["tokens"]) > 1:
            print(json.dumps(d), file=w)
# Convert every UD_English-EWT split into JSON-Lines for run_ner.py.
for split in ("train", "dev", "test"):
    makejson(f"UD_English-EWT/en_ewt-ud-{split}.conllu", f"{split}.json")
!env WANDB_DISABLED=true python transformers/examples/pytorch/token-classification/run_ner.py --task_name pos --model_name_or_path gemma-3-270m-fixed --trust_remote_code --train_file train.json --validation_file dev.json --test_file test.json --output_dir ./gemma-3-270m-english-upos --overwrite_output_dir --do_train --do_eval --do_predict
私(安岡孝一)の手元では、30分弱で以下のmetricsが出力されて、gemma-3-270m-english-uposが出来上がった。
***** train metrics *****
epoch = 3.0
total_flos = 912969GF
train_loss = 0.1059
train_runtime = 0:26:40.38
train_samples = 12544
train_samples_per_second = 23.514
train_steps_per_second = 2.939
***** eval metrics *****
epoch = 3.0
eval_accuracy = 0.9475
eval_f1 = 0.9385
eval_loss = 0.2574
eval_precision = 0.9379
eval_recall = 0.9392
eval_runtime = 0:00:12.12
eval_samples = 2001
eval_samples_per_second = 165.031
eval_steps_per_second = 20.701
***** predict metrics *****
predict_accuracy = 0.9455
predict_f1 = 0.9363
predict_loss = 0.2604
predict_precision = 0.9348
predict_recall = 0.9379
predict_runtime = 0:00:13.13
predict_samples_per_second = 158.102
predict_steps_per_second = 19.791
eval・predictともにF1値が0.93強で、もう一息である。ちょっと動かしてみよう。
from transformers import pipeline

# Load the freshly trained tagger (trust_remote_code pulls in gemma3.py)
# and tag one sentence to eyeball the output.
tagger = pipeline(
    "token-classification",
    "gemma-3-270m-english-upos",
    trust_remote_code=True,
)
print(tagger("It don't mean a thing if it ain't got that swing"))
出来立てのgemma-3-270m-english-uposで「It don't mean a thing if it ain't got that swing」に品詞付与してみたところ、私の手元では以下の結果が得られた。
[{'entity': 'PRON', 'score': np.float32(0.99943954), 'index': 1, 'word': 'It', 'start': 0, 'end': 2}, {'entity': 'AUX', 'score': np.float32(0.9707727), 'index': 2, 'word': '▁don', 'start': 2, 'end': 6}, {'entity': 'AUX', 'score': np.float32(0.9864097), 'index': 3, 'word': "'", 'start': 6, 'end': 7}, {'entity': 'PART', 'score': np.float32(0.9999999), 'index': 4, 'word': 't', 'start': 7, 'end': 8}, {'entity': 'VERB', 'score': np.float32(0.9999852), 'index': 5, 'word': '▁mean', 'start': 8, 'end': 13}, {'entity': 'DET', 'score': np.float32(0.9989656), 'index': 6, 'word': '▁a', 'start': 13, 'end': 15}, {'entity': 'NOUN', 'score': np.float32(0.99999976), 'index': 7, 'word': '▁thing', 'start': 15, 'end': 21}, {'entity': 'SCONJ', 'score': np.float32(0.999647), 'index': 8, 'word': '▁if', 'start': 21, 'end': 24}, {'entity': 'PRON', 'score': np.float32(0.99999857), 'index': 9, 'word': '▁it', 'start': 24, 'end': 27}, {'entity': 'AUX', 'score': np.float32(0.6213639), 'index': 10, 'word': '▁ain', 'start': 27, 'end': 31}, {'entity': 'AUX', 'score': np.float32(0.9997758), 'index': 11, 'word': "'", 'start': 31, 'end': 32}, {'entity': 'PART', 'score': np.float32(1.0), 'index': 12, 'word': 't', 'start': 32, 'end': 33}, {'entity': 'VERB', 'score': np.float32(0.9727375), 'index': 13, 'word': '▁got', 'start': 33, 'end': 37}, {'entity': 'DET', 'score': np.float32(0.7514034), 'index': 14, 'word': '▁that', 'start': 37, 'end': 42}, {'entity': 'NOUN', 'score': np.float32(0.99995697), 'index': 15, 'word': '▁swing', 'start': 42, 'end': 48}]
やはり「don't」と「ain't」がイマイチだ。UD_English-EWTでは「don't」は「do」「n't」に、「ain't」は「ai」「n't」に切って、AUXとPARTを品詞付与するのが流儀なのだが、↑の結果では、そもそも切れ目が合ってない。やっぱり、トークナイザの改造が必要かな。