Terra Blevins, Hila Gonen, Luke Zettlemoyer『Prompting Language Models for Linguistic Structure』の手法をXunziALLMに適用して、漢文(古典中国語)でのUPOS (Universal Part-Of-Speech)品詞付与に挑戦してみた。Few-Shot Promptingの例文は、UD_Classical_Chinese-Kyotoの訓練データのうち、『論語』里仁篇の冒頭部分を用いた。
#! /usr/bin/python3
# pip3 install transformers accelerate transformers_stream_generator einops tiktoken
# Model identifier passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained below.
model="ccwu0918/XunziALLM"
class TextUPOSList(list):
    """List of POS-tagged sentences that renders itself as a few-shot prompt.

    Each element is one sentence, given as a sequence of ``(token, upos)``
    string pairs, e.g. ``[("子", "NOUN"), ("曰", "VERB")]``.
    """

    def __str__(self):
        """Return the prompt text for all stored sentences.

        Every sentence contributes two lines::

            ###text:<tokens concatenated without separators>
            ###UPOS:<token>_<upos>|<token>_<upos>|...

        The lines are joined with newlines and a single trailing newline is
        appended, so an empty list renders as ``"\\n"``.
        """
        blocks = []
        for sentence in self:
            text = "".join(token for token, _ in sentence)
            tags = "|".join(f"{token}_{upos}" for token, upos in sentence)
            blocks.append(f"###text:{text}\n###UPOS:{tags}")
        return "\n".join(blocks) + "\n"
# Few-shot examples for the prompt: each append adds one sentence as a list of
# (token, UPOS) pairs; str(ex) renders them as "###text:" / "###UPOS:" line pairs.
# Per the accompanying article, the sentences come from the training data of
# UD_Classical_Chinese-Kyoto (opening of the Analects, Li Ren chapter).
ex=TextUPOSList()
ex.append([("子","NOUN"),("曰","VERB"),("里","VERB"),("仁","NOUN"),("為","AUX"),("美","NOUN"),("擇","VERB"),("不","ADV"),("處","VERB"),("仁","NOUN"),("焉","ADV"),("得","AUX"),("知","VERB")])
ex.append([("子","NOUN"),("曰","VERB"),("不","ADV"),("仁","VERB"),("者","PART"),("不","ADV"),("可","AUX"),("以","VERB"),("久","VERB"),("處","VERB"),("約","NOUN"),("不","ADV"),("可","AUX"),("以","VERB"),("長","VERB"),("處","VERB"),("樂","NOUN"),("仁","VERB"),("者","PART"),("安","VERB"),("仁","VERB"),("知","VERB"),("者","PART"),("利","VERB"),("仁","VERB")])
ex.append([("子","NOUN"),("曰","VERB"),("唯","ADV"),("仁","VERB"),("者","PART"),("能","AUX"),("好","VERB"),("人","NOUN"),("能","AUX"),("惡","VERB"),("人","NOUN")])
ex.append([("子","NOUN"),("曰","VERB"),("苟","ADV"),("志","VERB"),("於","ADP"),("仁","NOUN"),("矣","PART"),("無","ADV"),("惡","VERB"),("也","PART")])
ex.append([("子","NOUN"),("曰","VERB"),("富","VERB"),("與","ADP"),("貴","VERB"),("是","PRON"),("人","NOUN"),("之","SCONJ"),("所","PART"),("欲","VERB"),("也","PART"),("不","ADV"),("以","VERB"),("其","PRON"),("道","NOUN"),("得","VERB"),("之","PRON"),("不","ADV"),("處","VERB"),("也","PART"),("貧","VERB"),("與","ADP"),("賤","VERB"),("是","PRON"),("人","NOUN"),("之","SCONJ"),("所","PART"),("惡","VERB"),("也","PART"),("不","ADV"),("以","VERB"),("其","PRON"),("道","NOUN"),("得","VERB"),("之","PRON"),("不","ADV"),("去","VERB"),("也","PART"),("君子","NOUN"),("去","VERB"),("仁","VERB"),("惡","ADV"),("乎","PART"),("成","VERB"),("名","NOUN"),("君子","NOUN"),("無","VERB"),("終","VERB"),("食","NOUN"),("之","SCONJ"),("間","NOUN"),("違","VERB"),("仁","VERB"),("造","VERB"),("次","NOUN"),("必","ADV"),("於","VERB"),("是","PRON"),("顛","ADV"),("沛","VERB"),("必","ADV"),("於","VERB"),("是","PRON")])
ex.append([("子","NOUN"),("曰","VERB"),("我","PRON"),("未","ADV"),("見","VERB"),("好","VERB"),("仁","VERB"),("者","PART"),("惡","VERB"),("不","ADV"),("仁","VERB"),("者","PART"),("好","VERB"),("仁","VERB"),("者","PART"),("無","ADV"),("以","ADV"),("尚","VERB"),("之","PRON"),("惡","VERB"),("不","ADV"),("仁","VERB"),("者","PART"),("其","PRON"),("為","VERB"),("仁","NOUN"),("矣","PART"),("不","ADV"),("使","VERB"),("不","ADV"),("仁","VERB"),("者","PART"),("加","VERB"),("乎","ADP"),("其","PRON"),("身","NOUN"),("有","VERB"),("能","AUX"),("一","NUM"),("日","NOUN"),("用","VERB"),("其","PRON"),("力","NOUN"),("於","ADP"),("仁","NOUN"),("矣","PART"),("乎","PART"),("我","PRON"),("未","ADV"),("見","VERB"),("力","NOUN"),("不","ADV"),("足","VERB"),("者","PART"),("蓋","PART"),("有","VERB"),("之","PRON"),("矣","PART"),("我","PRON"),("未","ADV"),("之","PRON"),("見","VERB"),("也","PART")])
ex.append([("子","NOUN"),("曰","VERB"),("人","NOUN"),("之","SCONJ"),("過","VERB"),("也","PART"),("各","ADV"),("於","VERB"),("其","PRON"),("黨","NOUN"),("觀","VERB"),("過","NOUN"),("斯","ADV"),("知","VERB"),("人","NOUN"),("矣","PART")])
ex.append([("子","NOUN"),("曰","VERB"),("朝","NOUN"),("聞","VERB"),("道","NOUN"),("夕","NOUN"),("死","VERB"),("可","VERB"),("矣","PART")])
ex.append([("子","NOUN"),("曰","VERB"),("士","NOUN"),("志","VERB"),("於","ADP"),("道","NOUN"),("而","CCONJ"),("恥","VERB"),("惡","NOUN"),("衣","NOUN"),("惡","NOUN"),("食","NOUN"),("者","PART"),("未","ADV"),("足","AUX"),("與","ADV"),("議","VERB"),("也","PART")])
ex.append([("子","NOUN"),("曰","VERB"),("君子","NOUN"),("之","SCONJ"),("於","VERB"),("天","NOUN"),("下","NOUN"),("也","PART"),("無","VERB"),("適","VERB"),("也","PART"),("無","VERB"),("莫","ADV"),("也","PART"),("義","NOUN"),("之","PRON"),("與","ADP"),("比","VERB")])
ex.append([("子","NOUN"),("曰","VERB"),("君子","NOUN"),("懷","VERB"),("德","NOUN"),("小","VERB"),("人","NOUN"),("懷","VERB"),("土","NOUN"),("君子","NOUN"),("懷","VERB"),("刑","NOUN"),("小","VERB"),("人","NOUN"),("懷","VERB"),("惠","NOUN")])
ex.append([("子","NOUN"),("曰","VERB"),("放","VERB"),("於","ADP"),("利","NOUN"),("而","CCONJ"),("行","VERB"),("多","VERB"),("怨","NOUN")])
ex.append([("子","NOUN"),("曰","VERB"),("能","AUX"),("以","VERB"),("禮","VERB"),("讓","VERB"),("為","VERB"),("國","NOUN"),("於","ADP"),("從","VERB"),("政","NOUN"),("乎","PART"),("何","PRON"),("有","VERB"),("不","ADV"),("能","AUX"),("以","VERB"),("禮","VERB"),("讓","VERB"),("為","VERB"),("國","NOUN"),("如","VERB"),("禮","NOUN"),("何","PRON")])
from transformers import AutoTokenizer,AutoModelForCausalLM
# Load the tokenizer and the causal LM named by `model`.
# NOTE(review): trust_remote_code=True lets Python code shipped in the model
# repository run locally -- confirm the repository is trusted (consider pinning a revision).
tkz=AutoTokenizer.from_pretrained(model,trust_remote_code=True)
# NOTE(review): device_map="auto" presumably delegates device placement to accelerate -- verify.
mdl=AutoModelForCausalLM.from_pretrained(model,trust_remote_code=True,device_map="auto")
def nlp(txt, **generate_kwargs):
    """Tag ``txt`` with UPOS labels by few-shot prompting the language model.

    The prompt is the rendered few-shot examples (``str(ex)``) followed by
    ``###text:<txt>`` and an open ``###UPOS:`` line for the model to complete.

    Args:
        txt: The sentence to tag. It must not contain a newline, because the
            answer is located by counting lines in the decoded output.
        **generate_kwargs: Optional extra keyword arguments forwarded to
            ``mdl.generate``. With none given, the call is exactly
            ``mdl.generate(**inp)``.

    Returns:
        A string of up to two lines: the ``###text:`` line for ``txt`` and the
        ``###UPOS:`` line as completed by the model.
    """
    shots = str(ex)
    prompt = f"{shots}###text:{txt}\n###UPOS:"
    inp = tkz(prompt, return_tensors="pt").to(mdl.device)
    doc = tkz.decode(mdl.generate(**inp, **generate_kwargs)[0])
    # The decoded text starts with the prompt itself, so skip exactly as many
    # lines as the few-shot examples occupy.  Counting newlines in the rendered
    # examples equals len(ex)*2 for a non-empty list and stays correct when
    # ex is empty (str(ex) is then a single "\n").
    skip = shots.count("\n")
    return "\n".join(doc.split("\n")[skip:skip + 2])
# Tag one unpunctuated Classical Chinese sentence and print what nlp() returns.
print(nlp("子曰不患無位患所以立不患莫己知求為可知也"))
「子曰不患無位患所以立不患莫己知求為可知也」に品詞付与してみたところ、私(安岡孝一)の手元では以下の結果が得られた。
###text:子曰不患無位患所以立不患莫己知求為可知也
###UPOS:子_NOUN|曰_VERB|不_ADV|患_VERB|無_VERB|位_NOUN|患_VERB|所_PRON|以_VERB|立_VERB|不_ADV|患_VERB|莫_ADV|己_PRON|知_VERB|求_VERB|為_VERB|可_ADV|知_VERB|也_PART|不_PART|患_VERB|莫_ADV|己_PRON|知_VERB|求_VERB|為_VERB|可_ADV|知_VERB|也_PART
「不患莫己知求為可知也」がダブって出力されており、しかも「可_AUX」となるべき箇所が「可_ADV」と誤って品詞付与されている。もう一度やってみよう。
###text:子曰不患無位患所以立不患莫己知求為可知也
###UPOS:子_NOUN|曰_VERB|不_ADV|患_VERB|無_VERB|位_NOUN|患_VERB|所_CCONJ|以_VERB|立_VERB|不_ADV|患_VERB|莫_VERB|己_VERB|知_VERB|求_VERB|為_VERB|可_VERB|知_VERB|也_PART
今度はダブリは無かったものの、「患莫己知求為可知」が全てVERBとなってしまっている。うーむ、13-shot Promptingでこの程度だと、現実の処理に使うのは無理があるなあ。