ans40.py
"""
40. 係り受け解析結果の読み込み(形態素)
形態素を表すクラスMorphを実装せよ.このクラスは表層形(surface),基本形(base),品詞(pos),品詞細分類1(pos1)をメンバ変数に持つこととする.
さらに,CaboChaの解析結果(neko.txt.cabocha)を読み込み,各文をMorphオブジェクトのリストとして表現し,3文目の形態素列を表示せよ.
ans40_parse_to_cabocha_format.pyでneko.txt.cabochaを生成する。
ans40.shで生成した例はneko.txt.cabocha2。比較をすれば、neko.txt.cabochaの方が綺麗。
ans40.pyの実装は直接neko.txt.cabochaを読み込んでいるから、スピードが早い。
ans40_2.pyの実装は解析時間がかかる。おすすめしない。
"""
from typing import List
class Morph:
    """One morpheme built from a mapping of analysis fields.

    Attributes:
        surface: Value stored under the "surface" key of the input mapping.
        base: Value stored under the "base" key.
        pos: Value stored under the "pos" key.
        pos1: Value stored under the "pos1" key.
    """

    def __init__(self, data):
        """Copy the four fields out of ``data``.

        Args:
            data: Mapping providing the keys "surface", "base", "pos" and
                "pos1". A missing key raises ``KeyError``.
        """
        for field in ("surface", "base", "pos", "pos1"):
            setattr(self, field, data[field])

    def __repr__(self):
        """Return the fields as tab-separated ``name[value]`` pairs."""
        labelled = (
            ("surface", self.surface),
            ("base", self.base),
            ("pos", self.pos),
            ("pos1", self.pos1),
        )
        return "\t".join("{}[{}]".format(name, value) for name, value in labelled)
def read_file(fpath: str) -> List[List[str]]:
    """Load a CaboCha output file and group its lines by sentence.

    The file is read as UTF-8 and cut at every "EOS" line terminator. Each
    piece is stripped of surrounding whitespace and pieces that end up empty
    are dropped.

    Args:
        fpath (str): Path of the CaboCha output file.

    Returns:
        List[List[str]]: One list per sentence, holding that sentence's lines
        (chunk header lines starting with "*" and morpheme lines of the form
        "<surface><TAB><comma-separated features>"), e.g.
            ['* 0 1D 0/1 0.000000',
             '吾輩<TAB>名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
             'は<TAB>助詞,係助詞,*,*,*,*,は,ハ,ワ',
             ...]
    """
    with open(fpath, mode="rt", encoding="utf-8") as handle:
        raw_text = handle.read()

    grouped = []
    for chunk in raw_text.split("EOS\n"):
        body = chunk.strip()
        if body:
            grouped.append(body.split("\n"))
    return grouped
# ans40
def convert_sent_to_morph(sent: List[str]) -> "List[Morph]":
    """Build Morph objects from the lines of one parsed sentence.

    A morpheme line has the form "<surface><TAB><comma-separated features>".
    The surface is separated at the tab *before* the features are split on
    commas, so a surface that itself contains a comma is parsed correctly.
    Lines without a tab (chunk headers such as "* 0 2D 0/0 -0.764522", a
    stray "EOS", or blank lines) carry no morpheme and are skipped.

    Args:
        sent (List[str]): Lines of one sentence, chunk headers included.

    Returns:
        List[Morph]: One Morph per morpheme line, in input order. The feature
        at index 0 becomes ``pos``, index 1 ``pos1`` and index 6 ``base``.

    Raises:
        IndexError: If a morpheme line has fewer than seven features.
    """
    res = []
    for word in sent:
        surface, tab, feature_text = word.partition("\t")
        if not tab:
            # No tab means this is not a morpheme line.
            continue
        features = feature_text.split(",")
        res.append(
            Morph(
                {
                    "surface": surface,
                    "base": features[6],
                    "pos": features[0],
                    "pos1": features[1],
                }
            )
        )
    return res
# Load the pre-generated CaboCha output and turn every sentence into Morphs.
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
morph_sents = list(map(convert_sent_to_morph, sentences))

# Show the morphemes of the third sentence.
for m in morph_sents[2]:
    print(m)  # print() uses str(), which falls back to Morph.__repr__
# Expected output:
# surface[名前] base[名前] pos[名詞] pos1[一般]
# surface[は] base[は] pos[助詞] pos1[係助詞]
# surface[まだ] base[まだ] pos[副詞] pos1[助詞類接続]
# surface[無い] base[無い] pos[形容詞] pos1[自立]
# surface[。] base[。] pos[記号] pos1[句点]
ans40_2.py
from typing import List
import CaboCha
def read_file(path: str) -> List[str]:
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.

    Args:
        path (str): Path of the text file to read.

    Returns:
        List[str]: Each line with leading and trailing whitespace removed;
        lines that are empty after stripping are omitted.
    """
    with open(path, encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]
class Morph:
    """A morpheme: surface form, base form and two part-of-speech fields."""

    def __init__(self, surface, base, pos, pos1):
        """Store the four fields unchanged."""
        self.surface, self.base = surface, base  # surface form, base form
        self.pos, self.pos1 = pos, pos1  # part of speech, POS subcategory 1

    def __str__(self):
        """Return the fields as tab-separated ``name[value]`` pairs."""
        pairs = (
            ("surface", self.surface),
            ("base", self.base),
            ("pos", self.pos),
            ("pos1", self.pos1),
        )
        return "\t".join("{}[{}]".format(*pair) for pair in pairs)
# Parser shared by all get_morph() calls; created on first use.
_SHARED_PARSER = None


def get_morph(sent: str) -> list:
    """Parse one sentence with CaboCha and return its morphemes.

    The sentence is parsed in lattice format. Each morpheme line has the form
    "<surface><TAB><comma-separated features>"; the surface is separated at
    the tab before the features are split on commas, so a surface containing
    a comma is handled. Lines without a tab (chunk headers starting with "*",
    the closing "EOS", blank lines) are skipped.

    A single parser instance is reused across calls instead of constructing
    a new one per sentence.

    Args:
        sent (str): Raw sentence text.

    Returns:
        list: Morph objects in sentence order. The feature at index 0 becomes
        ``pos``, index 1 ``pos1`` and index 6 ``base``.

    Raises:
        IndexError: If a morpheme line has fewer than seven features.
    """
    global _SHARED_PARSER
    if _SHARED_PARSER is None:
        _SHARED_PARSER = CaboCha.Parser()

    parsed_sent = _SHARED_PARSER.parse(sent).toString(CaboCha.FORMAT_LATTICE)
    # e.g. lines: '* 0 -1D 0/0 0.000000', '一<TAB>名詞,数,*,*,*,*,一,イチ,イチ', 'EOS'

    morphs = []
    for word in parsed_sent.split("\n"):
        surface, tab, feature_text = word.partition("\t")
        if not tab:
            # Dependency header, "EOS" or blank line: no morpheme here.
            continue
        features = feature_text.split(",")
        morphs.append(
            Morph(
                surface,
                features[6],  # base
                features[0],  # pos
                features[1],  # pos1
            )
        )
    return morphs
# Read the raw text, parse every sentence, then print the fourth sentence.
file_path = "neko.txt"
sentence_list = read_file(file_path)
# e.g. ['一', '吾輩は猫である。', '名前はまだ無い。', 'どこで生れたかとんと見当がつかぬ。']
morphs = list(map(get_morph, sentence_list))  # CaboCha parsing takes a while

for m in morphs[3]:
    print(m)
# Expected output:
# surface[どこ] base[どこ] pos[名詞] pos1[代名詞]
# surface[で] base[で] pos[助詞] pos1[格助詞]
# surface[生れ] base[生れる] pos[動詞] pos1[自立]
# surface[た] base[た] pos[助動詞] pos1[*]
# surface[か] base[か] pos[助詞] pos1[副助詞/並立助詞/終助詞]
# surface[とんと] base[とんと] pos[副詞] pos1[一般]
# surface[見当] base[見当] pos[名詞] pos1[サ変接続]
# surface[が] base[が] pos[助詞] pos1[格助詞]
# surface[つか] base[つく] pos[動詞] pos1[自立]
# surface[ぬ] base[ぬ] pos[助動詞] pos1[*]
# surface[。] base[。] pos[記号] pos1[句点]
ans40_parse_to_cabocha_format.py
import CaboCha
def parse_txt(file_in: str, file_out: str) -> None:
    """Parse a text file line by line and write CaboCha lattice output.

    Both files are opened explicitly as UTF-8 so the output can be read back
    with ``encoding="utf-8"`` regardless of the platform's default encoding.
    Blank lines in the input are skipped; one parser is reused for all lines.

    Args:
        file_in (str): Path of the raw text file, one sentence per line.
        file_out (str): Path of the file to write; it is overwritten.
    """
    with open(file_in, encoding="utf-8") as f_in, open(
        file_out, "w", encoding="utf-8"
    ) as f_out:
        cabocha = CaboCha.Parser()
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parsed_sent = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            f_out.write(parsed_sent)
# Convert the raw text into the CaboCha-format file.
file_in, file_out = "neko.txt", "neko.txt.cabocha"
parse_txt(file_in, file_out)
ans40.sh
# Parse neko.txt with CaboCha (-f1 output) into neko.txt.cabocha.
# The input redirect replaces the extra `cat` process; it is listed first so
# that a missing neko.txt aborts the command before the output is truncated.
# NOTE(review): the notes in ans40.py name neko.txt.cabocha2 as this script's
# output — confirm which file name is intended.
cabocha -f1 < neko.txt > neko.txt.cabocha