"""
30. 形態素解析結果の読み込み
形態素解析結果(neko.txt.mecab)を読み込むプログラムを実装せよ.
ただし,各形態素は表層形(surface),基本形(base),品詞(pos),品詞細分類1(pos1)をキーとするマッピング型に格納し,
1文を形態素(マッピング型)のリストとして表現せよ.第4章の残りの問題では,ここで作ったプログラムを活用せよ.
- `neko.txt.mecab` は`ans30.sh`で作成された。
- これからの質問は`30_neko_mecab.json`を使う
品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音
['名詞', '代名詞', '一般', '*', '*', '*', '吾輩', 'ワガハイ', 'ワガハイ']
"""
from typing import List
import MeCab
import utils
def read_file(path: str) -> List[str]:
    """Read a text file and return its non-blank lines, stripped.

    Args:
        path (str): Path of the text file to read.

    Returns:
        List[str]: Every line with leading/trailing whitespace removed, in
            file order; lines that are empty after stripping are dropped.
    """
    # Decode explicitly as UTF-8 instead of the platform default so the
    # result does not depend on the locale; this matches the encoding the
    # JSON helpers in utils use.
    # NOTE(review): assumes the input file is UTF-8 encoded -- confirm.
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]
def parse(sent: str) -> List[dict]:
    """Analyse one sentence with MeCab and return its nodes as dicts.

    Uses the module-level ``tagger``, which must be bound before this
    function is called.

    Args:
        sent (str): The sentence to analyse.

    Returns:
        List[dict]: One dict per node, in node order, with the keys
            ``surface``, ``base``, ``pos`` and ``pos1``.
    """
    node = tagger.parseToNode(sent)
    morphemes = []
    # NOTE(review): every node in the linked list is emitted; in MeCab this
    # presumably includes the BOS/EOS sentinel nodes (empty surface) --
    # confirm whether downstream code expects them.
    while node:
        # Feature CSV layout (see the module docstring):
        # pos, pos1, pos2, pos3, conjugation form, conjugation type,
        # base form, reading, pronunciation
        features = node.feature.split(",")
        morphemes.append(
            {
                "surface": node.surface,  # surface form
                "base": features[6],  # base (dictionary) form
                "pos": features[0],  # part of speech
                "pos1": features[1],  # part-of-speech subcategory 1
            }
        )
        node = node.next
    return morphemes
# Driver for exercise 30: read the raw text, analyse every line with MeCab,
# and store the result as JSON.
file_path = "neko.txt"
data = read_file(file_path)
# e.g. data starts with:
# ['一', '吾輩は猫である。', '名前はまだ無い。', 'どこで生れたかとんと見当がつかぬ。']
# parse() looks `tagger` up as a module-level name, so it has to be bound
# before parse() is first called on the next line. The "-r" option string
# carries a machine-specific mecabrc path.
tagger = MeCab.Tagger("-r /usr/local/etc/mecabrc")
result = [parse(sent) for sent in data]
# Answer to exercise 30: persist the parsed sentences as JSON.
utils.save_json(result, "30_neko_mecab.json")
# Read the file back; note that this rebinds `data` from the raw lines to the
# parsed structure loaded from the JSON file.
data = utils.read_json("30_neko_mecab.json")
# utils.py:
import itertools
import json
from typing import Any, List
def save_json(data: Any, save_path: str) -> None:
    """Serialize ``data`` as JSON and write it to ``save_path``.

    The file is opened for writing as UTF-8; the JSON is indented by two
    spaces and non-ASCII characters are written verbatim rather than as
    ASCII escape sequences.

    Args:
        data (Any): The object to serialize.
        save_path (str): Destination file path.
    """
    with open(save_path, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, ensure_ascii=False, indent=2)
def read_json(path: str) -> List[Any]:
    """Load and decode a UTF-8 encoded JSON file.

    Args:
        path (str): Path of the JSON file to read.

    Returns:
        List[Any]: The decoded JSON document. The annotation says list, but
            the value is whatever the file's top-level JSON value decodes to.
    """
    with open(path, mode="r", encoding="utf-8") as src:
        return json.loads(src.read())
def flat(sequence: List[List[Any]]) -> List[Any]:
    """Flatten one level of nesting.

    Args:
        sequence (List[List[Any]]): A list of lists.

    Returns:
        List[Any]: The items of every inner list, concatenated in order.
    """
    return [item for inner in sequence for item in inner]