More than 3 years have passed since last update.

言語処理100本ノック(2020): 40

Posted at 2020-09-30

`ans40.py`

"""
40. 係り受け解析結果の読み込み（形態素）
形態素を表すクラスMorphを実装せよ．このクラスは表層形（surface），基本形（base），品詞（pos），品詞細分類1（pos1）をメンバ変数に持つこととする．
さらに，CaboChaの解析結果（neko.txt.cabocha）を読み込み，各文をMorphオブジェクトのリストとして表現し，3文目の形態素列を表示せよ．

ans40_parse_to_cabocha_format.pyでneko.txt.cabochaを生成する。
ans40.shで生成した例はneko.txt.cabocha2。比較をすれば、neko.txt.cabochaの方が綺麗。

ans40.pyの実装は直接neko.txt.cabochaを読み込んでいるから、スピードが早い。
ans40_2.pyの実装は解析時間がかかる。おすすめしない。
"""
from typing import List


class Morph:
    """A single morpheme: surface form, base form, POS and POS subcategory 1.

    Args:
        data: Mapping that provides the keys "surface", "base", "pos" and
            "pos1"; each value is stored as the attribute of the same name.
    """

    def __init__(self, data):
        # Copy the four fields in a fixed order; a missing key raises KeyError.
        for key in ("surface", "base", "pos", "pos1"):
            setattr(self, key, data[key])

    def __repr__(self):
        """Return the four fields as tab-separated ``name[value]`` pairs."""
        template = "surface[{0.surface}]\tbase[{0.base}]\tpos[{0.pos}]\tpos1[{0.pos1}]"
        return template.format(self)


def read_file(fpath: str) -> List[List[str]]:
    """Load a CaboCha lattice-format file and group its lines by sentence.

    A sentence ends at a line that consists solely of ``EOS``. The file is
    scanned line by line so that the marker is recognised even when the final
    ``EOS`` has no trailing newline, and so that field values made of
    whitespace characters (e.g. an ideographic space) are left untouched.

    Args:
        fpath (str): Path to the UTF-8 encoded CaboCha output file.

    Returns:
        List[List[str]]: One list per non-empty sentence. Each inner list holds
            the raw lines of that sentence (chunk headers and token lines)
            without the line terminator and without the ``EOS`` marker.
            e.g. result[1]:
              ['* 0 1D 0/1 0.000000',
               '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
               'は\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
               '* 1 -1D 0/2 0.000000',
               '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
               'で\t助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ',
               'ある\t助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル',
               '。\t記号,句点,*,*,*,*,。,。,。']
    """
    sentences = []
    current = []
    with open(fpath, mode="rt", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if line == "EOS":
                # Sentence boundary: keep the sentence only if it has content.
                if current:
                    sentences.append(current)
                current = []
            elif line:
                current.append(line)
    # Lines after the last EOS marker still form a sentence.
    if current:
        sentences.append(current)
    return sentences


# ans40
def convert_sent_to_morph(sent: List[str]) -> List[Morph]:
    """Convert the raw lines of one sentence into Morph objects.

    Token lines have the form ``surface<TAB>comma-separated features``. The
    surface is separated at the tab first, so a surface that itself contains a
    comma or is the character ``*`` is still parsed as a token. Lines without
    a tab (chunk headers such as ``* 0 2D 0/0 -0.764522``, or blank lines) are
    skipped.

    Args:
        sent (List[str]): Lines of one sentence.
            e.g. sent:
                ['* 0 2D 0/0 -0.764522',
                 '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                 'は\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
                 '* 1 -1D 0/2 0.000000',
                 '猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
                 '。\t記号,句点,*,*,*,*,。,。,。']

    Returns:
        List[Morph]: One Morph per token line, in input order.

    Raises:
        IndexError: If a token line has fewer than seven feature fields.
    """
    res = []
    for line in sent:
        surface, tab, feature_str = line.partition("\t")
        if not tab:
            # Not a token line (chunk header or blank line).
            continue
        features = feature_str.split(",")
        res.append(
            Morph(
                {
                    "surface": surface,
                    "base": features[6],
                    "pos": features[0],
                    "pos1": features[1],
                }
            )
        )

    return res


# Load the pre-parsed CaboCha output and convert every sentence to Morph objects.
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
morph_sents = list(map(convert_sent_to_morph, sentences))

# Show the third sentence, one morpheme per line.
# Morph defines only __repr__, which print() falls back to.
for m in morph_sents[2]:
    print(m)

# surface[名前]   base[名前]      pos[名詞]       pos1[一般]
# surface[は]     base[は]        pos[助詞]       pos1[係助詞]
# surface[まだ]   base[まだ]      pos[副詞]       pos1[助詞類接続]
# surface[無い]   base[無い]      pos[形容詞]     pos1[自立]
# surface[。]     base[。]        pos[記号]       pos1[句点]

`ans40_2.py`

from typing import List

import CaboCha


def read_file(path: str) -> List[str]:
    """Read a text file and return its non-blank lines, stripped.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, matching how the CaboCha output file is read elsewhere
    in this article.

    Args:
        path (str): Path to the UTF-8 encoded text file.

    Returns:
        List[str]: Each line with surrounding whitespace removed by
            ``str.strip``; lines that are empty after stripping are dropped.
    """
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]


class Morph:
    """A single morpheme.

    Attributes are set from the constructor arguments of the same name:
    surface (surface form), base (base form), pos (part of speech) and
    pos1 (part-of-speech subcategory 1).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base, self.pos, self.pos1 = surface, base, pos, pos1

    def __str__(self):
        """Return the four fields as tab-separated ``name[value]`` pairs."""
        labelled = (
            ("surface", self.surface),
            ("base", self.base),
            ("pos", self.pos),
            ("pos1", self.pos1),
        )
        return "\t".join("{}[{}]".format(name, value) for name, value in labelled)


def get_morph(sent: str, parser=None) -> list:
    """Parse one sentence with CaboCha and return its morphemes.

    Args:
        sent (str): Raw sentence text.
        parser: Optional ``CaboCha.Parser`` to use. When omitted, a parser is
            created on the first call and reused by later calls instead of
            constructing a new one per sentence.

    Returns:
        list: Morph objects, one per token line of the lattice output, in
            output order.

    Raises:
        IndexError: If a token line has fewer than seven feature fields.
    """
    if parser is None:
        parser = getattr(get_morph, "_parser", None)
        if parser is None:
            parser = get_morph._parser = CaboCha.Parser()

    lattice = parser.parse(sent).toString(CaboCha.FORMAT_LATTICE)
    # e.g. lines: ['* 0 -1D 0/0 0.000000', '一\t名詞,数,*,*,*,*,一,イチ,イチ', 'EOS']

    morphs = []
    for line in lattice.split("\n"):
        # Token lines are "surface<TAB>features". Dependency headers ("* ..."),
        # the EOS marker and blank lines contain no tab and are skipped.
        surface, tab, feature_str = line.partition("\t")
        if not tab:
            continue

        # Split the surface off at the tab first so that a surface containing
        # a comma does not shift the feature indexes.
        features = feature_str.split(",")

        morphs.append(
            Morph(
                surface,  # surface
                features[6],  # base
                features[0],  # pos
                features[1],  # pos1
            )
        )

    return morphs


# Read the raw text, one non-blank line per entry.
file_path = "neko.txt"
sentence_list = read_file(file_path)
# e.g. ['一', '吾輩は猫である。', '名前はまだ無い。', 'どこで生れたかとんと見当がつかぬ。']

# Parsing every sentence with CaboCha takes a while.
morphs = list(map(get_morph, sentence_list))
for m in morphs[3]:
    print(m)

# surface[どこ]   base[どこ]      pos[名詞]       pos1[代名詞]
# surface[で]     base[で]        pos[助詞]       pos1[格助詞]
# surface[生れ]   base[生れる]    pos[動詞]       pos1[自立]
# surface[た]     base[た]        pos[助動詞]     pos1[*]
# surface[か]     base[か]        pos[助詞]       pos1[副助詞／並立助詞／終助詞]
# surface[とんと] base[とんと]    pos[副詞]       pos1[一般]
# surface[見当]   base[見当]      pos[名詞]       pos1[サ変接続]
# surface[が]     base[が]        pos[助詞]       pos1[格助詞]
# surface[つか]   base[つく]      pos[動詞]       pos1[自立]
# surface[ぬ]     base[ぬ]        pos[助動詞]     pos1[*]
# surface[。]     base[。]        pos[記号]       pos1[句点]

`ans40_parse_to_cabocha_format.py`

import CaboCha


def parse_txt(file_in: str, file_out: str) -> None:
    """Parse a text file with CaboCha and write the lattice-format result.

    Each non-blank line of ``file_in`` is stripped, parsed, and its lattice
    output is appended to ``file_out``. Both files use UTF-8 explicitly so the
    result does not depend on the platform's default encoding.

    Args:
        file_in (str): Path to the UTF-8 encoded input text.
        file_out (str): Path of the file to write; it is overwritten.
    """
    # Build the parser before opening the files so that a failure here does
    # not leave file_out truncated.
    cabocha = CaboCha.Parser()

    with open(file_in, encoding="utf-8") as f_in, open(
        file_out, "w", encoding="utf-8"
    ) as f_out:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parsed_sent = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            f_out.write(parsed_sent)


# Input text and the CaboCha-format file generated from it.
file_in, file_out = "neko.txt", "neko.txt.cabocha"

parse_txt(file_in, file_out)

`ans40.sh`

# Parse neko.txt with CaboCha in lattice format (-f1), reading the text on stdin.
# NOTE(review): the ans40.py docstring refers to this script's output as
# neko.txt.cabocha2, but this command writes neko.txt.cabocha - confirm the
# intended file name.
cabocha -f1 < neko.txt > neko.txt.cabocha

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up