0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

言語処理100本ノック(2020): 40

Posted at

ans40.py

"""
40. 係り受け解析結果の読み込み(形態素)
形態素を表すクラスMorphを実装せよ.このクラスは表層形(surface),基本形(base),品詞(pos),品詞細分類1(pos1)をメンバ変数に持つこととする.
さらに,CaboChaの解析結果(neko.txt.cabocha)を読み込み,各文をMorphオブジェクトのリストとして表現し,3文目の形態素列を表示せよ.

ans40_cabocha.pyでneko.txt.cabochaを生成する。
ans40.shで生成した例はneko.txt.cabocha2。比較をすれば、neko.txt.cabochaの方が綺麗。

ans40.pyの実装は直接neko.txt.cabochaを読み込んでいるから、スピードが早い。
ans40_2.pyの実装は解析時間がかかる。おすすめしない。
"""
from typing import List


class Morph:
    """One morpheme, built from a mapping of its fields.

    The constructor copies the values stored under the keys ``"surface"``,
    ``"base"``, ``"pos"`` and ``"pos1"`` onto attributes of the same name.
    """

    def __init__(self, data):
        # A missing key raises KeyError, exactly like a plain dict lookup.
        self.surface, self.base = data["surface"], data["base"]
        self.pos, self.pos1 = data["pos"], data["pos1"]

    def __repr__(self):
        # Tab-separated "name[value]" pairs; also used by str() and print().
        return (
            f"surface[{self.surface}]\tbase[{self.base}]"
            f"\tpos[{self.pos}]\tpos1[{self.pos1}]"
        )


def read_file(fpath: str) -> List[List[str]]:
    """Load a parsed file and group its lines by sentence.

    The file is read as UTF-8 and cut at every ``"EOS"`` line terminator.
    Each piece is stripped of surrounding whitespace; pieces that end up
    empty are dropped, the others are split into their individual lines.

    Args:
        fpath (str): Path of the parsed text file.

    Returns:
        List[List[str]]: One list of raw lines per sentence, e.g. (with
            <TAB> standing for a tab character)::

                ['* 0 1D 0/1 0.000000',
                 '吾輩<TAB>名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                 'は<TAB>助詞,係助詞,*,*,*,*,は,ハ,ワ',
                 ...]
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        raw_text = f.read()

    grouped = []
    for piece in raw_text.split("EOS\n"):
        body = piece.strip()
        if body:  # consecutive EOS markers leave empty pieces behind
            grouped.append(body.split("\n"))
    return grouped


# ans40
def convert_sent_to_morph(sent: List[str]) -> List[Morph]:
    """Convert the raw lines of one sentence into Morph objects.

    A token line has the form ``surface<TAB>feature,feature,...``. The
    surface is separated at the first tab *before* the feature list is split
    on commas, so a surface that is itself a comma, or that starts with
    ``*``, is parsed correctly. Lines without a tab (chunk headers such as
    ``* 0 2D 0/0 -0.764522``, a stray ``EOS`` or an empty line) are skipped.

    Args:
        sent (List[str]): Raw lines of one sentence, e.g. (with <TAB>
            standing for a tab character)::

                ['* 1 2D 0/1 -0.764522',
                 '吾輩<TAB>名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                 'は<TAB>助詞,係助詞,*,*,*,*,は,ハ,ワ']

    Returns:
        List[Morph]: One Morph per token line, in input order. ``pos`` is
            feature 0, ``pos1`` is feature 1 and ``base`` is feature 6.

    Raises:
        IndexError: If a token line carries fewer than seven features.
    """
    res = []
    for word in sent:
        surface, tab, feature_str = word.partition("\t")
        if not tab:
            # Not a token line: chunk header, EOS marker or blank line.
            continue
        features = feature_str.split(",")
        res.append(
            Morph(
                {
                    "surface": surface,
                    "base": features[6],
                    "pos": features[0],
                    "pos1": features[1],
                }
            )
        )

    return res


# Load the pre-parsed file and turn every sentence into a list of Morph objects.
fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
morph_sents = list(map(convert_sent_to_morph, sentences))

# Show the morphemes of the third sentence, one per line.
for m in morph_sents[2]:
    print(m)  # print() uses str(), which falls back to Morph.__repr__

# surface[名前]   base[名前]      pos[名詞]       pos1[一般]
# surface[は]     base[は]        pos[助詞]       pos1[係助詞]
# surface[まだ]   base[まだ]      pos[副詞]       pos1[助詞類接続]
# surface[無い]   base[無い]      pos[形容詞]     pos1[自立]
# surface[。]     base[。]        pos[記号]       pos1[句点]

ans40_2.py

from typing import List

import CaboCha


def read_file(path: str) -> List[str]:
    """Return the non-empty lines of a text file, stripped of whitespace.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's default encoding.

    Args:
        path (str): Path of the text file to read.

    Returns:
        List[str]: Every line with surrounding whitespace removed; lines
            that are empty after stripping are omitted.
    """
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]


class Morph:
    """One morpheme described by four string fields."""

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface  # surface form
        self.base = base  # base form
        self.pos = pos  # part of speech
        self.pos1 = pos1  # part-of-speech subcategory 1

    def __str__(self):
        # Tab-separated "name[value]" pairs in a fixed field order.
        field_names = ("surface", "base", "pos", "pos1")
        return "\t".join(
            "{}[{}]".format(name, getattr(self, name)) for name in field_names
        )


def get_morph(sent: str) -> list:
    """Parse one sentence with CaboCha and return its morphemes.

    The sentence is parsed into CaboCha's lattice text format, e.g. (with
    <TAB> standing for a tab character)::

        ['* 0 -1D 0/0 0.000000', '一<TAB>名詞,数,*,*,*,*,一,イチ,イチ', 'EOS']

    Only lines containing a tab are token lines; dependency headers
    (``* ...``), the ``EOS`` marker and blank lines are skipped. The surface
    is separated at the first tab before the feature list is split on
    commas, so a surface that is itself a comma, or that starts with ``*``,
    is parsed correctly.

    Args:
        sent (str): Raw sentence text.

    Returns:
        list: Morph objects in token order, built from the surface plus
            feature 6 (base), feature 0 (pos) and feature 1 (pos1).

    Raises:
        IndexError: If a token line carries fewer than seven features.
    """
    # Reuse one parser across calls instead of constructing a new one for
    # every sentence; it is stored on the function object after first use.
    parser = getattr(get_morph, "_parser", None)
    if parser is None:
        parser = get_morph._parser = CaboCha.Parser()

    lattice = parser.parse(sent).toString(CaboCha.FORMAT_LATTICE)

    morphs = []
    for word in lattice.strip().split("\n"):
        surface, tab, feature_str = word.partition("\t")
        if not tab:
            # Dependency header, EOS marker or blank line.
            continue
        features = feature_str.split(",")
        morphs.append(
            Morph(
                surface,
                features[6],  # base
                features[0],  # pos
                features[1],  # pos1
            )
        )

    return morphs


file_path = "neko.txt"
# Non-empty lines of the text, e.g.
# ['一', '吾輩は猫である。', '名前はまだ無い。', 'どこで生れたかとんと見当がつかぬ。']
sentence_list = read_file(file_path)

# Running CaboCha over every sentence takes a while.
morphs = list(map(get_morph, sentence_list))

# Show the morphemes of the fourth sentence, one per line.
for m in morphs[3]:
    print(m)

# surface[どこ]   base[どこ]      pos[名詞]       pos1[代名詞]
# surface[で]     base[で]        pos[助詞]       pos1[格助詞]
# surface[生れ]   base[生れる]    pos[動詞]       pos1[自立]
# surface[た]     base[た]        pos[助動詞]     pos1[*]
# surface[か]     base[か]        pos[助詞]       pos1[副助詞/並立助詞/終助詞]
# surface[とんと] base[とんと]    pos[副詞]       pos1[一般]
# surface[見当]   base[見当]      pos[名詞]       pos1[サ変接続]
# surface[が]     base[が]        pos[助詞]       pos1[格助詞]
# surface[つか]   base[つく]      pos[動詞]       pos1[自立]
# surface[ぬ]     base[ぬ]        pos[助動詞]     pos1[*]
# surface[。]     base[。]        pos[記号]       pos1[句点]

ans40_parse_to_cabocha_format.py

import CaboCha


def parse_txt(file_in: str, file_out: str) -> None:
    """Parse a text file with CaboCha and write the lattice-format output.

    Every non-empty line of ``file_in`` (after stripping surrounding
    whitespace) is parsed on its own and the lattice text of the result is
    appended to ``file_out``. Both files are opened as UTF-8 explicitly so
    the output does not depend on the platform's default encoding.

    Args:
        file_in (str): Path of the raw text file to parse.
        file_out (str): Path of the file to write; it is overwritten.
    """
    # One parser is shared by all lines; it is built before the output file
    # is opened so a parser failure does not truncate an existing file.
    cabocha = CaboCha.Parser()

    with open(file_in, encoding="utf-8") as f_in, open(
        file_out, "w", encoding="utf-8"
    ) as f_out:
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            parsed_sent = cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE)
            f_out.write(parsed_sent)


# Source text and destination of the CaboCha output.
file_in, file_out = "neko.txt", "neko.txt.cabocha"

parse_txt(file_in, file_out)

ans40.sh

# Parse neko.txt with CaboCha (-f1 output format) and save the result.
# Input redirection replaces the unnecessary `cat` pipeline.
cabocha -f1 < neko.txt > neko.txt.cabocha
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?