第5章の前半の問題を解いた記録。
対象とするファイルはwebページにもある通り、neko.txtとする。
夏目漱石の小説『吾輩は猫である』の文章(neko.txt)をCaboChaを使って係り受け解析し,その結果をneko.txt.cabochaというファイルに保存せよ.このファイルを用いて,以下の問に対応するプログラムを実装せよ.
40. 係り受け解析結果の読み込み(形態素)
形態素を表すクラスMorphを実装せよ.このクラスは表層形(surface),基本形(base),品詞(pos),品詞細分類1(pos1)をメンバ変数に持つこととする.さらに,CaboChaの解析結果(neko.txt.cabocha)を読み込み,各文をMorphオブジェクトのリストとして表現し,3文目の形態素列を表示せよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
class Morph:
    """One morpheme from a CaboCha parse.

    Holds the surface form plus three features taken from the
    comma-separated feature string: base form, part of speech (pos)
    and POS subcategory 1 (pos1).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface  # surface form as it appears in text
        self.base = base        # dictionary (base) form
        self.pos = pos          # part of speech
        self.pos1 = pos1        # POS subcategory 1

    def print_all(self):
        """Return a one-line summary: surface<TAB>base, pos, pos1."""
        features = ", ".join([self.base, self.pos, self.pos1])
        return "%s\t%s" % (self.surface, features)
def read_morpheme(cabochafile):
    """Parse a CaboCha output stream into sentences of Morph objects.

    Each element of the returned list is one sentence, itself a list
    of Morph instances.  Chunk header lines (starting with '*') are
    ignored; an 'EOS' line closes the current sentence.  Note that an
    'EOS' immediately following another 'EOS' produces an empty
    sentence list, so sentence indices track the EOS count in the file.
    """
    sentences = []
    current = []
    for line in cabochafile:
        if line == "EOS\n":
            sentences.append(current)
            current = []
        elif line.startswith("*"):
            # Dependency (chunk) lines are irrelevant at morpheme level.
            continue
        else:
            # Format: surface<whitespace>feature1,feature2,...
            surface, feature_str = line.split()
            features = feature_str.split(",")
            # features[0]=pos, features[1]=pos1, features[6]=base form
            current.append(
                Morph(surface, features[6], features[0], features[1]))
    return sentences
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = read_morpheme(f)
for morph in sentences[2]:
print morph.print_all()
f.close()
41. 係り受け解析結果の読み込み(文節・係り受け)
40に加えて,文節を表すクラスChunkを実装せよ.このクラスは形態素(Morphオブジェクト)のリスト(morphs),係り先文節インデックス番号(dst),係り元文節インデックス番号のリスト(srcs)をメンバ変数に持つこととする.さらに,入力テキストのCaboChaの解析結果を読み込み,1文をChunkオブジェクトのリストとして表現し,8文目の文節の文字列と係り先を表示せよ.第5章の残りの問題では,ここで作ったプログラムを活用せよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem40
class Chunk:
    """A bunsetsu (chunk) from a CaboCha parse.

    Attributes:
        morphs: list of Morph objects belonging to this chunk.
        dst:    index of the chunk this one depends on (-1 = root).
        srcs:   indices of the chunks that depend on this one.
    """

    def __init__(self):
        self.morphs = []
        self.dst = -1
        self.srcs = []

    def __repr__(self):
        """Surface text of the chunk with symbol morphemes removed.

        Fix: the original implicitly returned None for a chunk with no
        morphemes, which makes repr() raise TypeError; return "" instead.
        """
        surfs = [morph.surface for morph in self.morphs
                 if morph.pos != '記号']
        return "".join(surfs)

    def include_pos(self, pos):
        """Return True if any morpheme in this chunk has the given POS."""
        return pos in [morph.pos for morph in self.morphs]

    def morphs_of_pos(self, pos):
        """Return all morphemes whose pos equals the given value."""
        return [morph for morph in self.morphs if morph.pos == pos]

    def morphs_of_pos1(self, pos1):
        """Return all morphemes whose pos1 equals the given value."""
        return [morph for morph in self.morphs if morph.pos1 == pos1]
def read_chunk(cabochafile):
    """Parse CaboCha output into sentences of Chunk objects.

    Returns a list of sentences; each sentence is a list of Chunk
    instances whose dst/srcs links have been filled in.
    """
    sentences = []
    current = []
    for line in cabochafile:
        if line == "EOS\n":
            # Back-fill srcs from each chunk's dst.  The final chunk
            # is deliberately skipped: as the sentence root it has no
            # forward dependency to record.
            for i, chunk in enumerate(current[:-1]):
                if chunk.dst != -1:
                    current[chunk.dst].srcs.append(i)
            sentences.append(current)
            current = []
        elif line.startswith("*"):
            # Header like '* 0 5D ...': field 2 is the dst index + 'D'.
            new_chunk = Chunk()
            new_chunk.dst = int(line.split()[2].strip("D"))
            current.append(new_chunk)
        else:
            # Morpheme line: surface<whitespace>feature1,feature2,...
            surface, feature_str = line.split()
            features = feature_str.split(",")
            # Morph(surface, base, pos, pos1)
            current[-1].morphs.append(
                problem40.Morph(surface, features[6],
                                features[0], features[1]))
    return sentences
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = read_chunk(f)
for idx, chnk in enumerate(sentences[7]):
surfaces = ""
for mrph in chnk.morphs:
surfaces += mrph.surface
print "%d" % idx, surfaces, "=>", chnk.dst
f.close()
42. 係り元と係り先の文節の表示
係り元の文節と係り先の文節のテキストをタブ区切り形式ですべて抽出せよ.ただし,句読点などの記号は出力しないようにせよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
def make_chunk_pair(sentence):
    """Return (dependent, head) chunk pairs for one sentence.

    A chunk whose dst is -1 (the sentence root) yields no pair.
    """
    return [(chunk, sentence[chunk.dst])
            for chunk in sentence
            if chunk.dst != -1]
if __name__ == "__main__":
f = open("neko.txt.cabocha")
sentences = problem41.read_chunk(f)
pair_sentences = []
for sentence in sentences:
pair = make_chunk_pair(sentence)
pair_sentences.append(pair)
for sentence in pair_sentences:
for pair in sentence:
print "\t".join([str(chunk) for chunk in pair])
f.close()
43. 名詞を含む文節が動詞を含む文節に係るものを抽出
名詞を含む文節が,動詞を含む文節に係るとき,これらをタブ区切り形式で抽出せよ.ただし,句読点などの記号は出力しないようにせよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
import problem42
def findNtoV(chunk_pair):
    """Return True when the dependent chunk of *chunk_pair* contains a
    noun and its head chunk contains a verb."""
    dependent, head = chunk_pair
    has_noun = any(morph.pos == "名詞" for morph in dependent.morphs)
    has_verb = any(morph.pos == "動詞" for morph in head.morphs)
    return has_noun and has_verb
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = problem41.read_chunk(f)
pair_sentences = []
for sentence in sentences:
pair = problem42.make_chunk_pair(sentence)
pair_sentences.append(pair)
pairs_NtoV = []
for pair_sentence in pair_sentences:
for chunk_pair in pair_sentence:
if findNtoV(chunk_pair):
pairs_NtoV.append(chunk_pair)
for pair_NtoV in pairs_NtoV:
noun, verb = pair_NtoV
print "%s\t%s" % (noun, verb)
f.close()
44. 係り受け木の可視化
与えられた文の係り受け木を有向グラフとして可視化せよ.可視化には,係り受け木をDOT言語に変換し,Graphvizを用いるとよい.また,Pythonから有向グラフを直接的に可視化するには,pydotを使うとよい.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
import problem42
def sentenceToDot(idx, sentence):
    """Build a Graphviz DOT digraph string for one sentence.

    *sentence* is a list of (dependent, head) chunk pairs; each pair
    becomes one left-to-right edge labelled by the chunks' str() forms.
    The graph is named 'sentence<idx>' so multiple graphs can coexist.
    """
    edges = "".join('"%s"->"%s"; ' % (str(src), str(dst))
                    for src, dst in sentence)
    return ("digraph sentence{0} ".format(idx)
            + "{ graph [rankdir = LR]; " + edges + "}")
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = problem41.read_chunk(f)
pair_sentences = []
for sentence in sentences:
pair = problem42.make_chunk_pair(sentence)
pair_sentences.append(pair)
# dotStrings = []
for idx, sentence in enumerate(pair_sentences):
dotString = sentenceToDot(idx, sentence)
print dotString
# dotStrings.append(dotString)
f.close()