第5章の前半の問題を解いた記録。
対象とするファイルはwebページにもある通り、neko.txtとする。
夏目漱石の小説『吾輩は猫である』の文章(neko.txt)をCaboChaを使って係り受け解析し,その結果をneko.txt.cabochaというファイルに保存せよ.このファイルを用いて,以下の問に対応するプログラムを実装せよ.
40. 係り受け解析結果の読み込み(形態素)
形態素を表すクラスMorphを実装せよ.このクラスは表層形(surface),基本形(base),品詞(pos),品詞細分類1(pos1)をメンバ変数に持つこととする.さらに,CaboChaの解析結果(neko.txt.cabocha)を読み込み,各文をMorphオブジェクトのリストとして表現し,3文目の形態素列を表示せよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
class Morph:
    """One morpheme from a CaboCha parse.

    Holds the surface form plus three features taken from the
    comma-separated feature string: base form, part of speech (pos)
    and POS subcategory 1 (pos1).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface = surface  # surface form as it appears in text
        self.base = base        # dictionary (base) form
        self.pos = pos          # part of speech
        self.pos1 = pos1        # POS subcategory 1

    def print_all(self):
        """Return a one-line summary: surface<TAB>base, pos, pos1."""
        features = ", ".join([self.base, self.pos, self.pos1])
        return "%s\t%s" % (self.surface, features)
def read_morpheme(cabochafile):
    """Parse a CaboCha output stream into sentences of Morph objects.

    Each element of the returned list is one sentence, itself a list
    of Morph instances.  Chunk header lines (starting with '*') are
    ignored; an 'EOS' line closes the current sentence.  Note that an
    'EOS' immediately following another 'EOS' produces an empty
    sentence list, so sentence indices track the EOS count in the file.
    """
    sentences = []
    current = []
    for line in cabochafile:
        if line == "EOS\n":
            sentences.append(current)
            current = []
        elif line.startswith("*"):
            # Dependency (chunk) lines are irrelevant at morpheme level.
            continue
        else:
            # Format: surface<whitespace>feature1,feature2,...
            surface, feature_str = line.split()
            features = feature_str.split(",")
            # features[0]=pos, features[1]=pos1, features[6]=base form
            current.append(
                Morph(surface, features[6], features[0], features[1]))
    return sentences
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = read_morpheme(f)
for morph in sentences[2]:
print morph.print_all()
f.close()
41. 係り受け解析結果の読み込み(文節・係り受け)
40に加えて,文節を表すクラスChunkを実装せよ.このクラスは形態素(Morphオブジェクト)のリスト(morphs),係り先文節インデックス番号(dst),係り元文節インデックス番号のリスト(srcs)をメンバ変数に持つこととする.さらに,入力テキストのCaboChaの解析結果を読み込み,1文をChunkオブジェクトのリストとして表現し,8文目の文節の文字列と係り先を表示せよ.第5章の残りの問題では,ここで作ったプログラムを活用せよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem40
class Chunk:
    """A bunsetsu (chunk) from a CaboCha parse.

    Attributes:
        morphs: list of Morph objects belonging to this chunk.
        dst:    index of the chunk this one depends on (-1 = root).
        srcs:   indices of the chunks that depend on this one.
    """

    def __init__(self):
        self.morphs = []
        self.dst = -1
        self.srcs = []

    def __repr__(self):
        """Surface text of the chunk with symbol morphemes removed.

        Fix: the original implicitly returned None for a chunk with no
        morphemes, which makes repr() raise TypeError; return "" instead.
        """
        surfs = [morph.surface for morph in self.morphs
                 if morph.pos != '記号']
        return "".join(surfs)

    def include_pos(self, pos):
        """Return True if any morpheme in this chunk has the given POS."""
        return pos in [morph.pos for morph in self.morphs]

    def morphs_of_pos(self, pos):
        """Return all morphemes whose pos equals the given value."""
        return [morph for morph in self.morphs if morph.pos == pos]

    def morphs_of_pos1(self, pos1):
        """Return all morphemes whose pos1 equals the given value."""
        return [morph for morph in self.morphs if morph.pos1 == pos1]
def read_chunk(cabochafile):
    """Parse CaboCha output into sentences of Chunk objects.

    Returns a list of sentences; each sentence is a list of Chunk
    instances whose dst/srcs links have been filled in.
    """
    sentences = []
    current = []
    for line in cabochafile:
        if line == "EOS\n":
            # Back-fill srcs from each chunk's dst.  The final chunk
            # is deliberately skipped: as the sentence root it has no
            # forward dependency to record.
            for i, chunk in enumerate(current[:-1]):
                if chunk.dst != -1:
                    current[chunk.dst].srcs.append(i)
            sentences.append(current)
            current = []
        elif line.startswith("*"):
            # Header like '* 0 5D ...': field 2 is the dst index + 'D'.
            new_chunk = Chunk()
            new_chunk.dst = int(line.split()[2].strip("D"))
            current.append(new_chunk)
        else:
            # Morpheme line: surface<whitespace>feature1,feature2,...
            surface, feature_str = line.split()
            features = feature_str.split(",")
            # Morph(surface, base, pos, pos1)
            current[-1].morphs.append(
                problem40.Morph(surface, features[6],
                                features[0], features[1]))
    return sentences
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = read_chunk(f)
for idx, chnk in enumerate(sentences[7]):
surfaces = ""
for mrph in chnk.morphs:
surfaces += mrph.surface
print "%d" % idx, surfaces, "=>", chnk.dst
f.close()
42. 係り元と係り先の文節の表示
係り元の文節と係り先の文節のテキストをタブ区切り形式ですべて抽出せよ.ただし,句読点などの記号は出力しないようにせよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
def make_chunk_pair(sentence):
    """Return (dependent, head) chunk pairs for one sentence.

    A chunk whose dst is -1 (the sentence root) yields no pair.
    """
    return [(chunk, sentence[chunk.dst])
            for chunk in sentence
            if chunk.dst != -1]
if __name__ == "__main__":
f = open("neko.txt.cabocha")
sentences = problem41.read_chunk(f)
pair_sentences = []
for sentence in sentences:
pair = make_chunk_pair(sentence)
pair_sentences.append(pair)
for sentence in pair_sentences:
for pair in sentence:
print "\t".join([str(chunk) for chunk in pair])
f.close()
43. 名詞を含む文節が動詞を含む文節に係るものを抽出
名詞を含む文節が,動詞を含む文節に係るとき,これらをタブ区切り形式で抽出せよ.ただし,句読点などの記号は出力しないようにせよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
import problem42
def findNtoV(chunk_pair):
    """Return True when the dependent chunk of *chunk_pair* contains a
    noun and its head chunk contains a verb."""
    dependent, head = chunk_pair
    has_noun = any(morph.pos == "名詞" for morph in dependent.morphs)
    has_verb = any(morph.pos == "動詞" for morph in head.morphs)
    return has_noun and has_verb
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = problem41.read_chunk(f)
pair_sentences = []
for sentence in sentences:
pair = problem42.make_chunk_pair(sentence)
pair_sentences.append(pair)
pairs_NtoV = []
for pair_sentence in pair_sentences:
for chunk_pair in pair_sentence:
if findNtoV(chunk_pair):
pairs_NtoV.append(chunk_pair)
for pair_NtoV in pairs_NtoV:
noun, verb = pair_NtoV
print "%s\t%s" % (noun, verb)
f.close()
44. 係り受け木の可視化
与えられた文の係り受け木を有向グラフとして可視化せよ.可視化には,係り受け木をDOT言語に変換し,Graphvizを用いるとよい.また,Pythonから有向グラフを直接的に可視化するには,pydotを使うとよい.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem41
import problem42
def sentenceToDot(idx, sentence):
    """Build a Graphviz DOT digraph string for one sentence.

    *sentence* is a list of (dependent, head) chunk pairs; each pair
    becomes one left-to-right edge labelled by the chunks' str() forms.
    The graph is named 'sentence<idx>' so multiple graphs can coexist.
    """
    edges = "".join('"%s"->"%s"; ' % (str(src), str(dst))
                    for src, dst in sentence)
    return ("digraph sentence{0} ".format(idx)
            + "{ graph [rankdir = LR]; " + edges + "}")
if __name__ == "__main__":
f = open("neko.txt.cabocha", "r")
sentences = problem41.read_chunk(f)
pair_sentences = []
for sentence in sentences:
pair = problem42.make_chunk_pair(sentence)
pair_sentences.append(pair)
# dotStrings = []
for idx, sentence in enumerate(pair_sentences):
dotString = sentenceToDot(idx, sentence)
print dotString
# dotStrings.append(dotString)
f.close()