第6章の後半の問題を解いた記録。
対象とするファイルはwebページにもある通り、nlp.txtとする。
英語のテキスト(nlp.txt)に対して,以下の処理を実行せよ.
55. 固有表現抽出
入力文中の人名をすべて抜き出せ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import re

# The CoreNLP XML is scanned line-by-line with regexes rather than an XML
# parser; a <word> line always precedes its corresponding <NER> line.
WORD = re.compile(r"<word>(\w+)</word>")
NER = re.compile(r"<NER>(\w+)</NER>")


def extract_persons(lines):
    """Yield every token whose NER tag is PERSON.

    lines -- iterable of text lines from a CoreNLP XML token stream.
    """
    token = ""
    for line in lines:
        line = line.strip()
        word = WORD.search(line)
        if word:
            # Remember the latest token; its NER tag arrives on a later line.
            token = word.group(1)
            continue
        ner = NER.search(line)
        if ner and ner.group(1) == "PERSON":
            yield token


if __name__ == '__main__':
    # 'with' guarantees the handle is closed even on error paths
    # (the original only closed it on the success path).
    with open('nlp.txt.xml', 'r') as f:
        for person in extract_persons(f):
            print(person)
56. 共参照解析
Stanford Core NLPの共参照解析の結果に基づき,文中の参照表現(mention)を代表参照表現(representative mention)に置換せよ.ただし,置換するときは,「代表参照表現(参照表現)」のように,元の参照表現が分かるように配慮せよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import re
import xml.etree.ElementTree as et
from functools import partial
# Penn-Treebank token escapes emitted by the Stanford CoreNLP tokenizer;
# prettifySentence() substitutes them back to normal punctuation.
LRB = re.compile(r"-LRB- ")   # "-LRB- " -> opening round bracket "("
RRB = re.compile(r" -RRB-")   # " -RRB-" -> closing round bracket ")"
NOTATION = re.compile(r" ([,\.:;])")  # space the tokenizer put before punctuation
LDQ = re.compile(r"`` ")      # opening double quote
RDQ = re.compile(r" \'\'")    # closing double quote
SQ = re.compile(r" \'")       # detached single quote / apostrophe
SQS = re.compile(r" \'s")     # detached possessive 's
class StanfordDocument():
    """Thin wrapper around a Stanford CoreNLP XML output document."""

    def __init__(self, file):
        # Parse once and keep handles to the two subtrees used downstream.
        self.xmltree = et.parse(file)
        root = self.xmltree.getroot()
        self.sentences = root.find('document/sentences')
        self.coreferences = root.find('document/coreference')

    def getListOfSentences(self):
        """Return every sentence as a list of its word strings."""
        return [
            [w.text for w in s.findall('tokens/token/word')]
            for s in self.sentences.findall('sentence')
        ]
def main(file):
    """Substitute non-representative coreference mentions in-place.

    Each mention is rewritten as "<representative text> (<original mention>)"
    inside the tokenised sentences.

    file -- path to (or file object of) a CoreNLP XML result.
    Returns the list of sentences (lists of word strings) after substitution.
    """
    doc = StanfordDocument(file)
    sentences = doc.getListOfSentences()
    for coref in doc.coreferences.findall('coreference'):
        mentions = coref.findall('mention')
        represent = coref.find('mention[@representative="true"]')
        for mention in mentions:
            if mention != represent:
                # CoreNLP indices are 1-based and <end> is exclusive,
                # hence -1 / -1 / -2 to get 0-based inclusive indices.
                sentence_i = int(mention.find('sentence').text) - 1
                start_i = int(mention.find('start').text) - 1
                end_i = int(mention.find('end').text) - 2
                target_sentence = sentences[sentence_i]
                target_sentence[start_i] = (represent.find('text').text.strip()
                                            + ' (' + target_sentence[start_i])
                target_sentence[end_i] = target_sentence[end_i] + ')'
    return sentences
def prettifySentence(sentence):
    """Join a token list into a sentence and undo PTB tokenization artifacts."""
    text = " ".join(sentence)
    # Order matters: " 's" (SQS) must be rewritten before the generic " '" (SQ).
    rules = (
        (LRB, '('),
        (RRB, ')'),
        (LDQ, '"'),
        (RDQ, '"'),
        (SQS, "'s"),
        (SQ, "'"),
        (NOTATION, r'\1'),
    )
    for pattern, replacement in rules:
        text = pattern.sub(replacement, text)
    return text
if __name__ == "__main__":
    # Resolve coreferences in the CoreNLP output, then print each sentence
    # with tokenization artifacts cleaned up.
    file = "nlp_line.txt.xml"
    sentences = main(file)
    for sentence in sentences:
        # print() call instead of the Python-2-only print statement.
        print(prettifySentence(sentence))
57. 係り受け解析
Stanford Core NLPの係り受け解析の結果(collapsed-dependencies)を有向グラフとして可視化せよ.可視化には,係り受け木をDOT言語に変換し,Graphvizを用いるとよい.また,Pythonから有向グラフを直接的に可視化するには,pydotを使うとよい.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import sys
import problem56
def dependToDot(i, dependency):
    """Render one sentence's dependency edges as a DOT digraph string.

    i -- 1-based sentence number, used to name the digraph.
    dependency -- iterable of <dep> elements (governor/dependent children).
    """
    edges = []
    for dep in dependency:
        governor = dep.find('governor').text
        dependent = dep.find('dependent').text
        edges.append('"{gov}"->"{dep}" [label = "{label}"]; '.format(
            gov=governor, dep=dependent, label=dep.get('type')))
    return ("digraph sentence{0} ".format(i)
            + "{ graph [rankdir = LR]; "
            + "".join(edges)
            + "}")
def main(file):
    """Build one DOT digraph string per sentence in the CoreNLP XML file."""
    doc = problem56.StanfordDocument(file)
    return [
        dependToDot(idx + 1,
                    sent.find("dependencies[@type='collapsed-dependencies']"))
        for idx, sent in enumerate(doc.sentences.findall('sentence'))
    ]
if __name__ == '__main__':
    # With a 1-based sentence number on the command line, print only that
    # sentence's graph; otherwise print every sentence's graph.
    dotSentences = main('nlp_line.txt.xml')
    if len(sys.argv) > 1:
        target = int(sys.argv[1]) - 1
        print(dotSentences[target])
    else:
        for dotSentence in dotSentences:
            print(dotSentence)
58. タプルの抽出
Stanford Core NLPの係り受け解析の結果(collapsed-dependencies)に基づき,「主語 述語 目的語」の組をタブ区切り形式で出力せよ.ただし,主語,述語,目的語の定義は以下を参考にせよ.
- 述語: nsubj関係とdobj関係の子(dependent)を持つ単語
- 主語: 述語からnsubj関係にある子(dependent)
- 目的語: 述語からdobj関係にある子(dependent)
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem56
def extractTuples(sentence):
    """Extract [predicate, subject, object] triples from one <sentence>.

    A predicate is a governor that has both an nsubj and a dobj dependent
    in the collapsed-dependencies analysis.

    sentence -- a <sentence> XML element from CoreNLP output.
    Returns a list of [verb, subject, object] lists.
    """
    dependencies = sentence.find("dependencies[@type='collapsed-dependencies']")
    dep_triple = []
    dep_dic = {}
    for dep in dependencies:
        # Only subject/object edges matter; skip others before touching XML.
        if dep.get('type') in ('nsubj', 'dobj'):
            gov = (dep.find('governor').get('idx'), dep.find('governor').text)
            dep_dic.setdefault(gov, []).append(
                (dep.get('type'), dep.find('dependent').text))
    # items() instead of Python-2-only iteritems().
    for verb, children in dep_dic.items():
        if {t for t, _ in children} == {'nsubj', 'dobj'}:
            nsubj = [d for t, d in children if t == 'nsubj']
            dobj = [d for t, d in children if t == 'dobj']
            # Cartesian product: every subject pairs with every object.
            dep_triple += [[verb[1], n, d] for n in nsubj for d in dobj]
    return dep_triple
def main(file):
    """Collect subject-verb-object triples for every sentence in *file*."""
    doc = problem56.StanfordDocument(file)
    return [extractTuples(sent) for sent in doc.sentences.findall('sentence')]
if __name__ == '__main__':
    dep_triple = main('nlp_line.txt.xml')
    for sentence_triples in dep_triple:
        for triple in sentence_triples:
            # Output order: subject <TAB> predicate <TAB> object.
            print("%s\t%s\t%s" % (triple[1], triple[0], triple[2]))
59. S式の解析
Stanford Core NLPの句構造解析の結果(S式)を読み込み,文中のすべての名詞句(NP)を表示せよ.入れ子になっている名詞句もすべて表示すること.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem56
class TreeParser():
    """Parse a Penn-Treebank S-expression into nested Python lists."""

    def __init__(self):
        # Parsed tree; populated by parse().
        self.root = None
        # Working stack: one list per currently-open parenthesis.
        self._stack = [[]]

    def _flush(self, buf):
        """Append the buffered token (if any) to the open node; return a fresh buffer."""
        if buf:
            self._stack[-1].append("".join(buf))
        return []

    def parse(self, tree_string):
        """Consume *tree_string* character by character and build the tree."""
        buf = []
        for ch in tree_string.strip():
            if ch == "(":
                self._stack.append([])
            elif ch == " ":
                buf = self._flush(buf)
            elif ch == ")":
                buf = self._flush(buf)
                closed = self._stack.pop()
                self._stack[-1].append(closed)
            else:
                buf.append(ch)
        self.root = self._stack.pop()

    def get_phrase(self, tag):
        """Return all subtrees labelled *tag* under the top-level (ROOT) tree."""
        # root[0] is ['ROOT', tree]; root[0][1] is the tree itself.
        return self._recursive_finder(self.root[0][1], tag)

    def _recursive_finder(self, lst, tag):
        """Depth-first collection of every sublist whose head equals *tag*."""
        found = [lst] if lst[0] == tag else []
        for child in lst[1:]:
            if isinstance(child, list):
                found += self._recursive_finder(child, tag)
        return found
def main(file, tag):
    """Collect all *tag*-labelled phrases from every sentence's parse tree.

    Returns one list of phrase subtrees per sentence.
    """
    doc = problem56.StanfordDocument(file)
    phrases = []
    for sentence in doc.sentences.findall('sentence'):
        parser = TreeParser()
        parser.parse(sentence.find('parse').text)
        phrases.append(parser.get_phrase(tag))
    return phrases
def str_phrase(phrase):
    """Flatten a parse subtree into its leaf word strings, left to right."""
    words = []
    for node in phrase:
        if not isinstance(node, list):
            continue  # the tag label itself, e.g. 'NP'
        if isinstance(node[1], list):
            words += str_phrase(node)  # internal node: recurse
        else:
            words.append(node[1])  # pre-terminal ['TAG', 'word']: keep the word
    return words
if __name__ == "__main__":
    # Print every noun phrase (including nested ones) in the parsed text.
    np_phases = main("nlp_line.txt.xml", "NP")
    for np_phase in np_phases:
        for np in np_phase:
            phase_list = str_phrase(np)
            # Reuse problem 56's detokenizer for readable output.
            print(problem56.prettifySentence(phase_list))