
NLP 100 Knock, Chapter 6: Processing English Text (Part 2)


A record of solving the second-half problems of Chapter 6.
As noted on the course page, the target file is nlp.txt.

Apply the following processing to the English text (nlp.txt).

55. Named Entity Extraction

Extract all person names from the input text.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re

WORD = re.compile(r"<word>(\w+)</word>")
NER = re.compile(r"<NER>(\w+)</NER>")

# Scan the CoreNLP XML line by line: remember the most recent <word>,
# and print it whenever the <NER> tag that follows says PERSON.
token = ""
f = open('nlp.txt.xml', 'r')
for line in f:
    line = line.strip()
    word = WORD.search(line)
    if word:
        token = word.group(1)
        continue
    ner = NER.search(line)
    if ner and ner.group(1) == "PERSON":
        print(token)
f.close()
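
The regex scan prints one line per PERSON token, so multi-word names come out split across lines. As a cross-check, here is a minimal ElementTree sketch, assuming the same nlp.txt.xml layout (sentences/sentence/tokens/token with word and NER children), that merges consecutive PERSON tokens into full names:

# A hedged alternative: walk the XML tree and merge consecutive PERSON tokens.
import xml.etree.ElementTree as et

root = et.parse('nlp.txt.xml').getroot()
for sentence in root.findall('document/sentences/sentence'):
    name = []
    for token in sentence.findall('tokens/token'):
        if token.find('NER').text == 'PERSON':
            name.append(token.find('word').text)
        elif name:
            print(' '.join(name))
            name = []
    if name:
        print(' '.join(name))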

56. Coreference Resolution

Based on the coreference resolution output of Stanford Core NLP, replace each mention in the text with its representative mention. When replacing, format the result as "representative mention (mention)" so the original mention is still recoverable.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import xml.etree.ElementTree as et
from functools import partial

# Regexes for undoing CoreNLP's token escapes: -LRB-/-RRB- brackets,
# `` and '' quotes, and punctuation separated from the preceding token.
LRB = re.compile(r"-LRB- ")
RRB = re.compile(r" -RRB-")
NOTATION = re.compile(r" ([,\.:;])")
LDQ = re.compile(r"`` ")
RDQ = re.compile(r" \'\'")
SQ = re.compile(r" \'")
SQS = re.compile(r" \'s")

class StanfordDocument():
    def __init__(self, file):
        self.xmltree = et.parse(file)
        root = self.xmltree.getroot()
        self.sentences = root.find('document/sentences')
        self.coreferences = root.find('document/coreference')

    def getListOfSentences(self):
        sentences = []
        for sentence in self.sentences.findall('sentence'):
            sentences.append([word.text for word in sentence.findall('tokens/token/word')])
        return sentences

def main(file):
    doc = StanfordDocument(file)
    sentences = doc.getListOfSentences()

    for coref in doc.coreferences.findall('coreference'):
        mentions = coref.findall('mention')
        represent = coref.find('mention[@representative="true"]')
        for mention in mentions:
            if mention != represent:
                sentence_i = int(mention.find('sentence').text) - 1
                start_i = int(mention.find('start').text) - 1
                end_i = int(mention.find('end').text) - 2

                target_sentence = sentences[sentence_i]
                # Prepend the representative mention and wrap the original
                # mention in parentheses: "representative (mention)".
                target_sentence[start_i] = represent.find('text').text.strip() + ' (' + target_sentence[start_i]
                target_sentence[end_i] = target_sentence[end_i] + ')'
    return sentences

def prettifySentence(sentence):
    # Join tokens with spaces, then apply the detokenizing substitutions in order.
    s = " ".join(sentence)
    partials = map(
        lambda x: partial(x[0], x[1]),
        [
            (LRB.sub, '('),
            (RRB.sub, ')'),
            (LDQ.sub, '\"'),
            (RDQ.sub, '\"'),
            (SQS.sub, "\'s"),
            (SQ.sub, "\'"),
            (NOTATION.sub, r'\1')
        ]
    )
    for part in partials:
        s = part(s)
    return s

if __name__ == "__main__":
    file = "nlp_line.txt.xml"
    sentences = main(file)
    for sentence in sentences:
        s = prettifySentence(sentence)
        print(s)
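
prettifySentence undoes CoreNLP's token escapes and re-attaches punctuation. A quick illustration on a hand-made token list (hypothetical input, not taken from nlp.txt):

tokens = ['``', 'NLP', "''", '-LRB-', 'natural', 'language', 'processing', '-RRB-', 'matters', ',', 'too', '.']
print(prettifySentence(tokens))
# -> "NLP" (natural language processing) matters, too.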

57. Dependency Parsing

Visualize the Stanford Core NLP dependency parse (collapsed-dependencies) as a directed graph. For visualization, it is convenient to convert the dependency tree into the DOT language and use Graphviz. To visualize directed graphs directly from Python, use pydot.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import sys
import problem56

def dependToDot(i, dependency):
    header = "digraph sentence{0} ".format(i)
    body_head = "{ graph [rankdir = LR]; "
    body = ""
    # One edge per dependency: "governor" -> "dependent", labeled with the relation type.
    for dep in dependency:
        governor, dependent, label = dep.find('governor').text, dep.find('dependent').text, dep.get('type')
        body += '"{gov}"->"{dep}" [label = "{label}"]; '.format(gov=governor, dep=dependent, label=label)

    dotString = header + body_head + body + "}"
    return dotString

def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dotSentences = []
    for i, sentence in enumerate(sentences):
        dependency = sentence.find("dependencies[@type='collapsed-dependencies']")
        dotSentences.append(dependToDot(i+1, dependency))
    return dotSentences

if __name__ == '__main__':
    dotSentences = main('nlp_line.txt.xml')
    if len(sys.argv) > 1:
        # Print only the graph for the sentence number given on the command line.
        target = int(sys.argv[1]) - 1
        print(dotSentences[target])
    else:
        for dotSentence in dotSentences:
            print(dotSentence)
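
As the problem statement suggests, pydot can render the DOT strings directly from Python. A minimal sketch, assuming pydot and Graphviz are installed (the output file names are arbitrary):

import pydot

for i, dotSentence in enumerate(main('nlp_line.txt.xml'), start=1):
    # graph_from_dot_data returns a list of graphs in recent pydot versions.
    graphs = pydot.graph_from_dot_data(dotSentence)
    graphs[0].write_png('sentence%d.png' % i)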

58. Tuple Extraction

Based on the Stanford Core NLP dependency parse (collapsed-dependencies), output "subject predicate object" triples in tab-separated format. Use the following definitions of subject, predicate, and object:
- Predicate: a word that has dependents via both an nsubj relation and a dobj relation
- Subject: the dependent of the predicate via the nsubj relation
- Object: the dependent of the predicate via the dobj relation

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56

def extractTuples(sentence):
    dependencies = sentence.find("dependencies[@type='collapsed-dependencies']")
    dep_triple = []
    dep_dic = {}

    for dep in dependencies:
        # Key governors by (token index, text) so repeated words stay distinct.
        gov = (dep.find('governor').get('idx'), dep.find('governor').text)
        if dep.get('type') in ['nsubj', 'dobj']:
            dep_dic.setdefault(gov, []).append((dep.get('type'), dep.find('dependent').text))

    # A predicate is a governor that has both an nsubj and a dobj dependent.
    verbs = [key for key, value in dep_dic.items() if set([t for (t, d) in value]) == set(['nsubj', 'dobj'])]

    for verb in verbs:
        nsubj = [d for (t, d) in dep_dic[verb] if t == 'nsubj']
        dobj = [d for (t, d) in dep_dic[verb] if t == 'dobj']
        dep_triple += [[verb[1], n, d] for n in nsubj for d in dobj]

    return dep_triple

def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dep_triple = []

    for sentence in sentences:
        dep_triple.append(extractTuples(sentence))

    return dep_triple

if __name__ == '__main__':
    dep_triple = main('nlp_line.txt.xml')
    for dep in dep_triple:
        for dt in dep:
            print "%s\t%s\t%s" % (dt[1], dt[0], dt[2])

59. Parsing S-expressions

Read the phrase structure parse (S-expressions) produced by Stanford Core NLP and display all noun phrases (NP) in the text. Display all nested noun phrases as well.

# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56

class TreeParser():
    def __init__(self):
        self.root = None
        self._stack = [[]]

    def parse(self, tree_string):
        # Build a nested-list form of the S-expression: "(" opens a new list,
        # ")" closes the current one, and spaces delimit atoms.
        read = []
        for character in tree_string.strip():
            if character == "(":
                self._stack.append([])
            elif character == " ":
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
            elif character == ")":
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
                self._stack[-2].append(self._stack.pop())
            else:
                read.append(character)
        self.root = self._stack.pop()

    def get_phrase(self, tag):
        # self.root[0] is ['ROOT', ['S', ...]]; search from the S subtree.
        s = self.root[0][1]
        return self._recursive_finder(s, tag)

    def _recursive_finder(self, lst, tag):
        res = []
        if lst[0] == tag:
            res.append(lst)
        for l in lst[1:]:
            if isinstance(l, list):
                res.extend(self._recursive_finder(l, tag))
        return res

def main(file, tag):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    tag_phases = []

    for sentence in sentences:
        parser = TreeParser()
        tree_string = sentence.find('parse').text
        parser.parse(tree_string)
        tag_phases.append(parser.get_phrase(tag))

    return tag_phases

def str_phrase(phrase):
    res = []
    for p in phrase:
        if isinstance(p, list):
            if isinstance(p[1], list):
                res += str_phrase(p)
            else:
                res.append(p[1])
    return res

if __name__ == "__main__":
    np_phases = main("nlp_line.txt.xml", "NP")
    for np_phase in np_phases:
        for np in np_phase:
            phase_list = str_phrase(np)
            np_string = problem56.prettifySentence(phase_list)
            print(np_string)
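
To see what TreeParser builds, a tiny hand-made S-expression (hypothetical input):

parser = TreeParser()
parser.parse("(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sleeps))))")
print(parser.get_phrase("NP"))
# -> [['NP', ['DT', 'The'], ['NN', 'cat']]]
print(str_phrase(parser.get_phrase("NP")[0]))
# -> ['The', 'cat']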
