More than 1 year has passed since last update.

言語処理100本ノック俺の解答第5章

Last updated at 2023-03-05 / Posted at 2022-09-10

係り受け解析器はCaboChaを使います。

問題に入る前に、文分割について。CaboChaは、文を入力として受け取る前提になっていると思う。一方でai.jaは文に分割されていない。ai.jaの行をそのままCaboChaに投げても、いい感じに解析してくれはするのだが、やはり係り受け解析って文の内部の構造を解析するものだと思うので、気持ちが悪い。

そこで、まずテキストを文に切るスクリプトを作成。東北大BERTの前処理を真似て、MeCabの解析結果が句点になるところで切る。

import argparse
import sys

# Sentence splitter: reads MeCab output from stdin (one "surface<TAB>features"
# line per token, "EOS" closing each input line) and writes the text to the
# file given by -o, one sentence per line.  A sentence ends at every token
# whose features start with "記号,句点" and at every EOS.
parser = argparse.ArgumentParser()
# -o is required: without it open() would be called with None and fail with
# an unhelpful TypeError instead of a usage message.
parser.add_argument('-o', required=True, help="output filename")
args = parser.parse_args()

sys.stdin.reconfigure(encoding='utf-8')

# The context manager guarantees the output is flushed and closed even if a
# malformed input line raises part-way through.
with open(args.o, "w", encoding="utf_8", newline='\n') as f_o:
    sentence = ''          # surfaces accumulated for the sentence in progress
    sent_written = False   # True once the current input line produced output
    for line in sys.stdin:
        line = line.rstrip()
        if line == 'EOS':
            if sentence:
                # Text left over after the last full stop of this input line.
                f_o.write(sentence + '\n')
            elif not sent_written:
                # Emit a newline even for an EOS with no content, so blank
                # input lines are preserved in the output.
                f_o.write('\n')
            sentence = ''
            sent_written = False
        else:
            # Split on the first tab only; everything after it is features.
            (surface, features) = line.split('\t', 1)
            sentence += surface
            features_list = features.split(',')
            if features_list[0] == "記号" and features_list[1] == "句点":
                f_o.write(sentence + '\n')
                sentence = ''
                sent_written = True

    # Input that ends without a final EOS still gets its last sentence written.
    if sentence:
        f_o.write(sentence + '\n')

このスクリプトをmecab_sent_splitter.pyとして、下記のように実行します。

$ mecab ai.ja.txt | python mecab_sent_splitter.py -o ai.ja-sent.txt
$ cabocha ai.ja-sent.txt -o ai.ja.txt.parsed -f1

解答

ほぼ同じスクリプトで出来るので、ひとつのスクリプトで、コマンド引数でどの問題向けかを切り替えるようにした。例えば問題42に解答する場合は、スクリプトのファイル名をnlp100-ch05.pyとすると、

$ cat ai.ja.txt.parsed | python nlp100-ch05.py 42

のように実行する。

# -*- coding: utf-8 -*-
"""
@author: sen
"""

import argparse
# Command line: a single positional integer selecting which problem's output
# to produce; the parsed namespace is kept in the module-level name `args`.
argparser = argparse.ArgumentParser()
argparser.add_argument('problem', type=int)
args = argparser.parse_args()

import sys
import re

class Morph:
    """One morpheme: surface form, base form, POS and POS subcategory 1."""

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        """Return the four fields joined by commas."""
        fields = (self.surface, self.base, self.pos, self.pos1)
        return ','.join(fields)

# Characters removed from surfaces: every code point in U+3000..U+303F
# (the CJK Symbols and Punctuation block) plus full-width and ASCII
# parentheses, which that block does not contain.
punctuations = [chr(code_point) for code_point in range(0x3000, 0x3040)]
punctuations.extend(['（', '）', '(', ')'])
# Map each of those characters to the empty string, i.e. delete it.
trans_dict = dict.fromkeys(punctuations, '')
trans_table = str.maketrans(trans_dict)

class Chunk:
    """A chunk (bunsetsu): its morphemes plus dependency links.

    Attributes:
        dst: index of the chunk this one depends on.
        sources: chunks registered through add_source().
        morphs: morphemes kept by append().
        nouns: indices into morphs of the morphemes whose pos is '名詞'.
        pred: base form of the first '動詞' morpheme, or None.
        particle: base form of the last '助詞' morpheme, or None.
    """

    def __init__(self, dst):
        self.dst = dst
        self.sources = []
        self.morphs = []
        self.nouns = []  # indices into morphs of the noun morphemes
        self.pred = None
        self.particle = None

    def __str__(self):
        return '%s %d' % (self.get_surface(), self.dst)

    def get_surface(self, noun_var=None):
        """Return the concatenated surfaces of the chunk.

        When noun_var is truthy and the chunk holds a noun, the span from the
        first to the last noun morpheme is replaced by noun_var.
        """
        if noun_var and self.nouns:
            first_noun, last_noun = self.nouns[0], self.nouns[-1]
            head = ''.join(m.surface for m in self.morphs[:first_noun])
            tail = ''.join(m.surface for m in self.morphs[last_noun + 1:])
            return head + noun_var + tail
        return ''.join(m.surface for m in self.morphs)

    def append(self, morph):
        """Add a morpheme, ignoring ones made up solely of stripped symbols."""
        if not morph.surface.translate(trans_table):
            return
        self.morphs.append(morph)
        position = len(self.morphs) - 1
        if morph.pos == '名詞':
            self.nouns.append(position)
        elif morph.pos == '動詞' and not self.pred:
            # Only the first verb becomes the predicate.
            self.pred = morph.base
        elif morph.pos == '助詞':
            # Later particles overwrite earlier ones.
            self.particle = morph.base

    def isNoun(self):
        """Return True when the chunk contains a noun morpheme."""
        return bool(self.nouns)

    def isVerb(self):
        """Return True when a predicate verb has been recorded."""
        return self.pred is not None

    def add_source(self, src):
        """Register src as a chunk that depends on this one."""
        self.sources.append(src)

def get_path(sentence, i):
    """Return the chunk indices on the path from chunk i up to the root.

    The walk follows each chunk's `dst` until it becomes negative; chunk i
    itself is not part of the result.
    """
    def _ancestors(start):
        upcoming = sentence[start].dst
        while upcoming >= 0:
            yield upcoming
            upcoming = sentence[upcoming].dst

    return list(_ancestors(i))

sys.stdin.reconfigure(encoding='utf-8')
sys.stdout.reconfigure(encoding='utf-8')

# Chunk header line: "* <chunk index> <destination index>D ...".
# group(1) is the (possibly negative) index of the chunk depended upon.
chunk_info_pat = re.compile(r'^\* \d+ (-?\d+)D')

# Parse the whole of stdin into `sentences`: a list of sentences, each a list
# of Chunk objects in input order.
sentences = []
sentence = []
chunk = None
for l in sys.stdin:
    l = l.rstrip()
    if l == 'EOS':
        # The sentence is complete: register every chunk with the chunk it
        # depends on (a negative dst means there is none).
        for c in sentence:
            if c.dst >= 0:
                sentence[c.dst].add_source(c)
        sentences.append(sentence)
        sentence = []
        continue

    # Decide "chunk header" by the regex itself rather than by a leading '*':
    # a token whose surface starts with '*' is an ordinary morpheme line, and
    # treating it as a header would make match() return None and crash.
    header = chunk_info_pat.match(l)
    if header:
        chunk = Chunk(int(header.group(1)))
        sentence.append(chunk)
        continue

    # Morpheme line: "surface<TAB>feature,feature,...".  Field 6 is used as
    # the base form, fields 0 and 1 as POS and POS subcategory.
    (surface, features) = l.split('\t', 1)
    features_list = features.split(',')
    m = Morph(surface,
              features_list[6],
              features_list[0],
              features_list[1])
    chunk.append(m)

# Produce the output for the problem number given on the command line.
# Problems 42-44 look only at the first 10 sentences; slicing (rather than
# indexing 0..9) keeps them working on inputs shorter than that.
if args.problem == 42:
    # Source chunk / destination chunk pairs, tab separated.
    for snt in sentences[:10]:
        for ch in snt:
            if ch.dst < 0:
                # No destination: snt[-1] would silently wrap around to the
                # last chunk and print a bogus pair.
                continue
            s_f = ch.get_surface()
            s_t = snt[ch.dst].get_surface()
            print('%s\t%s' % (s_f, s_t))
elif args.problem == 43:
    # As 42, restricted to noun chunks that depend on verb chunks.
    for snt in sentences[:10]:
        for ch in snt:
            if ch.dst < 0 or not ch.isNoun():
                continue
            target = snt[ch.dst]
            if not target.isVerb():
                continue
            print('%s\t%s' % (ch.get_surface(), target.get_surface()))
elif args.problem == 44:
    # One dependency graph per sentence, rendered to image/<sentence index>.
    from graphviz import Digraph
    for i, snt in enumerate(sentences[:10]):
        graph = Digraph(format="png")
        graph.attr('node', fontname="MS Gothic")
        for j, ch in enumerate(snt):
            graph.node(str(j), ch.get_surface())
            if ch.dst >= 0:
                # Skip the edge for a chunk without destination; otherwise an
                # unlabeled phantom node "-1" appears in the graph.
                graph.edge(str(j), str(ch.dst))
        graph.render("image/%d" % i)
elif args.problem == 45:
    # Verb base form, then the sorted particles of the chunks depending on it.
    for snt in sentences:
        for c in snt:
            if c.isVerb():
                particles = [src.particle for src in c.sources if src.particle]
                print('%s\t%s' % (c.pred, ' '.join(sorted(particles))))
elif args.problem == 46:
    # As 45, plus the surfaces of those chunks in the same (particle) order.
    for snt in sentences:
        for c in snt:
            if c.isVerb():
                # Named `arguments`, not `args`, so the parsed command-line
                # namespace is not overwritten.
                arguments = [(src.particle, src.get_surface())
                             for src in c.sources if src.particle]
                arguments.sort(key=lambda x: x[0])
                s_particle = ' '.join([x[0] for x in arguments])
                s_surface = ' '.join([x[1] for x in arguments])
                print('\t'.join((c.pred, s_particle, s_surface)))
elif args.problem == 47:
    # Verbs taking a "<noun with pos1 'サ変接続'> を" chunk: print that chunk
    # fused with the verb, then the remaining particles and chunk surfaces.
    for snt in sentences:
        for c in snt:
            if not c.isVerb():
                continue
            sahen = None
            arguments = []
            for src in c.sources:
                if not src.particle:
                    continue
                # Chunk records noun positions in `nouns` (indices into
                # `morphs`) and has no `noun` attribute; inspect the last
                # noun morpheme of the chunk.
                last_noun = src.morphs[src.nouns[-1]] if src.nouns else None
                if (src.particle == 'を' and last_noun is not None
                        and last_noun.pos1 == 'サ変接続'):
                    sahen = src
                else:
                    arguments.append((src.particle, src.get_surface()))

            if not sahen:
                continue

            # Chunks depending on the noun chunk count as arguments too.
            for sahensrc in sahen.sources:
                if sahensrc.particle:
                    arguments.append((sahensrc.particle, sahensrc.get_surface()))

            arguments.sort(key=lambda x: x[0])
            s_particle = ' '.join([x[0] for x in arguments])
            s_surface = ' '.join([x[1] for x in arguments])
            print('%s%s\t%s\t%s' % (sahen.get_surface(), c.pred, s_particle, s_surface))
elif args.problem == 48:
    # For every noun chunk, the chain of chunks followed via dst until -1.
    # A blank line separates sentences.
    for snt in sentences:
        for start in snt:
            if start.isNoun():
                print(start.get_surface(), end='')
                c = start
                while c.dst != -1:
                    c = snt[c.dst]
                    print(' -> ' + c.get_surface(), end='')
                print()
        print()
elif args.problem == 49:
    # Shortest dependency path between every pair of noun chunks (i < j),
    # with the noun spans replaced by X and Y respectively.
    for snt in sentences:
        for i, chunk_i in enumerate(snt):
            if not chunk_i.isNoun():
                continue
            path_i = get_path(snt, i)
            i_surf = chunk_i.get_surface('X')
            for j in range(i + 1, len(snt)):
                if not snt[j].isNoun():
                    continue
                j_surf = snt[j].get_surface('Y')
                if j in path_i:
                    # Chunk j lies on the path from chunk i to the root.
                    between = path_i[:path_i.index(j)]
                    nodes = [i_surf] + [snt[x].get_surface() for x in between]
                    nodes.append(j_surf)
                    print(' -> '.join(nodes))
                else:
                    # The paths from i and from j to the root meet at chunk k.
                    path_j = get_path(snt, j)
                    common = set(path_i) & set(path_j)
                    if not common:
                        # No shared chunk (e.g. one of the two has no
                        # destination): min() would raise on the empty set.
                        continue
                    k = min(common)
                    nodes_i = [i_surf] + [snt[x].get_surface()
                                          for x in path_i[:path_i.index(k)]]
                    print(' -> '.join(nodes_i), end=' | ')
                    nodes_j = [j_surf] + [snt[x].get_surface()
                                          for x in path_j[:path_j.index(k)]]
                    print(' -> '.join(nodes_j), end=' | ')
                    print(snt[k].get_surface())

else:
    print('Unknown problem number: %d' % args.problem)

問題44までの対象テキストの範囲に関して。問題40の問題文の「冒頭の説明文」とは、元のWikipedia記事で「概要」の前までのことを指すのかなと思うが、ai.jaでは「概要」という見出しは抜け落ちている。「冒頭の説明文」をシステマチックに取り出す方法が思いつかないので適当に先頭から10文を表示してます。

問題42にある「句読点などの記号」について、それらを列挙するのにUnicodeのCJK Symbols and Punctuationブロックを使ってみた。しかしこれには普通の括弧"（"、"）"が含まれておらず、それだけ個別に追加しているので、ちょっと格好悪い。

問題45は、スクリプト自体より、「UNIXコマンドを用いて確認せよ」の方で意外にも苦労した。以下が俺の実施例。苦労したのは、grepの正規表現内でのタブ文字の扱い。シェルで$'ほげ'と書くと「ほげ」内の「\」エスケープ（\tなど）が解釈されるというのは、知らなかった。

$ cat ai.ja.txt.parsed | python nlp100-ch05.py 45 > nlp100-45.txt
$ sort nlp100-45.txt | uniq -c | sort -r | less
$ grep -E $'^(行う|なる|与える)\t' nlp100-45.txt | uniq -c | sort -r | less

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up

言語処理100本ノック俺の解答 第5章

解答

言語処理100本ノック俺の解答第5章