Python
正規表現
自然言語処理
機械学習
データサイエンス

固有表現抽出ツールanagoの訓練データを京都ウェブ文書リードコーパスから用意する

More than 1 year has passed since last update.

前回( https://qiita.com/sugiyamath/items/365b263d4f03d3bca26f ), Hironsanのgithubのツール「anago」を試しましたが、十分なデータの用意ができませんでした。今回は、KWDLCからanagoの訓練データを生成します。

形式のルール

  1. 単語とラベルをタブでつなぐ。
  2. 一文ごとに改行だけの行を挿入する。
  3. ラベルは"IOBタグ-分類(英大文字3字)"。

データの置き場所

データは以下のURLからダウンロードします。
http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/KWDLC/download_kwdlc.cgi

このデータを展開し、NLTKのコーパス配置規約に合わせて nltk_data/corpora/kwdlc に置きます。

コード

import nltk,re
from nltk.corpus.reader.util import *
from nltk.corpus.util import LazyCorpusLoader

root = nltk.data.find('corpora/kwdlc/dat/rel')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r".+-.+", f)]

# Read every corpus file and concatenate the contents once at the end;
# joining a list of chunks avoids the quadratic cost of repeated
# string += inside the loop.
chunks = []
for fileid in fileids:
    with open(root + "/" + fileid, "r") as file:
        chunks.append(file.read())
data = "".join(chunks)

data_spl = data.split("\n")

# Three line classes in the raw KWDLC files:
#   regexp0 — '#'-prefixed comment lines (dropped),
#   regexp1 — token lines of seven space-separated fields (keep only the
#             surface form, field 1),
#   regexp2 — named-entity annotations (rewritten as "NE:<TYPE>:<target>").
regexp0 = re.compile(r"^#")
regexp1 = re.compile(r"^([^ ]+) [^ ]+ [^ ]+ [^ ]+ [^ ]+ [^ ]+ [^ ]+$")
regexp2 = re.compile(r".*ne type=\"([A-Z]+)\" target=\"(.+)\".*")

data_fmt = []
for line in data_spl:
    if regexp0.search(line):
        continue
    if regexp1.search(line):
        data_fmt.append(regexp1.sub(r"\1", line))
    if regexp2.search(line):
        data_fmt.append(regexp2.sub(r"NE:\1:\2", line))


def fill_array(size, it):
    """Return a list of *size* slots, each referencing the same object *it*.

    Equivalent to the original append-in-a-loop version: every slot aliases
    the identical object, so mutating a mutable *it* is visible through all
    slots.
    """
    return [it] * size

# "NE:<TYPE>:<target>" marker lines follow the tokens they annotate.
# For each marker, scan the neighbouring tokens that are substrings of the
# target string and tag them B-/I- with the type truncated to 3 letters.
regexp3 = re.compile("NE:([A-Z]+):(.+)")  # hoisted: compile once, not per line

outs = []
labels = fill_array(len(data_fmt), None)

for i, d in enumerate(data_fmt):
    if regexp3.search(d):
        # Split at most twice so a target that itself contains ':' is kept
        # intact in rs[2] (a plain split would truncate it).
        rs = d.split(":", 2)
        tmp_bools = []

        # Scan backwards over the tokens preceding the marker.
        counter = i
        while counter > 0:
            counter = counter - 1
            if data_fmt[counter] in rs[2]:
                tmp_bools.append(counter)
            else:
                break

        # Scan forwards. Stop before the last index so counter + 1 is always
        # valid — the original condition (counter < len(data_fmt)) indexed
        # one past the end and raised IndexError on a match at the tail.
        counter = i
        while counter < len(data_fmt) - 1:
            counter = counter + 1
            if data_fmt[counter] in rs[2]:
                tmp_bools.append(counter)
            else:
                break

        tmp_bools.sort()
        first_flag = True
        for b in tmp_bools:
            if first_flag:
                labels[b] = "B-" + rs[1][:3]
                first_flag = False
            else:
                labels[b] = "I-" + rs[1][:3]
        outs.append(None)  # the marker line itself emits no output token
    else:
        outs.append(d)

with open("kwdlc.txt", "w") as file:
    # A token containing "。" ends a sentence: it gets its own "\n" here plus
    # the shared "\n" below, producing the blank separator line anago expects.
    regexp4 = re.compile("。")  # hoisted: the original recompiled this per token
    for i, d in enumerate(outs):
        if d is not None:
            if labels[i] is None:
                if regexp4.search(d):
                    file.write(d + "\t" + "O" + "\n")
                else:
                    file.write(d + "\t" + "O")
            else:
                file.write(d + "\t" + labels[i])
            file.write("\n")

KWDLCを完全にJUMAN形式に置き換える

Embeddingの作成にJUMANを使って分かち書きする場合、訓練データがJUMAN形式と完全に整合性を保っていると役に立ちます。以下のコードは上記のkwdlc.txtをJUMAN形式に置き換えるコードです。

kwdlc_jumaned.py
# coding: utf-8
import re
# Load the tab-separated token/label lines produced by the previous script.
with open("kwdlc.txt", "r") as file:
    data = file.read().split("\n")

# Group consecutive non-empty lines into sentences; each blank line closes
# the sentence currently being collected.
sentences_spl = []
tmp_sentence = []
for line in data:
    if line:
        tmp_sentence.append(line)
    else:
        sentences_spl.append(tmp_sentence)
        tmp_sentence = []

# Build three parallel views of each sentence: the joined surface string,
# the word list, and the label list (splitting each "word\tlabel" pair once).
sentences = []
sentences_onlywords = []
sentences_onlylabels = []
for ss in sentences_spl:
    words = [pair.split('\t')[0] for pair in ss]
    tags = [pair.split('\t')[1] for pair in ss]
    sentences.append(''.join(words))
    sentences_onlywords.append(words)
    sentences_onlylabels.append(tags)

# For every sentence collect, per entity, the index of its first token
# (a B- tag) and its length in tokens (the B plus the following I's).
findex = []
ilens = []
for ss in sentences_spl:
    first_indexes = []
    token_lens = []
    token_len = None
    for i, s in enumerate(ss):
        token = s.split('\t')[1]
        if 'B' in token:
            first_indexes.append(i)
            token_len = 1
        elif 'I' in token:
            token_len = token_len + 1
        else:
            if token_len is not None:
                token_lens.append(token_len)
            token_len = None
    # Flush an entity that runs to the very end of the sentence; the original
    # dropped it, leaving first_indexes and token_lens out of sync so the
    # later zip() silently discarded the entity.
    if token_len is not None:
        token_lens.append(token_len)
    findex.append(first_indexes)
    ilens.append(token_lens)

# Pair each entity's type (label minus the two-character "B-"/"I-" prefix)
# with the surface string formed by its tokens.
labels = []
for firsts, lengths, words, tags in zip(findex, ilens, sentences_onlywords, sentences_onlylabels):
    entity_list = []
    for start, length in zip(firsts, lengths):
        surface = ''.join(words[start:start + length])
        entity_list.append([tags[start][2:], surface])
    labels.append(entity_list)

from pyknp import Juman

def get_equal_index(target, wakachi):
    """Return (True, i) for the first morpheme in *wakachi* equal to
    *target*, or (False, None) when no morpheme matches exactly."""
    if target in wakachi:
        return True, wakachi.index(target)
    return False, None


def get_max_index(inlabels, index):
    """Return (start, length) of the longest run of True in inlabels[index].

    Quirks preserved from the original: the length floor is 1, so a run of
    length 1 never updates the start — a lone True (and an empty or
    all-False row) yields (0, 1).
    """
    best_start = 0
    best_len = 1
    run_start = 0
    run_len = 0
    prev = False
    for pos, flag in enumerate(inlabels[index]):
        run_len = run_len + 1 if flag else 0
        if flag and not prev:
            run_start = pos  # rising edge: remember where this run begins
        if run_len > best_len:
            best_len = run_len
            best_start = run_start
        prev = flag
    return best_start, best_len

def fill_array(size, it):
    """Return a list of *size* slots, each referencing the same object *it*.

    Equivalent to the original append-in-a-loop version: every slot aliases
    the identical object.
    """
    return [it] * size

def get_iobs(target_index, sentences, labels):
    """Tokenize sentences[target_index] with Juman and project the entity
    annotations of labels[target_index] onto per-morpheme IOB tags.

    Returns (iobs, wakachi): the IOB label list and the list of morpheme
    surface forms, aligned one-to-one.
    """
    juman = Juman()
    result = juman.analysis(sentences[target_index])
    wakachi = [mrph.midasi for mrph in result.mrph_list()]

    # inlabels[k][j] is True when morpheme j is a substring of entity k's
    # surface string — a coarse alignment signal used as a fallback.
    inlabels = []
    for l in labels[target_index]:
        inlabels.append([w in l[1] for w in wakachi])

    iobs = fill_array(len(wakachi), 'O')
    for i, l in enumerate(labels[target_index]):
        flag, index = get_equal_index(l[1], wakachi)
        if flag:
            # The entity is exactly one morpheme: tag it directly.
            iobs[index] = 'B-' + l[0]
        else:
            # Otherwise tag the longest run of matching morphemes. Use a
            # distinct inner loop variable: the original reused `i`,
            # shadowing the enumerate index.
            index, ilen = get_max_index(inlabels, i)
            for j in range(ilen):
                if j == 0:
                    iobs[index] = 'B-' + l[0]
                else:
                    iobs[index + j] = 'I-' + l[0]
    return iobs, wakachi

# Run the IOB projection over every sentence, printing the completion
# percentage every 100 sentences.
iob_labels = []
wakachies = []
slen = len(sentences)
for idx in range(slen):
    if idx % 100 == 0:
        print(100.0 * idx / slen)
    iobs, wakachi = get_iobs(idx, sentences, labels)
    iob_labels.append(iobs)
    wakachies.append(wakachi)

# Emit one "morpheme\tIOB" line per token, with a blank line between
# sentences (anago's training-data format).
with open("kwdlc_jumaned.txt", "w") as file:
    for iobs, wakachi in zip(iob_labels, wakachies):
        rows = [w + "\t" + iob + "\n" for iob, w in zip(iobs, wakachi)]
        file.write(''.join(rows))
        file.write("\n")