Posted at

freqtを使って文の構造の良さを評価するモデル

KeyakiツリーバンクからPCFGを生成して構文解析という記事では、PCFGを生成しました。この生成したPCFGを利用して、文の構造の良さを評価するモデルを作成します。


実行の流れ


  1. Keyaki Treebankの文を正例としてデータを整形。

  2. PCFGによってランダム生成した文を負例としてデータを生成。

  3. 1と2のデータを結合して、freqtへ入力し、部分木とその頻度を取得。

  4. 部分木がある一定以上出現する場合、その部分木をOneHot特徴量として定義。

  5. 4の特徴量からロジスティック回帰で訓練。

  6. 任意の文で文の構造を評価するモジュールを作成。


正例の整形


import nltk
from nltk import *
from train import load_data
import sys
import pandas as pd
from tqdm import tqdm
import re

# POS tags that act as preterminals in the Keyaki treebank annotation.
pos = "QUOT,-LRB-,-RRB-,PU,ADJI,ADJN,ADV,AX,AXD,CL,CONJ,D,FW,INTJ,MD,N,NEG,NPR,NUM,P,PASS,PNL,PRO,Q,QN,SYM,VB,VB0,VB2,WADV,WD,WNUM,WPRO".split(",")

# Rewrites "(TAG token)" as "(TAG(token))" so the token becomes its own node
# in the bracketed string fed to freqt.
regex = re.compile(r"\(({}) (.+?)\)".format('|'.join(pos)))
# Rewrites trace/empty-category nodes such as "(NP-SBJ*T*-1)" the same way.
regex2 = re.compile(r"\((.+?)(\*.+?\*).*?\)")

if __name__ == "__main__":
    data = load_data("./data/out3.txt")
    out = []
    # FIX: the original iterated tqdm(enumerate(data)) but never used the index.
    for tree in tqdm(data):
        # Binarize in place so the tree shape matches the PCFG used elsewhere.
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=False)
        sent = regex.sub(r'(\1(\2))', str(tree).replace("\n", "")).replace(" ", "")
        sent = regex2.sub(r'(\1(\2))', sent)
        out.append({"sent": sent, "label": True})
    pd.DataFrame(out).to_csv("true_sents.csv", index=False)


負例の生成


import pickle
import nltk
from pcfg_generate import pcfg_generate
from tqdm import tqdm
import pandas as pd
import re
from collections import defaultdict
from nltk.tree import Tree
from nltk.parse import RecursiveDescentParser
from nltk import Production
from nltk.parse.generate import generate
from nltk.grammar import Nonterminal
import signal
from contextlib import contextmanager

import sys
sys.setrecursionlimit(10000)

class TimeoutException(Exception):
    """Raised when a time_limit() block exceeds its allotted seconds."""

@contextmanager
def time_limit(seconds):
    """Limit the wrapped block to *seconds* wall-clock seconds (Unix only).

    Uses SIGALRM, so it works only on the main thread of a Unix process.

    Args:
        seconds: integer delay handed to signal.alarm().

    Raises:
        TimeoutException: if the alarm fires before the block finishes.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # BUG FIX: remember the previous SIGALRM handler so it can be restored;
    # the original left our handler installed after the block exited.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)

# Preterminal POS tags of the Keyaki annotation (same set as in the
# positive-example script).
pos = "QUOT,-LRB-,-RRB-,PU,ADJI,ADJN,ADV,AX,AXD,CL,CONJ,D,FW,INTJ,MD,N,NEG,NPR,NUM,P,PASS,PNL,PRO,Q,QN,SYM,VB,VB0,VB2,WADV,WD,WNUM,WPRO".split(",")

# Turns "(TAG token)" into "(TAG(token))" for freqt input.
_pos_alternation = '|'.join(pos)
regex = re.compile(r"\(({}) (.+?)\)".format(_pos_alternation))
# Same rewrite for trace nodes like "(NP-SBJ*T*-1)".
regex2 = re.compile(r"\((.+?)(\*.+?\*).*?\)")

if __name__ == "__main__":
    # FIX: removed `reg1`, which was compiled but never used.

    # Load the PCFG trained from the Keyaki treebank.
    with open("./model.pkl", "rb") as f:
        pcfg = pickle.load(f)

    out = []
    generate_size = 41100
    # NOTE(review): the original opened this file and never wrote to it; the
    # (possibly intentional) creation/truncation side effect is preserved.
    with open("false_examples.txt", "w") as _unused:
        for i, (xs, prods) in tqdm(enumerate(pcfg_generate(pcfg, n=generate_size))):
            # FIX: use a distinct name instead of rebinding `grammar`, which
            # shadowed the PCFG passed to the pcfg_generate generator above.
            # Re-parse the sampled tokens with only the productions that
            # generated them, to recover a tree for this sentence.
            sent_grammar = nltk.CFG(nltk.Nonterminal("S"), prods)
            rd = RecursiveDescentParser(sent_grammar)
            sent = ""
            try:
                # Recursive-descent parsing can run away; cap it at 1 second.
                with time_limit(1):
                    for x in rd.parse(xs):
                        sent = str(x)
                        break
            except Exception as e:
                print(str(i) + ":ERR:" + str(e), end=" ", flush=True)
                continue

            sent = regex.sub(r'(\1(\2))', sent.replace("\n", "")).replace(" ", "")
            sent = regex2.sub(r'(\1(\2))', sent)
            out.append({"sent": sent, "label": False})

    pd.DataFrame(out).to_csv("./false_sents.csv", index=False)

なお、pcfg_generateというスクリプトは以下にあります:

https://github.com/sugiyamath/cfg_experiments/blob/master/scripts/pcfg_generate.py


freqtへ入力

生成したsentsを以下のように結合します。

import pandas as pd

df1 = pd.read_csv("true_sents.csv")
df2 = pd.read_csv("false_sents.csv")

# BUG FIX: the file must be opened for writing; the original used the default
# read mode, so f.write() would raise io.UnsupportedOperation.
with open("all_sents.txt", "w") as f:
    # Positives first, then negatives — downstream labels rely on this order.
    f.write('\n'.join(df1["sent"].tolist() + df2["sent"].tolist()))

なお、各行のラベルは行のインデクス番号で対応付けます（前半が正例、後半が負例です）。

$ wc -l true_sents.csv

41006

これにより、TrueかFalseかのしきい値はインデクス番号41005だとわかります。

さて、freqtを以下からダウンロードし、コンパイルしてください。

http://chasen.org/~taku/software/freqt/

$ make

$ make test
$ cp freqt /usr/local/bin

つぎに、以下を実行します。

$ /usr/local/bin/freqt -w -M 2 -L 5 < all_sents.txt > all_freqt.xml


部分木から特徴量設計

# coding: utf-8

from tqdm import tqdm
import pandas as pd
import sys

def load_freqt(filename):
    """Parse freqt's XML-ish output into aligned column/row-index lists.

    Returns:
        dict with "columns" (subtree strings from <what> tags) and "rows"
        (lists of 1-based sentence-index strings from <where> tags), where
        entry k of each list describes the same subtree.
    """
    colnames, rowindices = [], []
    with open(filename) as f:
        for raw in f:
            if raw.startswith("<what>"):
                colnames.append(raw.replace("<what>", "").replace("</what>", "").strip())
            elif raw.startswith("<where>"):
                rowindices.append(raw.replace("<where>", "").replace("</where>", "").strip().split())
    # Every subtree must have a matching occurrence list.
    assert len(colnames) == len(rowindices)
    return {"columns": colnames, "rows": rowindices}

def solve_maxinds(rowindices):
    """Return the largest sentence index appearing in any occurrence row."""
    return max(int(token) for row in rowindices for token in row)

def select_targets(colnames, rowindices, threshold=200):
    """Keep subtrees that occur in more than *threshold* sentences.

    Subtree names containing '*' (freqt wildcards) are skipped. Returns a
    dict mapping subtree string -> list of int sentence indices.
    """
    selected = {}
    for name, indices in zip(colnames, rowindices):
        if len(indices) > threshold and '*' not in name:
            selected[name] = [int(ix) for ix in indices]
    return selected

def targets2df(targets, maxind):
    """Materialize one-hot subtree features as a boolean DataFrame.

    Row k of the result corresponds to sentence index k+1 (freqt's <where>
    indices are 1-based); each column is True where that subtree occurred.
    """
    feature_names = list(targets.keys())
    rows = [dict.fromkeys(feature_names, False) for _ in range(maxind)]
    for column, sentence_ids in tqdm(targets.items()):
        for sentence_id in sentence_ids:
            rows[sentence_id - 1][column] = True
    return pd.DataFrame(rows)

def run(infile, outfile, threshold=200):
    """Convert freqt XML output into a one-hot feature CSV.

    Args:
        infile: path to freqt's XML-ish output file.
        outfile: destination CSV path.
        threshold: minimum occurrence count for a subtree to become a feature.
    """
    totalstep = 5
    print("Step 1/{}: Loading file".format(totalstep))
    data = load_freqt(infile)
    print("Step 2/{}: solving indices".format(totalstep))
    # BUG FIX: the original called solve_existinds(), which is not defined
    # anywhere; the helper defined above is solve_maxinds().
    maxind = solve_maxinds(data['rows'])
    print("Step 3/{}: select targets".format(totalstep))
    targets = select_targets(data['columns'], data['rows'], threshold)
    print("Step 4/{}: targets2df".format(totalstep))
    df = targets2df(targets, maxind)
    print("Step 5/{}: output file".format(totalstep))
    df.to_csv(outfile, index=False)

if __name__ == "__main__":
    run("../annotated/all_freqt.xml", "../annotated/ann_fixed.csv", 400)

このコードを実行することで、ann_fixed.csvという、特徴量設計済みデータが出力されます。


訓練

from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

if __name__ == "__main__":
    # BUG FIX: pandas is used below but was never imported in this script.
    import pandas as pd

    X = pd.read_csv("ann_fixed.csv")
    # Rows with index < 41005 come from true_sents.csv (positives); the rest
    # are PCFG-generated negatives (see the row-count note in the article).
    # BUG FIX: the original read `df.shape[0]`, but `df` is undefined here —
    # the feature frame is X.
    y = [i < 41005 for i in range(X.shape[0])]
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(classification_report(y_test, y_pred))

    # Persist the feature column order so prediction can rebuild rows later.
    with open("columns.txt", "w") as f:
        f.write('\n'.join(X.columns.tolist()))

    # BUG FIX: the original dumped the undefined name `clf_real`; the trained
    # model is `clf`.
    with open("logreg_model.pkl", "wb") as f:
        pickle.dump(clf, f)


任意の文に対して構造評価


import sys
import re
import nltk
import pickle
import MeCab
import tempfile
from subprocess import check_output, Popen, PIPE
import pandas as pd

def load_data(column_file="./columns2.txt", nltk_model_file="./model.pkl", logreg_model_file="./logreg_model2.pkl"):
    """Load the pickled PCFG, the pickled classifier, and the feature columns.

    Returns:
        dict with keys "columns" (list of feature-name strings, blank lines
        dropped), "grammar" (unpickled PCFG), and "clf" (unpickled model).
    """
    with open(nltk_model_file, "rb") as f:
        grammar = pickle.load(f)
    with open(logreg_model_file, "rb") as f:
        clf = pickle.load(f)

    with open(column_file) as f:
        columns = [line.strip() for line in f if line != "\n"]

    return {"columns": columns, "grammar": grammar, "clf": clf}

def freqt(tree, freqt_path="/usr/local/bin/freqt"):
    """Run the freqt binary on one bracketed tree string; return its XML output.

    SECURITY FIX: the original interpolated *tree* into a shell command
    (`echo "{tree}"` with shell=True), which breaks on quotes and allows
    command injection. The tree is now written directly to freqt's stdin;
    the trailing newline mimics what `echo` produced.
    """
    output = check_output([freqt_path, "-w", "-M 2", "-L 5"],
                          input=(tree + "\n").encode("utf-8"))
    return output.decode('utf-8')

def feature_engineering(xml, columns):
    """Build a one-hot row: True for each known subtree appearing in *xml*.

    Only <what> lines of freqt's output are inspected; subtrees not listed
    in *columns* are ignored.
    """
    found = set()
    for raw in xml.split("\n"):
        if raw.startswith("<what>"):
            found.add(raw.replace("<what>", "").replace("</what>", "").strip())
    return {column: column in found for column in columns}

def predict(clf, rows, columns):
    """Score *rows* (list of feature dicts) with *clf*.

    The frame is re-indexed by *columns* so feature order matches training.
    Returns whatever clf.predict_proba produces.
    """
    frame = pd.DataFrame(rows)
    ordered = frame[columns]
    return clf.predict_proba(ordered)

def build_tree(sent, grammar, tokenize, cfg_tooks_path="./cfg_tools/3/"):
    """Parse *sent* with the PCFG and return a freqt-formatted tree string.

    *tokenize* must return a whitespace-separated token string/sequence that
    learn_pcfg.prob_parse accepts. The cfg_tools directory is appended to
    sys.path so learn_pcfg can be imported lazily.
    """
    pos = "QUOT,-LRB-,-RRB-,PU,ADJI,ADJN,ADV,AX,AXD,CL,CONJ,D,FW,INTJ,MD,N,NEG,NPR,NUM,P,PASS,PNL,PRO,Q,QN,SYM,VB,VB0,VB2,WADV,WD,WNUM,WPRO".split(",")
    regex = re.compile(r"\(({}) (.+?)\)".format('|'.join(pos)))
    regex2 = re.compile(r"\((.+?)(\*.+?\*).*?\)")

    sys.path.append(cfg_tooks_path)
    import learn_pcfg as lp

    tokens = tokenize(sent)
    tree_str = None
    # Keep only the single most probable parse.
    for parsed in lp.prob_parse(grammar, tokens, n=1):
        tree_str = parsed.pformat()
        break
    tree_str = regex.sub(r'(\1(\2))', tree_str.replace("\n", "")).replace(" ", "")
    return regex2.sub(r'(\1(\2))', tree_str)

if __name__ == "__main__":
    # End-to-end demo: parse one sentence, extract its subtree features via
    # freqt, and print the classifier's probability estimates.
    data = load_data()
    sent = "犬が走る。"
    # "-Owakati" makes MeCab emit a whitespace-separated token string, which
    # is the form build_tree's tokenize callback is given here.
    tagger = MeCab.Tagger("-Owakati")
    tree = build_tree(sent, data['grammar'], tagger.parse)
    xml = freqt(tree)
    row = feature_engineering(xml, data['columns'])
    # Output is predict_proba's array; class order depends on the trained
    # model (presumably [P(False), P(True)] — verify against training labels).
    print(predict(data['clf'], [row], data['columns']))

ここで用いられているPCFGは事前訓練しておいてください。


訓練済みモデルへのリンク

https://github.com/sugiyamath/cfg_experiments/tree/master/model


参考