More than 1 year has passed since last update.

PythonでN-gramを作る（Julius）

Last updated at 2022-03-14 / Posted at 2022-03-14

ソースコードを載せておきます

make_n-gram.py

import subprocess
import time
import os

# Current working directory with the OS path separator replaced by "/" and
# a trailing "/" appended, so file names can be concatenated directly.
path = os.getcwd().replace(os.sep,'/')+"/" # get the directory the script is run from

# External executables that are invoked through the shell.
srilm_path = "C:/cygwin64/srilm/bin/cygwin64/ngram-count"
mkbingram = "C:/Julius/julius-4.6-win32bin/bin/mkbingram.exe"

# Raw sentences read by make_corpus().
input_sentence = 'sentence.txt'
# Space-separated corpus written by make_corpus().
corpus = 'corpus.txt'
# Corpus with the word order of every line reversed, written by make_reversal_corpus().
reversal_corpus = 'reversal_corpus.txt'
# Output of make_forward_n_gram() (ngram-count -order 2).
forward_n_gram = "forward_n-gram.arpa"
# Output of make_backward_n_gram() (ngram-count -order 3).
backward_n_gram = "backward_n-gram.arpa"
# Output of make_mkbingram().
bingram = "n-gram.bingram"

def strip_cmd_injection(instr):
    """Return *instr* with shell metacharacters removed.

    The characters ``; | & ` ( ) $ < > * ? { } [ ] !``, the full-width
    ``？ ！ 「 」`` and the newline are deleted so the text can be embedded
    in a shell command line.

    NOTE(review): ``%``, ``^`` and quote characters are not in the list;
    confirm whether they need to be removed for cmd.exe as well.

    Args:
        instr: Text that will be interpolated into a shell command.

    Returns:
        The text without any of the characters listed above.
    """
    unsafe = ";|&`()$<>*?{}[]!？！「」\n"
    # One translate pass deletes every listed character.
    return instr.translate(str.maketrans("", "", unsafe))

def chasen(arg):
    """Run ChaSen on one sentence and yield its analysis line by line.

    The sentence is sanitized with ``strip_cmd_injection`` and piped to
    ``chasen -iw`` through the shell.  This is a generator, so nothing is
    executed until the first item is requested.

    Args:
        arg: Sentence to analyze.

    Yields:
        list[str]: The tab-separated fields of each output line that
        precedes the ``EOS`` marker.

    Raises:
        RuntimeError: If the command writes anything to stderr, or if its
            stdout is not valid UTF-8.
    """
    arg = strip_cmd_injection(arg)
    if not arg.strip():
        # Nothing left to analyze.  With cmd.exe a bare `echo` prints the
        # echo status instead of an empty line, and that text would be
        # analyzed as if it were the sentence.
        return

    cmd = "echo {0} | chasen -iw".format(arg)

    # Switch the console code page to UTF-8 and wait for the command to
    # finish, so the change is in effect (and its pipes are released)
    # before ChaSen is started.
    chcp = subprocess.Popen("chcp 65001", shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    chcp.communicate()

    proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if stderr:
        # errors='replace' keeps the message readable even when the shell
        # reports the error in a non-UTF-8 code page.
        raise RuntimeError(stderr.decode('utf-8', errors='replace'))

    try:
        text = stdout.decode('utf-8')
    except UnicodeDecodeError as err:
        raise RuntimeError("chasen output is not valid UTF-8: {0}".format(err)) from err

    for line in text.split("\n"):
        # Tolerate CRLF line endings so the EOS marker is still recognized.
        line = line.rstrip("\r")
        if line == "EOS":
            break
        yield line.split("\t")

def make_corpus():
    """Build the training corpus from the raw sentence file.

    Reads ``input_sentence``, segments every non-blank line with
    ``chasen`` and writes the words, separated by single spaces, to
    ``corpus``.  A line break is inserted after each "。" that is followed
    by another word, so every sentence ends up on its own line.
    """
    # Context managers close the files even when an error is raised.
    with open(input_sentence, 'r') as f:
        lines = f.read().split('\n')

    sentences = []
    for line in lines:
        # Skip empty and whitespace-only lines: there is nothing to segment.
        if not line.strip():
            continue
        # The first field of every ChaSen output line is the surface form.
        words = [cha[0] for cha in chasen(line)]
        if not words:
            continue
        sentences.append(" ".join(words).replace("。 ", "。\n"))

    with open(corpus, 'w') as f:
        f.write("\n".join(sentences))

def make_reversal_corpus():
    """Write ``reversal_corpus`` from ``corpus`` with reversed word order.

    Every line of ``corpus`` is split on single spaces, the resulting
    words are reversed, and the lines are written out in their original
    order.
    """
    # Context managers close the files even when an error is raised.
    with open(corpus, 'r') as f:
        lines = f.read().split('\n')

    reversed_lines = [" ".join(line.split(" ")[::-1]) for line in lines]

    with open(reversal_corpus, 'w') as f:
        f.write("\n".join(reversed_lines))

def make_forward_n_gram():
    """Train the forward 2-gram model with ``ngram-count``.

    Runs the executable at ``srilm_path`` with ``-order 2`` on ``corpus``
    and writes the language model to ``forward_n_gram``; both files are
    resolved against ``path``.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    # The file arguments are quoted so that a working directory containing
    # spaces is still passed as a single argument.
    cmd = '{0} -order 2 -text "{1}" -unk -lm "{2}"'.format(srilm_path, path+corpus, path+forward_n_gram)
    proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    # Report a failed run instead of silently continuing without the model.
    if proc.returncode != 0:
        raise RuntimeError("ngram-count failed with exit status {0}: {1}".format(
            proc.returncode, stderr.decode('utf-8', errors='replace')))

def make_backward_n_gram():
    """Train the backward 3-gram model with ``ngram-count``.

    Runs the executable at ``srilm_path`` with ``-order 3`` on
    ``reversal_corpus`` and writes the language model to
    ``backward_n_gram``; both files are resolved against ``path``.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    # The file arguments are quoted so that a working directory containing
    # spaces is still passed as a single argument.
    cmd = '{0} -order 3 -text "{1}" -unk -lm "{2}"'.format(srilm_path, path+reversal_corpus, path+backward_n_gram)
    proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    # Report a failed run instead of silently continuing without the model.
    if proc.returncode != 0:
        raise RuntimeError("ngram-count failed with exit status {0}: {1}".format(
            proc.returncode, stderr.decode('utf-8', errors='replace')))

def make_mkbingram():
    """Combine both ARPA models into one binary N-gram with ``mkbingram``.

    ``forward_n_gram`` is passed with ``-nlr`` (left-to-right model) and
    ``backward_n_gram`` with ``-nrl`` (right-to-left model); the result is
    written to ``bingram``.  All files are resolved against ``path``.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    # The backward model must be given with -nrl; passing -nlr twice would
    # hand it over as a second forward model.  The file arguments are
    # quoted so that a working directory containing spaces still works.
    cmd = '{0} -nlr "{1}" -nrl "{2}" "{3}"'.format(mkbingram, path+forward_n_gram, path+backward_n_gram, path+bingram)
    proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    # Report a failed run instead of silently finishing without the output.
    if proc.returncode != 0:
        raise RuntimeError("mkbingram failed with exit status {0}: {1}".format(
            proc.returncode, stderr.decode('utf-8', errors='replace')))

if __name__ == '__main__':
    # Run the pipeline in order: the corpus and its reversed copy first,
    # then the two N-gram models, and finally the combined binary N-gram.
    for step in (
        make_corpus,
        make_reversal_corpus,
        make_forward_n_gram,
        make_backward_n_gram,
        make_mkbingram,
    ):
        step()

コード解説

Chasenを使って学習コーパスを作成

学習コーパスは単語をスペースで区切り、一文一行にする。

def make_corpus():
    """Build the training corpus from the raw sentence file.

    Reads ``input_sentence``, segments every non-blank line with
    ``chasen`` and writes the words, separated by single spaces, to
    ``corpus``.  A line break is inserted after each "。" that is followed
    by another word, so every sentence ends up on its own line.
    """
    # Context managers close the files even when an error is raised.
    with open(input_sentence, 'r') as f:
        lines = f.read().split('\n')

    sentences = []
    for line in lines:
        # Skip empty and whitespace-only lines: there is nothing to segment.
        if not line.strip():
            continue
        # The first field of every ChaSen output line is the surface form.
        words = [cha[0] for cha in chasen(line)]
        if not words:
            continue
        sentences.append(" ".join(words).replace("。 ", "。\n"))

    with open(corpus, 'w') as f:
        f.write("\n".join(sentences))

Chasenの詳しい説明はこちら

SRILMを使って前向きN-gramを作成

def make_forward_n_gram():
    """Train the forward 2-gram model with ``ngram-count``.

    Runs the executable at ``srilm_path`` with ``-order 2`` on ``corpus``
    and writes the language model to ``forward_n_gram``; both files are
    resolved against ``path``.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    # The file arguments are quoted so that a working directory containing
    # spaces is still passed as a single argument.
    cmd = '{0} -order 2 -text "{1}" -unk -lm "{2}"'.format(srilm_path, path+corpus, path+forward_n_gram)
    proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    # Report a failed run instead of silently continuing without the model.
    if proc.returncode != 0:
        raise RuntimeError("ngram-count failed with exit status {0}: {1}".format(
            proc.returncode, stderr.decode('utf-8', errors='replace')))

SRILMの詳しい説明はこちら

逆順コーパスを作成

def make_reversal_corpus():
    """Write ``reversal_corpus`` from ``corpus`` with reversed word order.

    Every line of ``corpus`` is split on single spaces, the resulting
    words are reversed, and the lines are written out in their original
    order.
    """
    # Context managers close the files even when an error is raised.
    with open(corpus, 'r') as f:
        lines = f.read().split('\n')

    reversed_lines = [" ".join(line.split(" ")[::-1]) for line in lines]

    with open(reversal_corpus, 'w') as f:
        f.write("\n".join(reversed_lines))

SRILMを使って後ろ向きN-gramを作成

def make_backward_n_gram():
    """Train the backward 3-gram model with ``ngram-count``.

    Runs the executable at ``srilm_path`` with ``-order 3`` on
    ``reversal_corpus`` and writes the language model to
    ``backward_n_gram``; both files are resolved against ``path``.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    # The file arguments are quoted so that a working directory containing
    # spaces is still passed as a single argument.
    cmd = '{0} -order 3 -text "{1}" -unk -lm "{2}"'.format(srilm_path, path+reversal_corpus, path+backward_n_gram)
    proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    # Report a failed run instead of silently continuing without the model.
    if proc.returncode != 0:
        raise RuntimeError("ngram-count failed with exit status {0}: {1}".format(
            proc.returncode, stderr.decode('utf-8', errors='replace')))

作成したN-gramを結合してバイナリ形式に変換

def make_mkbingram():
    """Combine both ARPA models into one binary N-gram with ``mkbingram``.

    ``forward_n_gram`` is passed with ``-nlr`` (left-to-right model) and
    ``backward_n_gram`` with ``-nrl`` (right-to-left model); the result is
    written to ``bingram``.  All files are resolved against ``path``.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    # The backward model must be given with -nrl; passing -nlr twice would
    # hand it over as a second forward model.  The file arguments are
    # quoted so that a working directory containing spaces still works.
    cmd = '{0} -nlr "{1}" -nrl "{2}" "{3}"'.format(mkbingram, path+forward_n_gram, path+backward_n_gram, path+bingram)
    proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    # Report a failed run instead of silently finishing without the output.
    if proc.returncode != 0:
        raise RuntimeError("mkbingram failed with exit status {0}: {1}".format(
            proc.returncode, stderr.decode('utf-8', errors='replace')))

以上、PythonでN-gramを作る方法でした。

ソースコード

今回使用したソースコード、音声ファイルはこちらにあります。

参考にしたページ

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up