More than 5 years have passed since last update.

Python+WindowsでのMecab/Cabocha/KNP

Last updated at 2016-12-27 / Posted at 2016-12-26

やりたいこと

Windows+pythonでMecab/Cabocha/KNPを使う時、楽しようとして「import mecab」や、「import Cabocha」を使うとリコンパイルやら、UTF-8指定やら、面倒くさくなる。
そこで、Shift-JIS＋外部呼び出しで、デフォルトインストールのまま使えるようにする。

環境

Windows
Mecab/Cabocha/Juman/KNPのインストール(SHIFT-JISで)
Mecab/Cabocha/Juman/KNPは環境変数に通しておく

ソース

homefldrを任意の場所に設定すること
(2016/12/17 追記:KNPにも対応)

import os
import subprocess
from tempfile import NamedTemporaryFile

class JapaneseDependencyRelation:
    """Run MeCab, CaboCha and JUMAN/KNP as external commands.

    The input text is encoded with ``text_encoding``, written to a temporary
    file under ``homefldr``, and that file is handed to the command as its
    standard input. The command's standard output is decoded with the same
    encoding and returned as ``str``.

    The commands (``mecab``, ``cabocha``, ``juman``, ``knp``) are looked up
    by name, so they must be reachable through the PATH.
    """

    # CaboCha ``-f`` values selected by the ``fmt`` argument of do_cabocha.
    # From the CaboCha help text:
    #   0 - tree (default), 1 - lattice, 2 - tree + lattice, 3 - XML
    # Note that "tree" selects 2 (tree + lattice); anything that is not
    # listed here falls back to 1 (lattice).
    _CABOCHA_FORMATS = {"xml": "3", "tree": "2"}
    _CABOCHA_FALLBACK = "1"

    # KNP switches selected by the fmt / level / output arguments of do_knp.
    # Values that are not listed fall back to the *_FALLBACK switch.
    _KNP_FORMATS = {
        "tab": "-tab",
        "td": "-td",
        "tree": "-tree",
        "tab1": "-bnsttab",
        "tree1": "-bnsttree",
        "sexp": "-sexp",
    }
    _KNP_FORMAT_FALLBACK = "-simple"
    _KNP_LEVELS = {1: "-bnst", 2: "-dpnd", 3: "-case", 5: "-ne"}
    _KNP_LEVEL_FALLBACK = "-anaphora"
    _KNP_OUTPUTS = {2: "-detail", 3: "-debug"}
    _KNP_OUTPUT_FALLBACK = "-normal"

    def __init__(self, encoding=None):
        """Create a runner.

        Args:
            encoding: Encoding used both for the text sent to the tools and
                for decoding their output. It has to match the encoding the
                tools were installed with. Defaults to ``'utf-8'``.
        """
        # Encoding of MeCab/CaboCha (follows what was chosen at install time).
        self.text_encoding = 'utf-8' if encoding is None else encoding
        # Directory in which the temporary input files are created.
        self.homefldr = 'C:\\tmp'

    def get_encode_original(self, j_txt):
        """Encode ``j_txt`` one character at a time.

        Args:
            j_txt: Text to encode.

        Returns:
            The concatenated encoded characters as ``bytes``.

        Raises:
            UnicodeEncodeError: If a character cannot be represented in
                ``text_encoding``.
        """
        # Each piece is ``bytes``, so the pieces must be joined with a bytes
        # separator; the encoder's own UnicodeEncodeError is propagated.
        return b''.join(ch.encode(self.text_encoding) for ch in j_txt)

    def _run_with_stdin_file(self, j_txt, command, shell=False):
        """Feed ``j_txt`` to ``command`` through a temporary file.

        Args:
            j_txt: Text to analyse.
            command: Argument list (or, with ``shell=True``, command string).
            shell: Passed through to ``subprocess.Popen``.

        Returns:
            The command's standard output decoded with ``text_encoding``.

        Raises:
            FileNotFoundError: If ``homefldr`` does not exist.
            UnicodeEncodeError: If ``j_txt`` cannot be encoded.
        """
        if not os.path.exists(self.homefldr):
            raise FileNotFoundError(
                'temporary directory not found: {}'.format(self.homefldr))

        # Encode before touching the file system so that an encoding error
        # never leaves a temporary file behind.
        payload = j_txt.encode(self.text_encoding)

        temp = NamedTemporaryFile(delete=False, dir=self.homefldr)
        try:
            with temp:
                temp.write(payload)
            # The handle is closed explicitly before the file is removed;
            # an open handle can prevent deletion (notably on Windows).
            with open(temp.name, 'rb') as stdin_file:
                process = subprocess.Popen(
                    command, shell=shell, stdin=stdin_file,
                    stdout=subprocess.PIPE)
                raw_output = process.communicate()[0]
        finally:
            # Remove the temporary file.
            os.unlink(temp.name)

        return raw_output.decode(self.text_encoding)

    def do_mecab(self, j_txt):
        """Run ``mecab`` on ``j_txt`` and return its output as text.

        Raises:
            FileNotFoundError: If ``homefldr`` does not exist.
            UnicodeEncodeError: If ``j_txt`` cannot be encoded.
        """
        return self._run_with_stdin_file(j_txt, ['mecab'])

    def do_cabocha(self, j_txt, fmt=None):
        """Run ``cabocha`` on ``j_txt`` and return its output as text.

        Args:
            j_txt: Text to analyse.
            fmt: ``"xml"`` (default, ``-f 3``), ``"tree"`` (``-f 2``), or
                anything else for ``-f 1``.

        Raises:
            FileNotFoundError: If ``homefldr`` does not exist.
            UnicodeEncodeError: If ``j_txt`` cannot be encoded.
        """
        if fmt is None:
            fmt = "xml"
        format_id = self._CABOCHA_FORMATS.get(fmt, self._CABOCHA_FALLBACK)
        return self._run_with_stdin_file(j_txt, ['cabocha', '-f', format_id])

    def do_knp(self, j_txt, fmt=None, level=None, output=None):
        """Run ``juman | knp`` on ``j_txt`` and return its output as text.

        Display format (``fmt``; ``None`` means ``"tab"``):
            "tab"    -> -tab       table format
            "simple" -> -simple    table format with reduced output
            "td"     -> -td        format for the general-purpose result viewer
            "tree"   -> -tree      tree format (KNP's own default)
            "tab1"   -> -bnsttab   table format per bunsetsu
            "tree1"  -> -bnsttree  tree format per bunsetsu
            "sexp"   -> -sexp      list (S-expression) format
            Any other value selects -simple.

        Analysis level (``level``; ``None`` means 4):
            1 -> -bnst      convert the morpheme sequence into bunsetsu
            2 -> -dpnd      additionally analyse bunsetsu dependencies
            3 -> -case      additionally analyse case relations (KNP default)
            4 -> -anaphora  additionally analyse anaphora
            5 -> -ne        additionally analyse named entities
            Any other value selects -anaphora.

        Output detail (``output``; ``None`` means 1):
            1 -> -normal  final analysis result only (KNP default)
            2 -> -detail  also dependency-possibility and similarity matrices
            3 -> -debug   also detailed intermediate information
            Any other value selects -normal.

        Raises:
            FileNotFoundError: If ``homefldr`` does not exist.
            UnicodeEncodeError: If ``j_txt`` cannot be encoded.
        """
        if fmt is None:
            fmt = "tab"
        if level is None:
            level = 4
        if output is None:
            output = 1

        switches = [
            self._KNP_FORMATS.get(fmt, self._KNP_FORMAT_FALLBACK),
            self._KNP_LEVELS.get(level, self._KNP_LEVEL_FALLBACK),
            self._KNP_OUTPUTS.get(output, self._KNP_OUTPUT_FALLBACK),
        ]
        # The pipe needs a shell. A single command string is used because a
        # list combined with shell=True is only joined into one command line
        # on Windows; every piece is a fixed constant, so nothing from the
        # caller reaches the shell.
        command = ' '.join(['juman|knp'] + switches)
        return self._run_with_stdin_file(j_txt, command, shell=True)

if __name__ == "__main__":
    # Demo: analyse one sentence with every tool and every supported format.
    # The tools are assumed here to be installed with Shift-JIS dictionaries.
    analyzer = JapaneseDependencyRelation('shift-jis')
    sentence = '隣の客はよく柿食う客だ'

    # MeCab
    print(analyzer.do_mecab(sentence))

    # CaboCha ("word" is not a named format, so it takes the fallback branch)
    for cabocha_format in ("xml", "tree", "word"):
        print(analyzer.do_cabocha(sentence, cabocha_format))

    # KNP
    for knp_format in ("tab", "td", "simple", "tree", "tab1", "tree1", "sexp"):
        print(analyzer.do_knp(sentence, knp_format))

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up