ElasticSearch Sudachi Windows + Pythonの続編

構築はこちらを参考に
PythonでMapping/Index作成までやる
- SudachiのTokenizerの設定を変えたい場合は、setting関数内の設定（JSONに相当するPythonの辞書）を変更して実行
- 実行すると、既存のIndexを削除して自動的に作り直します
- 前回記事でわからなかったフォルダ指定も解決
  - D:\elasticsearch-5.6.5\config\sudachiに辞書ファイルを突っ込むだけでOK

els.py

from elasticsearch import Elasticsearch


def setting(es_analyze):
    """Build the index-creation body that registers a Sudachi-based analyzer.

    Args:
        es_analyze: Name under which the custom analyzer is registered in
            the ``analysis.analyzer`` section.

    Returns:
        A dict with a single ``"settings"`` key containing the tokenizer,
        the stop-word token filter and the custom analyzer definition.
    """
    # Tokenizer definition passed through to the Sudachi plugin.
    sudachi_tokenizer = {
        "type": "sudachi_tokenizer",
        "mode": "search",
        "discard_punctuation": True,
        "resources_path": "sudachi",
    }

    # Stop filter that drops the listed words from the token stream.
    stopword_filter = {
        "type": "stop",
        "stopwords": [
            "です", "ます",
        ],
    }

    # Custom analyzer: no char filters, Sudachi tokenizer, then the stop filter.
    custom_analyzer = {
        "type": "custom",
        "char_filter": [],
        "tokenizer": "sudachi_tokenizer",
        "filter": [
            "stopword",
        ],
    }

    analysis = {
        "tokenizer": {"sudachi_tokenizer": sudachi_tokenizer},
        "filter": {"stopword": stopword_filter},
        "analyzer": {es_analyze: custom_analyzer},
    }
    return {"settings": {"analysis": analysis}}


def mapping(es_type, es_analyze):
    """Build the mapping body for one document type.

    Args:
        es_type: Document type name; used as the top-level key of the mapping.
        es_analyze: Name of the analyzer applied to the ``_analyzed`` field.

    Returns:
        A dict mapping ``es_type`` to its ``properties``: a ``text`` field
        declared ``not_analyzed`` and an ``_analyzed`` field that is stored
        and analyzed with ``es_analyze``.
    """
    # Raw field: not analyzed, not stored, excluded from _all.
    raw_field = {
        "type": "string",
        "include_in_all": False,
        "store": False,
        "index": "not_analyzed",
    }

    # Analyzed field: stored, and run through the given analyzer.
    analyzed_field = {
        "type": "string",
        "store": True,
        "index": "analyzed",
        "analyzer": es_analyze,
    }

    properties = {"text": raw_field, "_analyzed": analyzed_field}
    return {es_type: {"properties": properties}}


class ESAnalyzer(object):
    """Tokenize text through an Elasticsearch index's analyzer.

    The instance holds an Elasticsearch client plus the index name, document
    type and analyzer name, and pre-builds the settings/mapping bodies used
    by :meth:`init_index`. Calling the instance with a string returns the
    tokens produced by the analyzer.
    """

    def __init__(self, host="localhost", port=9200, index=None, type=None, analyzer=None):
        """Create the client and prepare the settings and mapping bodies.

        Args:
            host: Elasticsearch host name.
            port: Elasticsearch HTTP port.
            index: Index name; defaults to ``"test"`` when ``None``.
            type: Document type name; defaults to ``"test"`` when ``None``.
            analyzer: Analyzer name; defaults to ``"test"`` when ``None``.
        """
        if index is None:
            index = "test"
        if type is None:
            type = "test"
        if analyzer is None:
            # The analyzer needs its own default: it is used as a dict key in
            # the settings and as the analyzer name in analyze requests, so it
            # must not stay None (and must not clobber ``index``).
            analyzer = "test"

        self.es = Elasticsearch(hosts=[{"host": host, "port": port}], send_get_body_as="POST")
        self.index = index
        self.type = type
        self.analyzer = analyzer
        self.setting = setting(analyzer)
        self.mapping = mapping(type, analyzer)

    def __call__(self, text):
        """Analyze ``text`` and return its tokens ordered by position.

        Args:
            text: String to analyze. A falsy value (``""``, ``None``)
                short-circuits without contacting the server.

        Returns:
            A list of token strings sorted by their ``position`` value;
            tokens sharing a position keep the order of the response.
        """
        if not text:
            return []

        data = self.es.indices.analyze(index=self.index,
                                       body={"analyzer": self.analyzer, "text": text})

        # sorted() is stable, so same-position tokens stay in response order.
        ordered = sorted(data["tokens"], key=lambda token: token["position"])
        return [token["token"] for token in ordered]

    def del_index(self):
        """Delete the index named by ``self.index`` and report it on stdout."""
        self.es.indices.delete(index=self.index)
        print("The index has been deleted.")

    def init_index(self):
        """Create the index with the prepared settings, then put the mapping."""
        self.es.indices.create(index=self.index, body=self.setting)
        self.es.indices.put_mapping(index=self.index, doc_type=self.type, body=self.mapping)
        print("The index has been initialised.")


def main():
    """Rebuild the demo index and print the tokenization before and after.

    The sample sentence is analyzed first. If that succeeds the index is
    taken to exist and is deleted; if it raises, the error is printed and
    the index is treated as missing. The index is then (re)created and the
    sentence is analyzed again. Errors from deletion and creation are
    printed rather than propagated.
    """
    def banner(label):
        # Section header: the label framed by 30 dashes on each side.
        print("-" * 30 + label + "-" * 30)

    analyzer = ESAnalyzer(
        host="localhost",
        port=9200,
        index='sudachi_test',
        type="sudachi_test_type",
        analyzer="sudachi_analyzer",
    )
    sample = "東京特許許可局です"

    banner("Index変更前")
    try:
        print(analyzer(sample))
    except Exception as err:
        # Any failure here is treated as "the index does not exist yet".
        banner("Indexがないので作成します")
        print(err)
    else:
        # Analysis worked, so an index exists: drop it before recreating.
        try:
            analyzer.del_index()
        except Exception as err:
            print(err)

    try:
        analyzer.init_index()
    except Exception as err:
        print(err)

    banner("変更後")
    print(analyzer(sample))




# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

試しに、このまま実行すると

['東京特許許可局', '東京', '特許', '許可', '局']

上記の結果が得られます。
次に、setting関数の以下の部分（analyzerのfilterに指定しているstopword）をコメントアウトします。

els.py

                        "filter": [
                            # "stopword"
                        ]

再度実行すると以下の結果を得ます。

------------------------------Index変更前------------------------------
['東京特許許可局', '東京', '特許', '許可', '局']
The index has been deleted.
The index has been initialised.
------------------------------変更後------------------------------
['東京特許許可局', '東京', '特許', '許可', '局', 'です']

(続)ElasticSearch Sudachi Windows + Python

ElasticSearch Sudachi Windows + Pythonの続編