LoginSignup
1
3

More than 5 years have passed since last update.

(続)ElasticSearch Sudachi Windows + Python

Posted at

ElasticSearch Sudachi Windows + Pythonの続編

  • 構築はこちらを参考に
  • PythonでMapping/Index作成までやる
    • SudachiのTokenizerを弄りたい場合は、settingのJSON部分を変更して実行
    • 自動的にIndexを作り直します
    • 前回記事でわからなかったフォルダ指定も解決
      • D:\elasticsearch-5.6.5\config\sudachiに辞書ファイルを突っ込むだけでOK
els.py
from elasticsearch import Elasticsearch


def setting(es_analyze):
    """Build the index-creation body that registers a Sudachi custom analyzer.

    Args:
        es_analyze: Name under which the custom analyzer is registered in the
            ``analysis.analyzer`` section.

    Returns:
        A freshly built dict with a single ``"settings"`` key containing the
        tokenizer, token-filter and analyzer definitions.
    """
    # Tokenizer definition for the Sudachi plugin.
    # NOTE(review): "resources_path" is presumably resolved relative to the
    # Elasticsearch config directory -- confirm against the plugin docs.
    sudachi_tokenizer = {
        "type": "sudachi_tokenizer",
        "mode": "search",
        "discard_punctuation": True,
        "resources_path": "sudachi",
    }

    # Stop filter listing the two tokens to drop from the analyzer output.
    stopword_filter = {
        "type": "stop",
        "stopwords": ["です", "ます"],
    }

    # Custom analyzer: no char filters, Sudachi tokenizer, then the stop filter.
    custom_analyzer = {
        "type": "custom",
        "char_filter": [],
        "tokenizer": "sudachi_tokenizer",
        "filter": ["stopword"],
    }

    analysis = {
        "tokenizer": {"sudachi_tokenizer": sudachi_tokenizer},
        "filter": {"stopword": stopword_filter},
        "analyzer": {es_analyze: custom_analyzer},
    }
    return {"settings": {"analysis": analysis}}


def mapping(es_type, es_analyze):
    """Build the mapping body for one document type.

    Args:
        es_type: Document type name used as the top-level key of the mapping.
        es_analyze: Analyzer name assigned to the ``_analyzed`` field.

    Returns:
        A freshly built dict ``{es_type: {"properties": {...}}}`` describing
        two string fields: ``text`` (not analyzed, not stored) and
        ``_analyzed`` (analyzed with ``es_analyze``, stored).
    """
    # Raw text field: indexed verbatim and excluded from ``_all``.
    raw_field = {
        "type": "string",
        "include_in_all": False,
        "store": False,
        "index": "not_analyzed",
    }

    # Analyzed field: run through the named analyzer and stored.
    analyzed_field = {
        "type": "string",
        "store": True,
        "index": "analyzed",
        "analyzer": es_analyze,
    }

    properties = {"text": raw_field, "_analyzed": analyzed_field}
    return {es_type: {"properties": properties}}


class ESAnalyzer(object):
    """Small helper around an Elasticsearch client for one index and analyzer.

    An instance holds the client, the index/type/analyzer names, and the
    settings and mapping bodies built from them. Calling the instance with a
    text runs the analyze API and returns the resulting token strings.
    """

    def __init__(self, host="localhost", port=9200, index=None, type=None, analyzer=None):
        """Create the client and prepare the settings/mapping bodies.

        Args:
            host: Elasticsearch host name.
            port: Elasticsearch HTTP port.
            index: Index name; defaults to ``"test"`` when ``None``.
            type: Document type name; defaults to ``"test"`` when ``None``.
                (The name shadows the builtin but is part of the keyword
                interface, so it is kept.)
            analyzer: Analyzer name; defaults to ``"test"`` when ``None``.
        """
        if index is None:
            index = "test"
        if type is None:
            type = "test"
        if analyzer is None:
            # Previously this branch reassigned ``index`` by mistake, which
            # left ``analyzer`` as None and clobbered the index name.
            analyzer = "test"

        self.es = Elasticsearch(hosts=[{"host": host, "port": port}], send_get_body_as="POST")
        self.index = index
        self.type = type
        self.analyzer = analyzer
        self.setting = setting(analyzer)
        self.mapping = mapping(type, analyzer)

    def __call__(self, text):
        """Analyze ``text`` and return its token strings ordered by position.

        Args:
            text: Text to analyze. A falsy value (``None``, ``""``) returns an
                empty list without contacting the server.

        Returns:
            List of token strings sorted by their ``position`` value; tokens
            sharing a position keep the order of the response.
        """
        if not text:
            return []

        response = self.es.indices.analyze(
            index=self.index,
            body={"analyzer": self.analyzer, "text": text},
        )

        # ``sorted`` is stable, so equal positions keep response order.
        ordered = sorted(response["tokens"], key=lambda tok: tok["position"])
        return [tok["token"] for tok in ordered]

    def del_index(self):
        """Delete the configured index and print a confirmation message."""
        self.es.indices.delete(index=self.index)
        print("The index has been deleted.")

    def init_index(self):
        """Create the index with the prepared settings, then put the mapping."""
        self.es.indices.create(index=self.index, body=self.setting)
        self.es.indices.put_mapping(index=self.index, doc_type=self.type, body=self.mapping)
        print("The index has been initialised.")


def main():
    """Analyze a sample sentence, recreate the index, and analyze it again.

    The first analyze call doubles as an existence probe: if it fails, the
    index is treated as missing and deletion is skipped. Errors from deleting
    or creating the index are printed rather than raised.
    """
    analyzer = ESAnalyzer(
        host="localhost",
        port=9200,
        index='sudachi_test',
        type="sudachi_test_type",
        analyzer="sudachi_analyzer",
    )
    sample = "東京特許許可局です"
    rule = "-" * 30

    print(rule + "Index変更前" + rule)
    try:
        print(analyzer(sample))
    except Exception as err:
        # Any failure here is taken to mean the index does not exist yet.
        index_exists = False
        print(rule + "Indexがないので作成します" + rule)
        print(err)
    else:
        index_exists = True

    if index_exists:
        # Drop the old index so changed settings take effect on re-creation.
        try:
            analyzer.del_index()
        except Exception as err:
            print(err)

    try:
        analyzer.init_index()
    except Exception as err:
        print(err)

    print(rule + "変更後" + rule)
    print(analyzer(sample))




# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

試しに、このまま実行すると

['東京特許許可局', '東京', '特許', '許可', '局']

上記の結果が得られます。
次に、settingの以下の部分をコメントアウトします。

els.py
                        "filter": [
                            # "stopword"
                        ]

再度実行すると以下の結果を得ます。

------------------------------Index変更前------------------------------
['東京特許許可局', '東京', '特許', '許可', '局']
The index has been deleted.
The index has been initialised.
------------------------------変更後------------------------------
['東京特許許可局', '東京', '特許', '許可', '局', 'です']
1
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
3