ElasticSearch Sudachi Windows + Pythonの続編
- 構築はこちらを参考に
- PythonでMapping/Index作成までやる
- SudachiのTokenizerやフィルタの設定を変えたい場合は、setting関数内のJSON(dict)部分を変更して実行
- 自動的にIndexを作り直します
- 前回記事でわからなかったフォルダ指定も解決
- D:\elasticsearch-5.6.5\config\sudachiに辞書ファイルを突っ込むだけでOK
els.py
from elasticsearch import Elasticsearch
def setting(es_analyze):
    """Return the index-creation body that registers a Sudachi-based analyzer.

    Args:
        es_analyze: Name under which the custom analyzer is registered.

    Returns:
        A dict with a single "settings" key holding the "analysis" section:
        one tokenizer ("sudachi_tokenizer"), one stop-word token filter
        ("stopword") and one custom analyzer keyed by ``es_analyze``.
    """
    # Tokenizer definition; "resources_path" is the folder name handed to the
    # Sudachi plugin (the article places the dictionary in config\sudachi).
    tokenizers = {
        "sudachi_tokenizer": {
            "type": "sudachi_tokenizer",
            "mode": "search",
            "discard_punctuation": True,
            "resources_path": "sudachi",
        },
    }

    # Stop-word filter that drops the listed tokens.
    token_filters = {
        "stopword": {
            "type": "stop",
            "stopwords": ["です", "ます"],
        },
    }

    # The custom analyzer wires the tokenizer and the filter together.
    # Edit the "filter" list below to change which token filters are applied.
    analyzers = {
        es_analyze: {
            "type": "custom",
            "char_filter": [],
            "tokenizer": "sudachi_tokenizer",
            "filter": [
                "stopword",
            ],
        },
    }

    return {
        "settings": {
            "analysis": {
                "tokenizer": tokenizers,
                "filter": token_filters,
                "analyzer": analyzers,
            },
        },
    }
def mapping(es_type, es_analyze):
    """Return the mapping body for one document type.

    Args:
        es_type: Document type name used as the top-level key.
        es_analyze: Analyzer name attached to the "_analyzed" field.

    Returns:
        A dict ``{es_type: {"properties": {...}}}`` declaring two string
        fields: "text" (not analyzed, not stored, excluded from _all) and
        "_analyzed" (analyzed with ``es_analyze`` and stored).
    """
    # Raw field: indexed verbatim, never passed through an analyzer.
    raw_field = {
        "type": "string",
        "include_in_all": False,
        "store": False,
        "index": "not_analyzed",
    }

    # Analyzed field: tokenized with the analyzer named by the caller.
    analyzed_field = {
        "type": "string",
        "store": True,
        "index": "analyzed",
        "analyzer": es_analyze,
    }

    return {
        es_type: {
            "properties": {
                "text": raw_field,
                "_analyzed": analyzed_field,
            },
        },
    }
class ESAnalyzer(object):
    """Thin wrapper around an Elasticsearch client for tokenizing text.

    An instance remembers an index, a document type and an analyzer name,
    can (re)create that index with the settings/mapping built by
    ``setting()`` and ``mapping()``, and is callable: ``instance(text)``
    returns the tokens the analyzer produces for ``text``.
    """

    def __init__(self, host="localhost", port=9200, index=None, type=None, analyzer=None):
        """Create the client and precompute the settings and mapping bodies.

        Args:
            host: Elasticsearch host name.
            port: Elasticsearch HTTP port.
            index: Index name; defaults to "test" when None.
            type: Document type name; defaults to "test" when None.
                (The parameter name shadows the builtin but is kept so
                existing keyword callers keep working.)
            analyzer: Analyzer name; defaults to "test" when None.
        """
        if index is None:
            index = "test"
        if type is None:
            type = "test"
        if analyzer is None:
            # Fixed: this branch used to assign ``index = "test"``, which
            # clobbered a caller-supplied index and left the analyzer as None.
            analyzer = "test"

        self.es = Elasticsearch(hosts=[{"host": host, "port": port}], send_get_body_as="POST")
        self.index = index
        self.type = type
        self.analyzer = analyzer
        self.setting = setting(analyzer)
        self.mapping = mapping(type, analyzer)

    def __call__(self, text):
        """Tokenize ``text`` with the configured analyzer.

        Args:
            text: The string to analyze. Falsy values (None, "") short-circuit.

        Returns:
            A list of token strings ordered by their reported position.
            Tokens sharing a position keep the order of the response.
            Returns an empty list for falsy ``text`` without issuing a request.
        """
        if not text:
            return []

        data = self.es.indices.analyze(
            index=self.index,
            body={"analyzer": self.analyzer, "text": text},
        )
        # sorted() is stable, so tokens at the same position stay in
        # response order.
        ordered = sorted(data["tokens"], key=lambda tok: tok["position"])
        return [tok["token"] for tok in ordered]

    def del_index(self):
        """Delete the configured index and print a confirmation message."""
        self.es.indices.delete(index=self.index)
        print("The index has been deleted.")

    def init_index(self):
        """Create the index with the stored settings, then put the mapping."""
        self.es.indices.create(index=self.index, body=self.setting)
        self.es.indices.put_mapping(index=self.index, doc_type=self.type, body=self.mapping)
        print("The index has been initialised.")
def main():
    """Rebuild the demo index and print the tokens before and after.

    Steps: analyze a sample sentence against the current index (if that
    fails, the index is treated as missing), delete the index when the first
    analysis succeeded, create it again from the current settings/mapping,
    and analyze the same sentence once more.
    """
    rule = "-" * 30
    sample = "東京特許許可局です"
    tokenize = ESAnalyzer(
        host="localhost",
        port=9200,
        index='sudachi_test',
        type="sudachi_test_type",
        analyzer="sudachi_analyzer",
    )

    print(rule + "Index変更前" + rule)
    try:
        print(tokenize(sample))
    except Exception as err:
        # Any failure here is reported as "no index yet"; the error itself is
        # printed so other causes remain visible.
        print(rule + "Indexがないので作成します" + rule)
        print(err)
    else:
        # The analysis worked, so the index exists and must be dropped before
        # it can be recreated with the new settings.
        try:
            tokenize.del_index()
        except Exception as err:
            print(err)

    try:
        tokenize.init_index()
    except Exception as err:
        print(err)

    print(rule + "変更後" + rule)
    print(tokenize(sample))


if __name__ == "__main__":
    main()
試しに、このまま実行すると
['東京特許許可局', '東京', '特許', '許可', '局']
上記の結果が得られます。
次に、setting関数内のanalyzer定義にある"filter"リストの"stopword"を、以下のようにコメントアウトします。
els.py
"filter": [
# "stopword"
]
再度実行すると以下の結果を得ます。ストップワードのフィルタを外したため、変更後の結果には'です'が含まれるようになります。
------------------------------Index変更前------------------------------
['東京特許許可局', '東京', '特許', '許可', '局']
The index has been deleted.
The index has been initialised.
------------------------------変更後------------------------------
['東京特許許可局', '東京', '特許', '許可', '局', 'です']