1. hogedeath

    Posted

    hogedeath
Changes in title
+(続)ElasticSearch Sudachi Windows + Python
Changes in tags
Changes in body
Source | HTML | Preview
@@ -0,0 +1,177 @@
+# [ElasticSearch Sudachi Windows + Python](https://qiita.com/hogedeath/items/d609ce74f330d404fb6c)の続編
+- [構築はこちらを参考に](https://qiita.com/sorami/items/99604ef105f13d2d472b)
+- PythonでMapping/Index作成までやる
+ - SudachiのTokenizerを弄りたい場合は、settingのJson部分を変更して実行
+ - 自動的にIndexを作り直します
+ - 前回記事でわからなかったフォルダ指定も解決
+ - D:\elasticsearch-5.6.5\config\sudachiに辞書ファイルを突っ込むだけでOK
+
+
+```els.py
+from elasticsearch import Elasticsearch
+
+
def setting(es_analyze):
    """Build Elasticsearch index settings with a Sudachi-based custom analyzer.

    es_analyze: name under which the custom analyzer is registered.
    Returns the settings dict to pass to ``indices.create``.
    """
    tokenizer = {
        "sudachi_tokenizer": {
            "type": "sudachi_tokenizer",
            "mode": "search",
            "discard_punctuation": True,
            # dictionary files are looked up under <ES_HOME>/config/sudachi
            "resources_path": "sudachi",
        }
    }
    token_filter = {
        "stopword": {
            "type": "stop",
            "stopwords": ["です", "ます"],
        }
    }
    analyzer = {
        es_analyze: {
            "type": "custom",
            "char_filter": [],
            "tokenizer": "sudachi_tokenizer",
            "filter": ["stopword"],
        }
    }
    return {
        "settings": {
            "analysis": {
                "tokenizer": tokenizer,
                "filter": token_filter,
                "analyzer": analyzer,
            }
        }
    }
+
+
def mapping(es_type, es_analyze):
    """Build the doc-type mapping: a raw ``text`` field plus an analyzed copy.

    es_type: document type name the mapping is keyed by.
    es_analyze: analyzer name applied to the ``_analyzed`` field.
    Returns the mapping dict to pass to ``indices.put_mapping``.
    """
    properties = {
        "text": {
            # stored verbatim, never run through an analyzer
            "type": "string",
            "include_in_all": False,
            "store": False,
            "index": "not_analyzed",
        },
        "_analyzed": {
            # tokenized with the custom analyzer so tokens can be inspected
            "type": "string",
            "store": True,
            "index": "analyzed",
            "analyzer": es_analyze,
        },
    }
    return {es_type: {"properties": properties}}
+
+
class ESAnalyzer(object):
    """Thin wrapper around one Elasticsearch index for analyzer experiments.

    Calling the instance with a text runs the configured analyzer on the
    index and returns the resulting tokens in positional order.
    """

    def __init__(self, host="localhost", port=9200, index=None, type=None, analyzer=None):
        # Fall back to "test" for any name the caller did not supply.
        if index is None:
            index = "test"
        if type is None:
            type = "test"
        if analyzer is None:
            # BUG FIX: the original assigned to `index` here, which left
            # `analyzer` as None and clobbered a caller-supplied index.
            analyzer = "test"

        self.es = Elasticsearch(hosts=[{"host": host, "port": port}], send_get_body_as="POST")
        self.index = index
        self.type = type
        self.analyzer = analyzer
        self.setting = setting(analyzer)
        self.mapping = mapping(type, analyzer)

    def __call__(self, text):
        """Analyze `text` with the configured analyzer; return tokens sorted by position."""
        if not text:
            return []

        data = self.es.indices.analyze(index=self.index,
                                       body={"analyzer": self.analyzer, "text": text})

        # keep only the token strings, ordered by their position in the input
        pairs = [(t["token"], t["position"]) for t in data["tokens"]]
        return [token for token, _ in sorted(pairs, key=lambda p: p[1])]

    def del_index(self):
        """Delete the wrapped index."""
        self.es.indices.delete(index=self.index)
        print("The index has been deleted.")

    def init_index(self):
        """Create the index with the Sudachi settings and apply the mapping."""
        self.es.indices.create(index=self.index, body=self.setting)
        self.es.indices.put_mapping(index=self.index, doc_type=self.type, body=self.mapping)
        print("The index has been initialised.")
+
+
def main():
    """Demo driver: analyze a sample text, recreate the index, analyze again."""
    analyzer = ESAnalyzer(
        host="localhost",
        port=9200,
        index='sudachi_test',
        type="sudachi_test_type",
        analyzer="sudachi_analyzer",
    )

    text = "東京特許許可局です"

    print("-" * 30 + "Index変更前" + "-" * 30)
    already_index_exist = False
    try:
        # If the index is missing this raises, and we create it below.
        print(analyzer(text))
        already_index_exist = True
    except Exception as e:
        print("-" * 30 + "Indexがないので作成します" + "-" * 30)
        print(e)

    if already_index_exist:
        try:
            analyzer.del_index()
        except Exception as e:
            print(e)

    try:
        analyzer.init_index()
    except Exception as e:
        print(e)

    print("-" * 30 + "変更後" + "-" * 30)
    print(analyzer(text))


if __name__ == "__main__":
    main()
+
+```
+
+試しに、このまま実行すると
+
+```text
+['東京特許許可局', '東京', '特許', '許可', '局']
+```
+
+上記の結果が得られます。
+次に、settingの以下の部分をコメントアウト
+
+```els.py
+ "filter": [
+ # "stopword"
+ ]
+```
+再度実行すると以下の結果を得ます。
+
+```text
+------------------------------Index変更前------------------------------
+['東京特許許可局', '東京', '特許', '許可', '局']
+The index has been deleted.
+The index has been initialised.
+------------------------------変更後------------------------------
+['東京特許許可局', '東京', '特許', '許可', '局', 'です']
+```