日本語検索(形態素解析)、nGram検索(2文字とかで分割して検索するやつ)をやってみたメモです。
環境は「ElasticSearchで日本語検索するためのローカル環境構築」で作成したものを使いました。
準備
設定とマッピングを登録する
インデックス「test」を作成する
PUT http://localhost:9200/test
{
"settings": {
"index": {
"analysis": {
"tokenizer": {
"ja_text_tokenizer": {
"type": "kuromoji_tokenizer",
"mode": "search"
},
"ngram_tokenizer": {
"type": "nGram",
"min_gram" : 2,
"max_gram" : 2,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"ja_text_analyzer": {
"tokenizer": "ja_text_tokenizer",
"type": "custom"
},
"ngram_analyzer": {
"tokenizer": "ngram_tokenizer",
"type": "custom"
}
}
}
}
},
"mappings": {
"account": {
"properties": {
"name": {
"type": "string",
"analyzer": "ja_text_analyzer"
},
"userId": {
"type": "string",
"analyzer": "ngram_analyzer"
},
"id": {
"type": "long"
},
"description": {
"type": "string",
"analyzer": "ngram_analyzer"
}
}
}
}
}
データを登録する(以下の3件のドキュメントを、それぞれ別のリクエストでPOSTする)
POST http://localhost:9200/test/account
{
"id" : 1,
"name" : "nattyナッティ",
"userId" : "natty420",
"description" : "ナッティと言います。よろしくお願いします。"
}
{
"id" : 2,
"name" : "もりちゃん",
"userId" : "morichan",
"description" : "冷麺の季節ですね"
}
{
"id" : 3,
"name" : "どんちゃん",
"userId" : "donchan",
"description" : "@morichanとは大の仲良し。また一緒に渋谷で冷麺食べにいきたいね!もりちゃん隊"
}
検索してみる
account内にて「name」に「ちゃん」が含まれているもの
POST http://localhost:9200/test/account/_search
{
"query": {
"match": {
"name" : "ちゃん"
}
}
}
結果
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits":
{
"total": 2,
"max_score": 0.25811607,
"hits": [
{
"_index": "test",
"_type": "account",
"_id": "AV0qOYLCNr31zgqLSH0o",
"_score": 0.25811607,
"_source": {
"id": "3",
"name": "どんちゃん",
"userId": "donchan",
"description": "@morichanとは大の仲良し。また一緒に渋谷で冷麺食べにいきたいね!もりちゃん隊"
}
},
{
"_index": "test",
"_type": "account",
"_id": "AV0qOWMrNr31zgqLSH0n",
"_score": 0.25811607,
"_source": {
"id": "2",
"name": "もりちゃん",
"userId": "morichan",
"description": "冷麺の季節ですね"
}
}
]
}
}
account内にて「description」に「冷麺食べに」が含まれているもの
POST http://localhost:9200/test/account/_search
{
"query": {
"match": {
"description" : "冷麺食べに"
}
}
}
結果
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits":
{
"total": 2,
"max_score": 1.3991506,
"hits": [
{
"_index": "test",
"_type": "account",
"_id": "AV0qOYLCNr31zgqLSH0o",
"_score": 1.3991506,
"_source": {
"id": "3",
"name": "どんちゃん",
"userId": "donchan",
"description": "@morichanとは大の仲良し。また一緒に渋谷で冷麺食べにいきたいね!もりちゃん隊"
}
},
{
"_index": "test",
"_type": "account",
"_id": "AV0qOWMrNr31zgqLSH0n",
"_score": 0.51623213,
"_source": {
"id": "2",
"name": "もりちゃん",
"userId": "morichan",
"description": "冷麺の季節ですね"
}
}
]
}
}
account内にて「userId」に「ch」が含まれているもの
POST http://localhost:9200/test/account/_search
{
"query": {
"match": {
"userId" : "ch"
}
}
}
結果
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.28582606,
"hits": [
{
"_index": "test",
"_type": "account",
"_id": "AV0qOWMrNr31zgqLSH0n",
"_score": 0.28582606,
"_source": {
"id": "2",
"name": "もりちゃん",
"userId": "morichan",
"description": "冷麺の季節ですね"
}
},
{
"_index": "test",
"_type": "account",
"_id": "AV0qOYLCNr31zgqLSH0o",
"_score": 0.26742277,
"_source": {
"id": "3",
"name": "どんちゃん",
"userId": "donchan",
"description": "@morichanとは大の仲良し。また一緒に渋谷で冷麺食べにいきたいね!もりちゃん隊"
}
}
]
}
}