概要
Solr の日本語解析では、デフォルトで形態素解析が行われます。
Java (SolrJ) から Solr の形態素解析結果を取得したいときのコード例です。
管理コンソールでは、Analysis のページで Verbose Output を有効にすると同様の結果を得られます。
ソースコード
package hello.solr;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.DocumentAnalysisRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
/**
 * Minimal SolrJ example that sends one Japanese sentence to a Solr core through
 * a {@link DocumentAnalysisRequest} and prints the analysis result to standard
 * error.
 *
 * <p>The program prints every entry of the first index-time analysis stage
 * group, then, for the tokens reported under
 * {@code org.apache.lucene.analysis.ja.JapaneseTokenizer}, prints the text,
 * type, part of speech and reading of each token. For the first token all
 * attribute names and values are dumped between {@code <names>} markers.
 */
public class HelloAnalysisJapaneseSimple {
    /**
     * Runs the analysis against {@code http://localhost:8983/solr/core_nlp}.
     *
     * @param args ignored
     * @throws IllegalStateException if the response does not contain the
     *         expected "analysis" / field / "index" sections
     * @throws Exception if the request to Solr fails
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    static public void main(String[] args) throws Exception {
        String fieldName = "field_text_ja";
        String coreName = "core_nlp";
        String text = "こんにちは。今日はいい天気ですね。私は日産自動車の社員です。";

        // Document to analyze: an id plus the text placed in the analyzed field.
        HashMap<String, SolrInputField> fields = new HashMap<String, SolrInputField>();
        SolrInputDocument doc = new SolrInputDocument(fields);
        doc.setField("id", "0");
        doc.setField(fieldName, text);

        // Request
        DocumentAnalysisRequest request = new DocumentAnalysisRequest();
        request.addDocument(doc);

        String solrLocation = "http://localhost:8983/solr/" + coreName;

        // SolrClient is Closeable; try-with-resources releases the underlying
        // HTTP resources even when the request throws.
        NamedList<Object> response;
        try (SolrClient client = new HttpSolrClient.Builder(solrLocation).build()) {
            response = client.request(request);
        }

        // Walk the response: "analysis" -> first entry -> field -> "index" -> first entry.
        // NOTE(review): the first entry under "analysis" is presumably the
        // analyzed document (only one is sent) - confirm against the Solr docs.
        NamedList<Object> analysis = (NamedList<Object>) response.get("analysis");
        if (analysis == null || analysis.size() == 0) {
            throw new IllegalStateException(
                    "No 'analysis' section in response from " + solrLocation);
        }
        SimpleOrderedMap docAnalysis = (SimpleOrderedMap) analysis.getVal(0);
        SimpleOrderedMap f = (docAnalysis == null) ? null
                : (SimpleOrderedMap) docAnalysis.get(fieldName);
        if (f == null) {
            throw new IllegalStateException(
                    "No analysis returned for field '" + fieldName + "'");
        }
        SimpleOrderedMap index = (SimpleOrderedMap) f.get("index");
        if (index == null || index.size() == 0) {
            throw new IllegalStateException(
                    "No 'index' analysis returned for field '" + fieldName + "'");
        }
        NamedList nlpResult = (NamedList) index.getVal(0);

        // Dump every stage name together with its raw token list.
        System.err.println("Tokenizer,Filter ---");
        for (int n = 0; n < nlpResult.size(); n++) {
            System.err.println(nlpResult.getName(n) + "=" + nlpResult.getVal(n));
        }

        ArrayList wordListPOS = (ArrayList) nlpResult
                .get("org.apache.lucene.analysis.ja.JapaneseTokenizer");
        if (wordListPOS == null) {
            // No entry for the Japanese tokenizer: nothing more to print.
            return;
        }

        // Attribute keys are loop-invariant, so they are declared once.
        String namePOS = "org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech";
        String nameREADING = "org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading";

        for (int n = 0; n < wordListPOS.size(); n++) {
            SimpleOrderedMap wordPOS = (SimpleOrderedMap) wordListPOS.get(n);
            if (n == 0) {
                // Show all available attribute names once, using the first token.
                System.err.println("<names>");
                for (int m = 0; m < wordPOS.size(); m++) {
                    System.err.println(wordPOS.getName(m) + "=" + wordPOS.getVal(m));
                }
                System.err.println("</names>");
            }
            System.err.println( //
                    "text='" + wordPOS.get("text") + "'" //
                            + ",type='" + wordPOS.get("type") + "'" //
                            + ",partOfSpeech='" + wordPOS.get(namePOS) + "'" //
                            + ",reading='" + wordPOS.get(nameREADING) + "'" //
            );
        }
    }
}
結果
<names>
text=こんにちは
raw_bytes=[e3 81 93 e3 82 93 e3 81 ab e3 81 a1 e3 81 af]
start=0
end=5
org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#positionLength=1
type=word
org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute#termFrequency=1
org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute#baseForm=null
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech=感動詞
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech (en)=interjection
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading=コンニチハ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading (en)=konnichiha
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation=コンニチワ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation (en)=konnichiwa
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType (en)=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm (en)=null
position=1
positionHistory=[1]
</names>
text='こんにちは',type='word',partOfSpeech='感動詞',reading='コンニチハ'
text='今日',type='word',partOfSpeech='名詞-副詞可能',reading='キョウ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='いい',type='word',partOfSpeech='形容詞-自立',reading='イイ'
text='天気',type='word',partOfSpeech='名詞-一般',reading='テンキ'
text='です',type='word',partOfSpeech='助動詞',reading='デス'
text='ね',type='word',partOfSpeech='助詞-終助詞',reading='ネ'
text='私',type='word',partOfSpeech='名詞-代名詞-一般',reading='ワタシ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='日産',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサン'
text='日産自動車',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサンジドウシャ'
text='自動車',type='word',partOfSpeech='名詞-一般',reading='ジドウシャ'
text='の',type='word',partOfSpeech='助詞-連体化',reading='ノ'
text='社員',type='word',partOfSpeech='名詞-一般',reading='シャイン'
text='です',type='word',partOfSpeech='助動詞',reading='デス'