More than 5 years have passed since last update.

Apache Solr 7.6 + SolrJ で形態素解析の詳しい結果を取得する（日本語）

Last updated at 2018-12-27 / Posted at 2018-12-27

概要

Solr の日本語フィールドは、デフォルトで形態素解析によって解析されます。
その形態素解析の詳しい結果を Java（SolrJ）から取得するためのコードです。
管理コンソールの Analysis ページで Verbose Output を有効にすると、同様の結果が得られます。

ソースコード


package hello.solr;

import java.util.ArrayList;
import java.util.HashMap;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.DocumentAnalysisRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;

/**
 * Minimal SolrJ example that sends one Japanese sentence to a Solr core's
 * document-analysis handler and prints the verbose morphological analysis
 * (surface text, token type, part of speech, reading) of every token
 * produced by the {@code JapaneseTokenizer} stage.
 *
 * <p>
 * All output is written to {@code System.err}.
 */
public class HelloAnalysisJapaneseSimple {

	/** Key of the tokenizer stage inside the per-field analysis result. */
	private static final String TOKENIZER_STAGE = "org.apache.lucene.analysis.ja.JapaneseTokenizer";

	/** Token attribute key holding the part-of-speech tag. */
	private static final String ATTR_PART_OF_SPEECH = "org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech";

	/** Token attribute key holding the reading of the token. */
	private static final String ATTR_READING = "org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading";

	/**
	 * Runs the analysis request against {@code http://localhost:8983/solr/core_nlp}
	 * and dumps the result.
	 *
	 * @param args
	 *            ignored
	 * @throws Exception
	 *             if the request fails, or if the response does not contain
	 *             index-time analysis for the requested field
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	static public void main(String[] args) throws Exception {

		String fieldName = "field_text_ja";
		String coreName = "core_nlp";
		String text = "こんにちは。今日はいい天気ですね。私は日産自動車の社員です。";

		// Build the single document whose field value is to be analyzed.
		HashMap<String, SolrInputField> fields = new HashMap<String, SolrInputField>();
		SolrInputDocument doc = new SolrInputDocument(fields);
		doc.setField("id", "0");
		doc.setField(fieldName, text);

		DocumentAnalysisRequest request = new DocumentAnalysisRequest();
		request.addDocument(doc);

		String solrLocation = "http://localhost:8983/solr/" + coreName;

		// SolrClient is Closeable; try-with-resources releases the underlying
		// HTTP resources even when the request throws.
		NamedList<Object> response;
		try (SolrClient client = new HttpSolrClient.Builder(solrLocation).build()) {
			response = client.request(request);
		}

		// Walk the response: analysis -> first document -> field -> "index".
		NamedList<Object> analysis = (NamedList<Object>) response
				.get("analysis");
		if (analysis == null || analysis.size() == 0) {
			throw new IllegalStateException(
					"Response contains no analysis section");
		}

		SimpleOrderedMap docAnalysis = (SimpleOrderedMap) analysis.getVal(0);
		SimpleOrderedMap fieldAnalysis = (SimpleOrderedMap) docAnalysis
				.get(fieldName);
		if (fieldAnalysis == null) {
			throw new IllegalStateException(
					"Response contains no analysis for field: " + fieldName);
		}

		SimpleOrderedMap index = (SimpleOrderedMap) fieldAnalysis.get("index");
		if (index == null || index.size() == 0) {
			throw new IllegalStateException(
					"Response contains no index-time analysis for field: "
							+ fieldName);
		}

		// First entry of the index-time analysis; its entries are the
		// tokenizer/filter stages printed below.
		NamedList nlpResult = (NamedList) index.getVal(0);

		System.err.println("Tokenizer,Filter ---");
		for (int n = 0; n < nlpResult.size(); n++) {
			System.err.println(nlpResult.getName(n) + "="
					+ nlpResult.getVal(n));
		}

		ArrayList wordListPOS = (ArrayList) nlpResult.get(TOKENIZER_STAGE);

		// The tokenizer stage is absent when the field type uses a different
		// tokenizer; in that case there is nothing more to print.
		if (wordListPOS == null) {
			return;
		}

		for (int n = 0; n < wordListPOS.size(); n++) {
			SimpleOrderedMap wordPOS = (SimpleOrderedMap) wordListPOS.get(n);

			// For the first token only, list every attribute name/value so the
			// reader can see which keys are available.
			if (n == 0) {
				System.err.println("<names>");
				for (int m = 0; m < wordPOS.size(); m++) {
					System.err.println(wordPOS.getName(m) + "="
							+ wordPOS.getVal(m));
				}
				System.err.println("</names>");
			}

			System.err.println("text='" + wordPOS.get("text") + "'"
					+ ",type='" + wordPOS.get("type") + "'"
					+ ",partOfSpeech='" + wordPOS.get(ATTR_PART_OF_SPEECH) + "'"
					+ ",reading='" + wordPOS.get(ATTR_READING) + "'");
		}

	}
}

結果


<names>
text=こんにちは
raw_bytes=[e3 81 93 e3 82 93 e3 81 ab e3 81 a1 e3 81 af]
start=0
end=5
org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#positionLength=1
type=word
org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute#termFrequency=1
org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute#baseForm=null
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech=感動詞
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech (en)=interjection
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading=コンニチハ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading (en)=konnichiha
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation=コンニチワ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation (en)=konnichiwa
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType (en)=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm (en)=null
position=1
positionHistory=[1]
</names>
text='こんにちは',type='word',partOfSpeech='感動詞',reading='コンニチハ'
text='今日',type='word',partOfSpeech='名詞-副詞可能',reading='キョウ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='いい',type='word',partOfSpeech='形容詞-自立',reading='イイ'
text='天気',type='word',partOfSpeech='名詞-一般',reading='テンキ'
text='です',type='word',partOfSpeech='助動詞',reading='デス'
text='ね',type='word',partOfSpeech='助詞-終助詞',reading='ネ'
text='私',type='word',partOfSpeech='名詞-代名詞-一般',reading='ワタシ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='日産',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサン'
text='日産自動車',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサンジドウシャ'
text='自動車',type='word',partOfSpeech='名詞-一般',reading='ジドウシャ'
text='の',type='word',partOfSpeech='助詞-連体化',reading='ノ'
text='社員',type='word',partOfSpeech='名詞-一般',reading='シャイン'
text='です',type='word',partOfSpeech='助動詞',reading='デス'

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up