LoginSignup
1
0

More than 5 years have passed since last update.

Apache Solr 7.6 + SolrJ で形態素解析の詳しい結果を取得する(日本語)

Last updated at Posted at 2018-12-27

概要

Solr の日本語解析はデフォルトで形態素解析が行われます。
Javaで形態素解析を利用したいときのコードです。
Solr の管理コンソールでも、Analysis ページで「Verbose Output」を有効にすると同様の結果を得られます。

ソースコード


package hello.solr;

import java.util.ArrayList;
import java.util.HashMap;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.DocumentAnalysisRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;

/**
 * Minimal SolrJ sample that sends one Japanese sentence to a Solr core through
 * a {@link DocumentAnalysisRequest} and prints the index-time analysis result
 * (surface text, token type, part of speech and reading of every token) to
 * standard error.
 *
 * <p>
 * The core name, field name and sample text are hard-coded in {@link #main};
 * the Solr server is expected at {@code http://localhost:8983/solr/}.
 */
public class HelloAnalysisJapaneseSimple {

    /** Key of the part-of-speech attribute inside a token entry. */
    private static final String NAME_POS = "org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech";

    /** Key of the reading attribute inside a token entry. */
    private static final String NAME_READING = "org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading";

    /**
     * Runs the document analysis request and dumps the result to standard error.
     *
     * @param args
     *            ignored
     * @throws Exception
     *             if the request to Solr fails, the client cannot be closed, or
     *             the response lacks the expected "analysis" / field sections
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static void main(String[] args) throws Exception {

        String fieldName = "field_text_ja";
        String coreName = "core_nlp";
        String text = "こんにちは。今日はいい天気ですね。私は日産自動車の社員です。";

        HashMap<String, SolrInputField> fields = new HashMap<String, SolrInputField>();

        // Document to be analyzed: an id plus the text in the target field.
        SolrInputDocument doc = new SolrInputDocument(fields);
        doc.setField("id", "0");
        doc.setField(fieldName, text);

        // Request
        DocumentAnalysisRequest request = new DocumentAnalysisRequest();
        request.addDocument(doc);

        String solrLocation = "http://localhost:8983/solr/" + coreName;

        // try-with-resources guarantees the client is closed even when the
        // request throws; the original code never closed it.
        NamedList<Object> response;
        try (SolrClient client = new HttpSolrClient.Builder(solrLocation).build()) {
            response = client.request(request);
        }

        // Walk down the response: "analysis" -> first entry -> field name
        // -> "index" -> first entry.
        NamedList<Object> analysis = (NamedList<Object>) response
                .get("analysis");
        if (analysis == null || analysis.size() == 0) {
            throw new IllegalStateException(
                    "Solr response contains no analysis section: " + response);
        }

        SimpleOrderedMap f = ((SimpleOrderedMap) ((SimpleOrderedMap) analysis
                .getVal(0)).get(fieldName));
        if (f == null) {
            throw new IllegalStateException(
                    "Solr response contains no analysis for field: " + fieldName);
        }

        SimpleOrderedMap index = (SimpleOrderedMap) f.get("index");

        // Entries of this list are looked up by analyzer component class name
        // below (presumably one entry per tokenizer/filter stage).
        NamedList nlpResult = (NamedList) index.getVal(0);

        System.err.println("Tokenizer,Filter ---");
        for (int n = 0; n < nlpResult.size(); n++) {
            System.err.println(nlpResult.getName(n) + "="
                    + nlpResult.getVal(n));
        }

        // Tokens as emitted by the Japanese tokenizer; null when that
        // tokenizer is not part of the field's analysis chain in the response.
        ArrayList wordListPOS = (ArrayList) nlpResult
                .get("org.apache.lucene.analysis.ja.JapaneseTokenizer");

        if (wordListPOS != null) {
            for (int n = 0; n < wordListPOS.size(); n++) {
                SimpleOrderedMap wordPOS = (SimpleOrderedMap) wordListPOS
                        .get(n);

                // For the first token only, dump every attribute so the
                // available attribute names are visible.
                if (n == 0) {
                    System.err.println("<names>");
                    for (int m = 0; m < wordPOS.size(); m++) {
                        System.err.println(wordPOS.getName(m) + "="
                                + wordPOS.getVal(m));
                    }
                    System.err.println("</names>");
                }

                System.err.println( //
                        "text='" + wordPOS.get("text") + "'" //
                        + ",type='" + wordPOS.get("type") + "'" //
                        + ",partOfSpeech='" + wordPOS.get(NAME_POS) + "'" //
                        + ",reading='" + wordPOS.get(NAME_READING) + "'" //
                        );
            }
        }

    }
}


結果


<names>
text=こんにちは
raw_bytes=[e3 81 93 e3 82 93 e3 81 ab e3 81 a1 e3 81 af]
start=0
end=5
org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#positionLength=1
type=word
org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute#termFrequency=1
org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute#baseForm=null
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech=感動詞
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech (en)=interjection
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading=コンニチハ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading (en)=konnichiha
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation=コンニチワ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation (en)=konnichiwa
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType (en)=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm (en)=null
position=1
positionHistory=[1]
</names>
text='こんにちは',type='word',partOfSpeech='感動詞',reading='コンニチハ'
text='今日',type='word',partOfSpeech='名詞-副詞可能',reading='キョウ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='いい',type='word',partOfSpeech='形容詞-自立',reading='イイ'
text='天気',type='word',partOfSpeech='名詞-一般',reading='テンキ'
text='です',type='word',partOfSpeech='助動詞',reading='デス'
text='ね',type='word',partOfSpeech='助詞-終助詞',reading='ネ'
text='私',type='word',partOfSpeech='名詞-代名詞-一般',reading='ワタシ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='日産',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサン'
text='日産自動車',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサンジドウシャ'
text='自動車',type='word',partOfSpeech='名詞-一般',reading='ジドウシャ'
text='の',type='word',partOfSpeech='助詞-連体化',reading='ノ'
text='社員',type='word',partOfSpeech='名詞-一般',reading='シャイン'
text='です',type='word',partOfSpeech='助動詞',reading='デス'


1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0