NLP4J - English Morphological Analysis in Java (Using Stanford NLP)

What is Stanford NLP?

A set of statistical, deep-learning, and rule-based NLP tools for major computational linguistics problems, provided by the Stanford NLP Group.

Software - The Stanford Natural Language Processing Group
https://nlp.stanford.edu/software/

Item                | Description
Provider            | Stanford NLP Group, Stanford University
Distribution format | Java library

Example (without NLP4J)

package hello.stanford;

import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class HelloStanfordNlp001PosLemma {
    public static void main(String[] args) {
        String text = "This is test. He runs fast.";
        Properties properties = new Properties();
        // "tokenize, ssplit, pos, lemma, depparse"
        // https://stanfordnlp.github.io/CoreNLP/annotators.html
        // tokenize : TokenizerAnnotator
        // ssplit : WordsToSentencesAnnotator : Splits a sequence of tokens into
        // sentences.
        // pos : POSTaggerAnnotator : Labels tokens with their POS tag.
        // lemma : 見出し
        // lemma : MorphaAnnotator : Generates the word lemmas for all tokens in the
        // corpus.
        properties.setProperty("annotators", "tokenize, ssplit, pos, lemma");
        StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);
        Annotation annotation = new Annotation(text);
        coreNLP.annotate(annotation);
        {
            List<CoreLabel> cl = annotation.get(TokensAnnotation.class);
            for (CoreLabel label : cl) {
                System.err.println("<token>");
                // [class edu.stanford.nlp.ling.CoreAnnotations$ValueAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$OriginalTextAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$BeforeAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$AfterAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$IsNewlineAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$IndexAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$PartOfSpeechAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$LemmaAnnotation]
                System.err.println("SentenceIndexAnnotation=" + label.get(SentenceIndexAnnotation.class));
                System.err.println("PartOfSpeechAnnotation=" + label.get(PartOfSpeechAnnotation.class));
                System.err.println("LemmaAnnotation=" + label.get(LemmaAnnotation.class));
                System.err.println("</token>");
            }
        }
        System.err.println("---");
        System.err.println("TextAnnotation");
        // class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
        {
            System.err.println(annotation.get(TextAnnotation.class));
        }
        // class edu.stanford.nlp.ling.CoreAnnotations$SentencesAnnotation
        System.err.println("SentencesAnnotation");

        {
            List<CoreMap> sentenceMap = annotation.get(SentencesAnnotation.class);

            for (CoreMap label : sentenceMap) {
                System.err.println("<sentence>");
                // [class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
                System.err.println("TextAnnotation=" + label.get(TextAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation,
                System.err.println("CharacterOffsetBeginAnnotation=" + label.get(CharacterOffsetBeginAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation,
                System.err.println("CharacterOffsetEndAnnotation=" + label.get(CharacterOffsetEndAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation,
                System.err.println("TokensAnnotation=" + label.get(TokensAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation,
                System.err.println("SentenceIndexAnnotation=" + label.get(SentenceIndexAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation,
                System.err.println("TokenBeginAnnotation=" + label.get(TokenBeginAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation]
                System.err.println("TokenEndAnnotation=" + label.get(TokenEndAnnotation.class));
                System.err.println("</sentence>");
            }
        }
    }
}

Output Example (without NLP4J)

<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=DT
LemmaAnnotation=this
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=be
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=NN
LemmaAnnotation=test
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=PRP
LemmaAnnotation=he
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=run
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=RB
LemmaAnnotation=fast
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
---
TextAnnotation
This is test. He runs fast.
SentencesAnnotation
<sentence>
TextAnnotation=This is test.
CharacterOffsetBeginAnnotation=0
CharacterOffsetEndAnnotation=13
TokensAnnotation=[This-1, is-2, test-3, .-4]
SentenceIndexAnnotation=0
TokenBeginAnnotation=0
TokenEndAnnotation=4
</sentence>
<sentence>
TextAnnotation=He runs fast.
CharacterOffsetBeginAnnotation=14
CharacterOffsetEndAnnotation=27
TokensAnnotation=[He-1, runs-2, fast-3, .-4]
SentenceIndexAnnotation=1
TokenBeginAnnotation=4
TokenEndAnnotation=8
</sentence>
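
The same information can also be read through CoreNLP's convenience wrapper classes instead of the raw annotation keys. The following is a minimal sketch, not part of the original example: it assumes the same annotator settings and uses CoreDocument together with the CoreLabel accessors word(), tag() and lemma(); the class name HelloStanfordNlp002CoreDocument is only illustrative.

package hello.stanford;

import java.util.Properties;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class HelloStanfordNlp002CoreDocument {
    public static void main(String[] args) {
        Properties properties = new Properties();
        // Same annotators as in the example above
        properties.setProperty("annotators", "tokenize, ssplit, pos, lemma");
        StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);

        // CoreDocument wraps Annotation and exposes typed accessors
        CoreDocument document = new CoreDocument("This is test. He runs fast.");
        coreNLP.annotate(document);

        for (CoreLabel token : document.tokens()) {
            // word() = surface form, tag() = POS tag, lemma() = base form
            System.err.println(token.word() + "\t" + token.tag() + "\t" + token.lemma());
        }
    }
}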

Maven Dependencies for NLP4J

<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-core</artifactId>
    <version>[1.3.1.0,)</version>
</dependency>
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-stanford</artifactId>
    <version>[1.3.0.0,)</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.0.0</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.0.0</version>
    <classifier>models</classifier>
    <scope>provided</scope>
</dependency>

Example (with NLP4J)

package nlp4j.stanford.examples;

import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosAnnotator;

public class StanfordPosAnnotatorExample0 {

    public static void main(String[] args) throws Exception {
        Document doc = new DefaultDocument();
        {
            doc.putAttribute("text", "I eat sushi with chopsticks.");
        }
        StanfordPosAnnotator ann = new StanfordPosAnnotator();
        {
            ann.setProperty("target", "text");
        }
        ann.annotate(doc); // do annotation
        for (Keyword kwd : doc.getKeywords()) {
            System.err.println(kwd);
        }
    }
}

Output Example (with NLP4J)

I [facet=word.PRP, str=I]
eat [facet=word.VBP, str=eat]
sushi [facet=word.NN, str=sushi]
with [facet=word.IN, str=with]
chopstick [facet=word.NNS, str=chopsticks]
. [facet=word.., str=.]
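
As the output shows, each Keyword carries a facet of the form word.<POS tag>, the lemma, and the original surface string. The sketch below filters the keywords down to nouns; it is not from the original article, and the accessor names getFacet(), getLex() and getStr() are assumptions inferred from the printed output, as is the class name StanfordPosAnnotatorExample1.

package nlp4j.stanford.examples;

import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosAnnotator;

public class StanfordPosAnnotatorExample1 {

    public static void main(String[] args) throws Exception {
        Document doc = new DefaultDocument();
        doc.putAttribute("text", "I eat sushi with chopsticks.");

        StanfordPosAnnotator ann = new StanfordPosAnnotator();
        ann.setProperty("target", "text");
        ann.annotate(doc); // do annotation

        // Keep only noun keywords ("word.NN" / "word.NNS").
        // getFacet(), getLex() and getStr() are assumed accessors,
        // inferred from the printed output above.
        for (Keyword kwd : doc.getKeywords()) {
            if (kwd.getFacet().startsWith("word.NN")) {
                System.err.println(kwd.getLex() + " (surface: " + kwd.getStr() + ")");
            }
        }
    }
}

With the sample sentence this should print something like "sushi (surface: sushi)" and "chopstick (surface: chopsticks)".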
