NLP4J - English Morphological Analysis in Java (Using Stanford NLP)

What is Stanford NLP?

A set of statistical, deep-learning, and rule-based NLP tools for major computational linguistics problems, provided by the Stanford NLP Group.

Software - The Stanford Natural Language Processing Group
https://nlp.stanford.edu/software/

Item                | Description
Provider            | Stanford NLP Group, Stanford University
Distribution format | Java library

Example (without NLP4J)

package hello.stanford;

import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class HelloStanfordNlp001PosLemma {
    public static void main(String[] args) {
        String text = "This is test. He runs fast.";
        Properties properties = new Properties();
        // "tokenize, ssplit, pos, lemma, depparse"
        // https://stanfordnlp.github.io/CoreNLP/annotators.html
        // tokenize : TokenizerAnnotator
        // ssplit : WordsToSentencesAnnotator : Splits a sequence of tokens into
        // sentences.
        // pos : POSTaggerAnnotator : Labels tokens with their POS tag.
        // lemma : 見出し
        // lemma : MorphaAnnotator : Generates the word lemmas for all tokens in the
        // corpus.
        properties.setProperty("annotators", "tokenize, ssplit, pos, lemma");
        StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);
        Annotation annotation = new Annotation(text);
        coreNLP.annotate(annotation);
        {
            List<CoreLabel> cl = annotation.get(TokensAnnotation.class);
            for (CoreLabel label : cl) {
                System.err.println("<token>");
                // [class edu.stanford.nlp.ling.CoreAnnotations$ValueAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$OriginalTextAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$BeforeAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$AfterAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$IsNewlineAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$IndexAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$PartOfSpeechAnnotation,
                // class edu.stanford.nlp.ling.CoreAnnotations$LemmaAnnotation]
                System.err.println("SentenceIndexAnnotation=" + label.get(SentenceIndexAnnotation.class));
                System.err.println("PartOfSpeechAnnotation=" + label.get(PartOfSpeechAnnotation.class));
                System.err.println("LemmaAnnotation=" + label.get(LemmaAnnotation.class));
                System.err.println("</token>");
            }
        }
        System.err.println("---");
        System.err.println("TextAnnotation");
        // class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
        {
            System.err.println(annotation.get(TextAnnotation.class));
        }
        // class edu.stanford.nlp.ling.CoreAnnotations$SentencesAnnotation
        System.err.println("SentencesAnnotation");

        {
            List<CoreMap> sentenceMap = annotation.get(SentencesAnnotation.class);

            for (CoreMap label : sentenceMap) {
                System.err.println("<sentence>");
                // [class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
                System.err.println("TextAnnotation=" + label.get(TextAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation,
                System.err.println("CharacterOffsetBeginAnnotation=" + label.get(CharacterOffsetBeginAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation,
                System.err.println("CharacterOffsetEndAnnotation=" + label.get(CharacterOffsetEndAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation,
                System.err.println("TokensAnnotation=" + label.get(TokensAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation,
                System.err.println("SentenceIndexAnnotation=" + label.get(SentenceIndexAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation,
                System.err.println("TokenBeginAnnotation=" + label.get(TokenBeginAnnotation.class));
                // class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation]
                System.err.println("TokenEndAnnotation=" + label.get(TokenEndAnnotation.class));
                System.err.println("</sentence>");
            }
        }
    }
}

Output Example (without NLP4J)

<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=DT
LemmaAnnotation=this
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=be
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=NN
LemmaAnnotation=test
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=PRP
LemmaAnnotation=he
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=run
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=RB
LemmaAnnotation=fast
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
---
TextAnnotation
This is test. He runs fast.
SentencesAnnotation
<sentence>
TextAnnotation=This is test.
CharacterOffsetBeginAnnotation=0
CharacterOffsetEndAnnotation=13
TokensAnnotation=[This-1, is-2, test-3, .-4]
SentenceIndexAnnotation=0
TokenBeginAnnotation=0
TokenEndAnnotation=4
</sentence>
<sentence>
TextAnnotation=He runs fast.
CharacterOffsetBeginAnnotation=14
CharacterOffsetEndAnnotation=27
TokensAnnotation=[He-1, runs-2, fast-3, .-4]
SentenceIndexAnnotation=1
TokenBeginAnnotation=4
TokenEndAnnotation=8
</sentence>
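
The same information can also be read through CoreNLP's convenience wrapper classes instead of the raw annotation keys. The following is a minimal sketch, not part of the original example: it assumes the same annotator settings and uses CoreDocument together with the CoreLabel accessors word(), tag() and lemma(); the class name HelloStanfordNlp002CoreDocument is only illustrative.

package hello.stanford;

import java.util.Properties;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class HelloStanfordNlp002CoreDocument {
    public static void main(String[] args) {
        Properties properties = new Properties();
        // Same annotators as in the example above
        properties.setProperty("annotators", "tokenize, ssplit, pos, lemma");
        StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);

        // CoreDocument wraps Annotation and exposes typed accessors
        CoreDocument document = new CoreDocument("This is test. He runs fast.");
        coreNLP.annotate(document);

        for (CoreLabel token : document.tokens()) {
            // word() = surface form, tag() = POS tag, lemma() = base form
            System.err.println(token.word() + "\t" + token.tag() + "\t" + token.lemma());
        }
    }
}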

Maven Dependencies for NLP4J

<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-core</artifactId>
    <version>[1.3.1.0,)</version>
</dependency>
<dependency>
    <groupId>org.nlp4j</groupId>
    <artifactId>nlp4j-stanford</artifactId>
    <version>[1.3.0.0,)</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.0.0</version>
    <scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
    <groupId>edu.stanford.nlp</groupId>
    <artifactId>stanford-corenlp</artifactId>
    <version>4.0.0</version>
    <classifier>models</classifier>
    <scope>provided</scope>
</dependency>

Example (with NLP4J)

package nlp4j.stanford.examples;

import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosAnnotator;

public class StanfordPosAnnotatorExample0 {

    public static void main(String[] args) throws Exception {
        Document doc = new DefaultDocument();
        {
            doc.putAttribute("text", "I eat sushi with chopsticks.");
        }
        StanfordPosAnnotator ann = new StanfordPosAnnotator();
        {
            ann.setProperty("target", "text");
        }
        ann.annotate(doc); // do annotation
        for (Keyword kwd : doc.getKeywords()) {
            System.err.println(kwd);
        }
    }
}

Output Example (with NLP4J)

I [facet=word.PRP, str=I]
eat [facet=word.VBP, str=eat]
sushi [facet=word.NN, str=sushi]
with [facet=word.IN, str=with]
chopstick [facet=word.NNS, str=chopsticks]
. [facet=word.., str=.]
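
As the output shows, each Keyword carries a facet of the form word.<POS tag>, the lemma, and the original surface string. The sketch below filters the keywords down to nouns; it is not from the original article, and the accessor names getFacet(), getLex() and getStr() are assumptions inferred from the printed output, as is the class name StanfordPosAnnotatorExample1.

package nlp4j.stanford.examples;

import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosAnnotator;

public class StanfordPosAnnotatorExample1 {

    public static void main(String[] args) throws Exception {
        Document doc = new DefaultDocument();
        doc.putAttribute("text", "I eat sushi with chopsticks.");

        StanfordPosAnnotator ann = new StanfordPosAnnotator();
        ann.setProperty("target", "text");
        ann.annotate(doc); // do annotation

        // Keep only noun keywords ("word.NN" / "word.NNS").
        // getFacet(), getLex() and getStr() are assumed accessors,
        // inferred from the printed output above.
        for (Keyword kwd : doc.getKeywords()) {
            if (kwd.getFacet().startsWith("word.NN")) {
                System.err.println(kwd.getLex() + " (surface: " + kwd.getStr() + ")");
            }
        }
    }
}

With the sample sentence this should print something like "sushi (surface: sushi)" and "chopstick (surface: chopsticks)".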
