NLP4J - English Morphological Analysis in Java (Using Stanford NLP)

Posted at 2021-07-18

# What is Stanford NLP?

A set of statistical NLP, deep learning NLP, and rule-based NLP tools for major problems in computational linguistics, provided by the Stanford NLP Group.

Software - The Stanford Natural Language Processing Group
https://nlp.stanford.edu/software/

| Item | Description |
|---|---|
| Provider | Stanford NLP Group, Stanford University |
| Distribution format | Java library |
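
The example in the next section calls Stanford CoreNLP directly (without NLP4J), so the CoreNLP jar and its English models need to be on the classpath. A minimal Maven sketch, assuming version 4.0.0 (the same version used in the NLP4J dependency section later in this article):

```xml
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.0.0</version>
</dependency>
<!-- English models, required by the pos and lemma annotators -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.0.0</version>
	<classifier>models</classifier>
</dependency>
```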

# Example (without NLP4J)

```java
package hello.stanford;

import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class HelloStanfordNlp001PosLemma {
	public static void main(String[] args) {
		String text = "This is test. He runs fast.";
		Properties properties = new Properties();
		// "tokenize, ssplit, pos, lemma, depparse"
		// https://stanfordnlp.github.io/CoreNLP/annotators.html
		// tokenize : TokenizerAnnotator
		// ssplit : WordsToSentencesAnnotator : Splits a sequence of tokens into
		// sentences.
		// pos : POSTaggerAnnotator : Labels tokens with their POS tag.
		// lemma : headword / dictionary form
		// lemma : MorphaAnnotator : Generates the word lemmas for all tokens in the
		// corpus.
		properties.setProperty("annotators", "tokenize, ssplit, pos, lemma");
		StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);
		Annotation annotation = new Annotation(text);
		coreNLP.annotate(annotation);
		{
			List<CoreLabel> cl = annotation.get(TokensAnnotation.class);
			for (CoreLabel label : cl) {
				System.err.println("<token>");
				// [class edu.stanford.nlp.ling.CoreAnnotations$ValueAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$OriginalTextAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$BeforeAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$AfterAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$IsNewlineAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$IndexAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$PartOfSpeechAnnotation,
				// class edu.stanford.nlp.ling.CoreAnnotations$LemmaAnnotation]
				System.err.println("SentenceIndexAnnotation=" + label.get(SentenceIndexAnnotation.class));
				System.err.println("PartOfSpeechAnnotation=" + label.get(PartOfSpeechAnnotation.class));
				System.err.println("LemmaAnnotation=" + label.get(LemmaAnnotation.class));
				System.err.println("</token>");
			}
		}
		System.err.println("---");
		System.err.println("TextAnnotation");
		// class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
		{
			System.err.println(annotation.get(TextAnnotation.class));
		}
		// class edu.stanford.nlp.ling.CoreAnnotations$SentencesAnnotation
		System.err.println("SentencesAnnotation");

		{
			List<CoreMap> sentenceMap = annotation.get(SentencesAnnotation.class);

			for (CoreMap label : sentenceMap) {
				System.err.println("<sentence>");
				// [class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation,
				System.err.println("TextAnnotation=" + label.get(TextAnnotation.class));
				// class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation,
				System.err.println(
							"CharacterOffsetBeginAnnotation=" + label.get(CharacterOffsetBeginAnnotation.class));
				// class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation,
				System.err.println("CharacterOffsetEndAnnotation=" + label.get(CharacterOffsetEndAnnotation.class));
				// class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation,
				System.err.println("TokensAnnotation=" + label.get(TokensAnnotation.class));
				// class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation,
				System.err.println("SentenceIndexAnnotation=" + label.get(SentenceIndexAnnotation.class));
				// class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation,
				System.err.println("TokenBeginAnnotation=" + label.get(TokenBeginAnnotation.class));
				// class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation]
				System.err.println("TokenEndAnnotation=" + label.get(TokenEndAnnotation.class));
				System.err.println("</sentence>");
			}
		}
	}
}
```

# Output Example (without NLP4J)

```
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=DT
LemmaAnnotation=this
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=be
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=NN
LemmaAnnotation=test
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=PRP
LemmaAnnotation=he
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=run
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=RB
LemmaAnnotation=fast
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
---
TextAnnotation
This is test. He runs fast.
SentencesAnnotation
<sentence>
TextAnnotation=This is test.
CharacterOffsetBeginAnnotation=0
CharacterOffsetEndAnnotation=13
TokensAnnotation=[This-1, is-2, test-3, .-4]
SentenceIndexAnnotation=0
TokenBeginAnnotation=0
TokenEndAnnotation=4
</sentence>
<sentence>
TextAnnotation=He runs fast.
CharacterOffsetBeginAnnotation=14
CharacterOffsetEndAnnotation=27
TokensAnnotation=[He-1, runs-2, fast-3, .-4]
SentenceIndexAnnotation=1
TokenBeginAnnotation=4
TokenEndAnnotation=8
</sentence>
```
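
The document-level token list and the per-sentence view can also be combined: each sentence CoreMap carries its own TokensAnnotation, so tokens can be grouped by sentence. A minimal sketch built from the same annotators and annotation classes as the example above (the class name is arbitrary):

```java
package hello.stanford;

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class HelloStanfordNlp002TokensPerSentence {
	public static void main(String[] args) {
		Properties properties = new Properties();
		properties.setProperty("annotators", "tokenize, ssplit, pos, lemma");
		StanfordCoreNLP coreNLP = new StanfordCoreNLP(properties);

		Annotation annotation = new Annotation("This is test. He runs fast.");
		coreNLP.annotate(annotation);

		// Iterate over sentences, then over the tokens belonging to each sentence
		for (CoreMap sentence : annotation.get(SentencesAnnotation.class)) {
			System.err.println("sentence " + sentence.get(SentenceIndexAnnotation.class));
			for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
				System.err.println("  " + token.get(TextAnnotation.class) //
						+ " / " + token.get(PartOfSpeechAnnotation.class) //
						+ " / " + token.get(LemmaAnnotation.class));
			}
		}
	}
}
```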

# Maven Dependencies for NLP4J

```xml
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
	<groupId>org.nlp4j</groupId>
	<artifactId>nlp4j-core</artifactId>
	<version>[1.3.1.0,)</version>
</dependency>
<dependency>
	<groupId>org.nlp4j</groupId>
	<artifactId>nlp4j-stanford</artifactId>
	<version>[1.3.0.0,)</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.0.0</version>
	<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.0.0</version>
	<classifier>models</classifier>
	<scope>provided</scope>
</dependency>
```
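
Note that the two Stanford CoreNLP artifacts above are declared with `provided` scope: they are available at compile time but are not packaged with your build, so when the application actually runs you need to supply stanford-corenlp (and its models classifier) on the runtime classpath yourself, or remove the `<scope>provided</scope>` elements.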

# Example (with NLP4J)

```java
package nlp4j.stanford.examples;

import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosAnnotator;

public class StanfordPosAnnotatorExample0 {

	public static void main(String[] args) throws Exception {
		Document doc = new DefaultDocument();
		{
			doc.putAttribute("text", "I eat sushi with chopsticks.");
		}
		StanfordPosAnnotator ann = new StanfordPosAnnotator();
		{
			ann.setProperty("target", "text");
		}
		ann.annotate(doc); // do annotation
		for (Keyword kwd : doc.getKeywords()) {
			System.err.println(kwd);
		}
	}
}
```

# Output Example (with NLP4J)

```
I [facet=word.PRP, str=I]
eat [facet=word.VBP, str=eat]
sushi [facet=word.NN, str=sushi]
with [facet=word.IN, str=with]
chopstick [facet=word.NNS, str=chopsticks]
. [facet=word.., str=.]
```
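
Each output line is one Keyword: the leading string is the lemma (for example, chopsticks is normalized to chopstick), `facet` is `word.` followed by the Penn Treebank POS tag assigned by Stanford CoreNLP, and `str` is the surface form as it appeared in the input text.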
