Stanford NLPとは
スタンフォードNLPグループが提供する、主要な計算言語学の問題に対応した、統計NLP・深層学習NLP・ルールベースNLPのツール群です。
Software - The Stanford Natural Language Processing Group
https://nlp.stanford.edu/software/
項目 | 説明 |
---|---|
提供者 | スタンフォード大学 スタンフォードNLPグループ |
提供形式 | Java ライブラリ |
Example (without NLP4J)
package hello.stanford;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
public class HelloStanfordNlp001PosLemma {

    /**
     * Runs a Stanford CoreNLP pipeline (tokenize, ssplit, pos, lemma) over a short
     * English text, then dumps token-level and sentence-level annotations to stderr.
     */
    public static void main(String[] args) {
        String text = "This is test. He runs fast.";

        // Annotator list — see https://stanfordnlp.github.io/CoreNLP/annotators.html
        // tokenize : TokenizerAnnotator
        // ssplit   : WordsToSentencesAnnotator : splits a sequence of tokens into sentences
        // pos      : POSTaggerAnnotator : labels tokens with their POS tag
        // lemma    : MorphaAnnotator : generates the word lemmas for all tokens in the corpus
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma");

        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation = new Annotation(text);
        pipeline.annotate(annotation);

        // Token-level view: one <token> record per token, across all sentences.
        // Each CoreLabel also carries Value/Text/OriginalText/offset/index annotations;
        // only the sentence index, POS tag and lemma are printed here.
        for (CoreLabel token : annotation.get(TokensAnnotation.class)) {
            System.err.println("<token>");
            System.err.println("SentenceIndexAnnotation=" + token.get(SentenceIndexAnnotation.class));
            System.err.println("PartOfSpeechAnnotation=" + token.get(PartOfSpeechAnnotation.class));
            System.err.println("LemmaAnnotation=" + token.get(LemmaAnnotation.class));
            System.err.println("</token>");
        }

        System.err.println("---");

        // Whole-document text.
        System.err.println("TextAnnotation");
        System.err.println(annotation.get(TextAnnotation.class));

        // Sentence-level view: text, character offsets, tokens, and token span
        // for each detected sentence.
        System.err.println("SentencesAnnotation");
        for (CoreMap sentence : annotation.get(SentencesAnnotation.class)) {
            System.err.println("<sentence>");
            System.err.println("TextAnnotation=" + sentence.get(TextAnnotation.class));
            System.err.println(
                    "CharacterOffsetBeginAnnotation=" + sentence.get(CharacterOffsetBeginAnnotation.class));
            System.err.println("CharacterOffsetEndAnnotation=" + sentence.get(CharacterOffsetEndAnnotation.class));
            System.err.println("TokensAnnotation=" + sentence.get(TokensAnnotation.class));
            System.err.println("SentenceIndexAnnotation=" + sentence.get(SentenceIndexAnnotation.class));
            System.err.println("TokenBeginAnnotation=" + sentence.get(TokenBeginAnnotation.class));
            System.err.println("TokenEndAnnotation=" + sentence.get(TokenEndAnnotation.class));
            System.err.println("</sentence>");
        }
    }
}
# Output Example (without NLP4J)
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=DT
LemmaAnnotation=this
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=be
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=NN
LemmaAnnotation=test
</token>
<token>
SentenceIndexAnnotation=0
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=PRP
LemmaAnnotation=he
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=VBZ
LemmaAnnotation=run
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=RB
LemmaAnnotation=fast
</token>
<token>
SentenceIndexAnnotation=1
PartOfSpeechAnnotation=.
LemmaAnnotation=.
</token>
---
TextAnnotation
This is test. He runs fast.
SentencesAnnotation
<sentence>
TextAnnotation=This is test.
CharacterOffsetBeginAnnotation=0
CharacterOffsetEndAnnotation=13
TokensAnnotation=[This-1, is-2, test-3, .-4]
SentenceIndexAnnotation=0
TokenBeginAnnotation=0
TokenEndAnnotation=4
</sentence>
<sentence>
TextAnnotation=He runs fast.
CharacterOffsetBeginAnnotation=14
CharacterOffsetEndAnnotation=27
TokensAnnotation=[He-1, runs-2, fast-3, .-4]
SentenceIndexAnnotation=1
TokenBeginAnnotation=4
TokenEndAnnotation=8
</sentence>
# Maven Dependencies for NLP4J
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
<groupId>org.nlp4j</groupId>
<artifactId>nlp4j-core</artifactId>
<version>[1.3.1.0,)</version>
</dependency>
<dependency>
<groupId>org.nlp4j</groupId>
<artifactId>nlp4j-stanford</artifactId>
<version>[1.3.0.0,)</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>4.0.0</version>
<scope>provided</scope>
</dependency>
<!-- Model data for stanford-corenlp (same artifact, classifier "models") -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>4.0.0</version>
<classifier>models</classifier>
<scope>provided</scope>
</dependency>
# コード (with NLP4J)
package nlp4j.stanford.examples;
import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosAnnotator;
public class StanfordPosAnnotatorExample0 {

    /**
     * POS-tags an English sentence with NLP4J's StanfordPosAnnotator and prints
     * each extracted keyword to stderr.
     */
    public static void main(String[] args) throws Exception {
        // Input document: the annotator reads the "text" attribute set here.
        Document doc = new DefaultDocument();
        doc.putAttribute("text", "I eat sushi with chopsticks.");

        // Point the annotator at the "text" attribute of the document.
        StanfordPosAnnotator annotator = new StanfordPosAnnotator();
        annotator.setProperty("target", "text");

        annotator.annotate(doc); // do annotation

        // Each keyword carries the lemma, POS facet and surface string.
        for (Keyword kwd : doc.getKeywords()) {
            System.err.println(kwd);
        }
    }
}
# 結果 (with NLP4J)
I [facet=word.PRP, str=I]
eat [facet=word.VBP, str=eat]
sushi [facet=word.NN, str=sushi]
with [facet=word.IN, str=with]
chopstick [facet=word.NNS, str=chopsticks]
. [facet=word.., str=.]