LoginSignup
1
0

More than 1 year has passed since last update.

NLP4J - Java で 英語の構文解析(Stanford NLPを利用)

Last updated at Posted at 2021-07-18

NLP4J Index

Stanford NLPとは

スタンフォードNLPグループが提供する、計算言語学の主要な問題に対応した、統計的NLP・深層学習NLP・ルールベースNLPのツール群です。

Software - The Stanford Natural Language Processing Group
https://nlp.stanford.edu/software/

項目 説明
提供者 スタンフォード大学 スタンフォードNLPグループ
提供形式 Java ライブラリ

#Example without NLP4J

package hello.stanford;

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.IndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.OriginalTextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * Minimal Stanford CoreNLP demo: runs a dependency-parsing pipeline over one
 * hard-coded English sentence and dumps the resulting basic dependency graph
 * to standard error.
 */
public class HelloStanfordNLP {
	/**
	 * Builds a pipeline with the annotators
	 * {@code tokenize, ssplit, pos, lemma, depparse}, annotates the sample text,
	 * and for every sentence prints each word (starting from the graph's first
	 * root) followed by the graph's own string form wrapped in
	 * {@code <graph>} markers.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Properties pipelineConfig = new Properties();
		pipelineConfig.setProperty("annotators", "tokenize, ssplit, pos, lemma, depparse");
		StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineConfig);

		Annotation document = new Annotation(
				"A natural language parser is a program that works out the grammatical structure of sentences");
		pipeline.annotate(document);

		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
		for (CoreMap sentence : sentences) {
			SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
			printWord(dependencies.getFirstRoot(), dependencies, 0);
			System.err.println("<graph>");
			System.err.println(dependencies.toString());
			System.err.println("</graph>");
		}
	}

	/**
	 * Prints one {@code <word>} record for the given word and then recurses
	 * into its children, so a parent is always printed before its dependents.
	 *
	 * @param word  the graph node to print
	 * @param graph the dependency graph used to look up the node's children
	 * @param tab   the depth of {@code word}; children are printed with
	 *              {@code tab + 1}
	 */
	public static void printWord(IndexedWord word, SemanticGraph graph, int tab) {
		System.err.println("<word>");
		printField("depth", tab);
		printField("TextAnnotation", word.get(TextAnnotation.class));
		printField("OriginalTextAnnotation", word.get(OriginalTextAnnotation.class));
		printField("CharacterOffsetBeginAnnotation", word.get(CharacterOffsetBeginAnnotation.class));
		printField("CharacterOffsetEndAnnotation", word.get(CharacterOffsetEndAnnotation.class));
		printField("IndexAnnotation", word.get(IndexAnnotation.class));
		printField("SentenceIndexAnnotation", word.get(SentenceIndexAnnotation.class));
		printField("PartOfSpeechAnnotation", word.get(PartOfSpeechAnnotation.class));
		System.err.println("</word>");

		for (IndexedWord child : graph.getChildList(word)) {
			printWord(child, graph, tab + 1);
		}
	}

	/** Writes a single {@code label:value} line to standard error. */
	private static void printField(String label, Object value) {
		System.err.println(label + ":" + value);
	}
}

#Output without NLP4J

<word>
depth:0
TextAnnotation:program
OriginalTextAnnotation:program
CharacterOffsetBeginAnnotation:31
CharacterOffsetEndAnnotation:38
IndexAnnotation:7
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:NN
</word>
<word>
depth:1
TextAnnotation:parser
OriginalTextAnnotation:parser
CharacterOffsetBeginAnnotation:19
CharacterOffsetEndAnnotation:25
IndexAnnotation:4
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:NN
</word>
<word>
depth:2
TextAnnotation:A
OriginalTextAnnotation:A
CharacterOffsetBeginAnnotation:0
CharacterOffsetEndAnnotation:1
IndexAnnotation:1
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:DT
</word>
<word>
depth:2
TextAnnotation:natural
OriginalTextAnnotation:natural
CharacterOffsetBeginAnnotation:2
CharacterOffsetEndAnnotation:9
IndexAnnotation:2
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:JJ
</word>
<word>
depth:2
TextAnnotation:language
OriginalTextAnnotation:language
CharacterOffsetBeginAnnotation:10
CharacterOffsetEndAnnotation:18
IndexAnnotation:3
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:NN
</word>
<word>
depth:1
TextAnnotation:is
OriginalTextAnnotation:is
CharacterOffsetBeginAnnotation:26
CharacterOffsetEndAnnotation:28
IndexAnnotation:5
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:VBZ
</word>
<word>
depth:1
TextAnnotation:a
OriginalTextAnnotation:a
CharacterOffsetBeginAnnotation:29
CharacterOffsetEndAnnotation:30
IndexAnnotation:6
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:DT
</word>
<word>
depth:1
TextAnnotation:works
OriginalTextAnnotation:works
CharacterOffsetBeginAnnotation:44
CharacterOffsetEndAnnotation:49
IndexAnnotation:9
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:VBZ
</word>
<word>
depth:2
TextAnnotation:that
OriginalTextAnnotation:that
CharacterOffsetBeginAnnotation:39
CharacterOffsetEndAnnotation:43
IndexAnnotation:8
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:WDT
</word>
<word>
depth:2
TextAnnotation:out
OriginalTextAnnotation:out
CharacterOffsetBeginAnnotation:50
CharacterOffsetEndAnnotation:53
IndexAnnotation:10
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:RP
</word>
<word>
depth:2
TextAnnotation:structure
OriginalTextAnnotation:structure
CharacterOffsetBeginAnnotation:70
CharacterOffsetEndAnnotation:79
IndexAnnotation:13
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:NN
</word>
<word>
depth:3
TextAnnotation:the
OriginalTextAnnotation:the
CharacterOffsetBeginAnnotation:54
CharacterOffsetEndAnnotation:57
IndexAnnotation:11
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:DT
</word>
<word>
depth:3
TextAnnotation:grammatical
OriginalTextAnnotation:grammatical
CharacterOffsetBeginAnnotation:58
CharacterOffsetEndAnnotation:69
IndexAnnotation:12
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:JJ
</word>
<word>
depth:3
TextAnnotation:sentences
OriginalTextAnnotation:sentences
CharacterOffsetBeginAnnotation:83
CharacterOffsetEndAnnotation:92
IndexAnnotation:15
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:NNS
</word>
<word>
depth:4
TextAnnotation:of
OriginalTextAnnotation:of
CharacterOffsetBeginAnnotation:80
CharacterOffsetEndAnnotation:82
IndexAnnotation:14
SentenceIndexAnnotation:0
PartOfSpeechAnnotation:IN
</word>
<graph>
-> program/NN (root)
  -> parser/NN (nsubj)
    -> A/DT (det)
    -> natural/JJ (amod)
    -> language/NN (compound)
  -> is/VBZ (cop)
  -> a/DT (det)
  -> works/VBZ (acl:relcl)
    -> that/WDT (nsubj)
    -> out/RP (compound:prt)
    -> structure/NN (dobj)
      -> the/DT (det)
      -> grammatical/JJ (amod)
      -> sentences/NNS (nmod)
        -> of/IN (case)

</graph>

#Maven

<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-core -->
<dependency>
	<groupId>org.nlp4j</groupId>
	<artifactId>nlp4j-core</artifactId>
	<version>[1.3.1.0,)</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.nlp4j/nlp4j-stanford -->
<dependency>
	<groupId>org.nlp4j</groupId>
	<artifactId>nlp4j-stanford</artifactId>
	<version>[1.3.0.0,)</version>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.0.0</version>
	<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp -->
<dependency>
	<groupId>edu.stanford.nlp</groupId>
	<artifactId>stanford-corenlp</artifactId>
	<version>4.0.0</version>
	<classifier>models</classifier>
	<scope>provided</scope>
</dependency>

コード

package nlp4j.stanford.examples;

import nlp4j.Document;
import nlp4j.Keyword;
import nlp4j.KeywordWithDependency;
import nlp4j.impl.DefaultDocument;
import nlp4j.stanford.StanfordPosDependencyAnnotator;

/**
 * Example showing how to run {@link StanfordPosDependencyAnnotator} on a
 * single English sentence through NLP4J and print the resulting dependency
 * keywords as XML.
 */
public class StanfordPosDependencyAnnotatorExample0 {
	/**
	 * Stores the sample sentence in the document's {@code "text"} attribute,
	 * points the annotator at that attribute via its {@code "target"}
	 * property, annotates the document, and writes the XML form of every
	 * {@link KeywordWithDependency} to standard error.
	 *
	 * @param args unused
	 * @throws Exception propagated from the annotator calls
	 */
	public static void main(String[] args) throws Exception {
		StanfordPosDependencyAnnotator annotator = new StanfordPosDependencyAnnotator();
		Document document = new DefaultDocument();
		document.putAttribute("text", "I eat sushi with chopsticks.");
		annotator.setProperty("target", "text");
		annotator.annotate(document);

		for (Keyword keyword : document.getKeywords()) {
			// Only keywords that carry dependency information are printed.
			if (!(keyword instanceof KeywordWithDependency)) {
				continue;
			}
			KeywordWithDependency withDependency = (KeywordWithDependency) keyword;
			System.err.println(withDependency.toStringAsXml());
		}
	}
}

#結果

<?xml version="1.0" encoding="UTF-8"?>
<w begin="2" depth="0" end="5" facet="VBP" id="0" lex="eat" relation="root" sequence="0" str="eat">
    <w begin="0" depth="1" end="1" facet="PRP" id="1" lex="I" relation="nsubj" sequence="1" str="I"/>
    <w begin="6" depth="1" end="11" facet="NN" id="2" lex="sushi" relation="obj" sequence="2" str="sushi"/>
    <w begin="17" depth="1" end="27" facet="NNS" id="3" lex="chopstick" relation="obl" sequence="3" str="chopsticks">
        <w begin="12" depth="2" end="16" facet="IN" id="4" lex="with" relation="case" sequence="4" str="with"/>
    </w>
    <w begin="27" depth="1" end="28" facet="." id="5" lex="." relation="punct" sequence="5" str="."/>
</w>

NLP4J Index

1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0