目的 Objective
OSやランタイムプラットフォームにできるだけ依存しない分散自然言語処理システムを作るためです。GiNZA をHTTPサーバーとして公開することで、複数台での処理や Python 以外のシステムとの連携がしやすくなります。
MeCab や CaboCha の動作環境を用意するのが面倒だと感じたのもきっかけです。
前提
ginza が動作していること
>ginza
今日はいい天気です。
# text = 今日はいい天気です。
1 今日 今日 NOUN 名詞-普通名詞-副詞可能 _ 4 nsubj _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=キョウ
2 は は ADP 助詞-係助詞 _ 1 case _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ハ
3 いい いい ADJ 形容詞-非自立可能 _ 4 acl _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|Inf=形容詞,連体形-一般|Reading=イイ
4 天気 天気 NOUN 名詞-普通名詞-一般 _ 0 root _ SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=ROOT|NP_B|Reading=テンキ
5 です です AUX 助動詞 _ 4 cop _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-デス,終止形-一般|Reading=デス
6 。 。 PUNCT 補助記号-句点 _ 4 punct _ SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。
確認済み動作環境
>python --version
Python 3.8.0
コード Code
# coding: utf-8
from http.server import HTTPServer
from http.server import BaseHTTPRequestHandler
from urllib.parse import urlparse
import urllib.parse
from ginza import *
import spacy
import json
from collections import defaultdict
import traceback
from socketserver import ThreadingMixIn
import threading
class GinzaHttpRequestHandler(BaseHTTPRequestHandler):
    """HTTP handler that analyzes the ``text`` query parameter and returns JSON.

    The handler calls ``self.nlp(text)`` to obtain a parsed document; the
    ``nlp`` attribute is not defined here and must be attached to the handler
    class before requests are served (``GinzaHttpServer`` does this).
    """

    def __init__(self, request, client_address, server):
        print("init request handler")
        BaseHTTPRequestHandler.__init__(self, request, client_address, server)

    def do_GET(self):
        """Handle ``GET /?text=...``.

        Responses:
            404 -- the query string has no ``text`` parameter.
            500 -- analysis or JSON serialization raised an exception
                   (the traceback is printed to stdout).
            200 -- JSON list with one object per sentence, shaped as
                   ``{"paragraphs": {"raw": ..., "sentences": {"tokens": [...]}}}``.
        """
        query = urlparse(self.path).query
        params = urllib.parse.parse_qs(query)
        if "text" not in params:
            self.send_response(404)
            self.end_headers()
            return

        # parse_qs has already percent-decoded the value. Decoding it a second
        # time would corrupt input that contains a literal '%' sequence.
        text = params["text"][0]

        # Build the whole payload before sending any status line, so that a
        # failure can still be reported as a clean 500 response.
        try:
            body = json.dumps(self._analyze(text)).encode()
        except Exception:
            print(traceback.format_exc())
            self.send_response(500)
            # Without end_headers() the buffered status line is never flushed
            # and the client receives an empty response.
            self.end_headers()
            return

        self.send_response(200)
        self.send_header("Content-type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _analyze(self, text):
        """Run ``self.nlp`` on *text* and return the JSON-serializable result."""
        doc = self.nlp(text)
        return [
            {
                "paragraphs": {
                    "raw": sent.text,
                    "sentences": {
                        "tokens": [self._token_to_dict(token) for token in sent],
                    },
                },
            }
            for sent in doc.sents
        ]

    @staticmethod
    def _token_to_dict(token):
        """Convert one parsed token into the dict emitted in the response."""
        return {
            "id": token.i,
            "orth": token.orth_,
            "tag": token.tag_,
            "pos": token.pos_,
            "lemma": token.lemma_,
            "head": token.head.i,
            "dep": token.dep_,
        }
class GinzaHttpServer(ThreadingMixIn, HTTPServer):
    """HTTP server (with ``ThreadingMixIn``) that serves GiNZA analyses.

    On construction the ``ja_ginza`` spaCy model is loaded and stored on the
    handler class as ``nlp``, so every handler instance reads the same
    pipeline object.
    """

    def __init__(self, address, handlerClass=GinzaHttpRequestHandler):
        """Load the model, attach it to *handlerClass*, then bind to *address*.

        Args:
            address: ``(host, port)`` tuple passed to the base server.
            handlerClass: request handler class; receives the ``nlp`` attribute.
        """
        print("init GinzaHttpServer")
        pipeline = spacy.load("ja_ginza")
        setattr(handlerClass, "nlp", pipeline)
        super().__init__(address, handlerClass)
def main(ip='127.0.0.1', port=8888):
    """Start the GiNZA HTTP server and serve until interrupted.

    Args:
        ip: address to bind to. Defaults to the loopback address.
        port: TCP port to listen on. Defaults to 8888.

    Calling ``main()`` with no arguments behaves as before; the parameters
    only allow embedding code to choose a different bind address or port.
    """
    server = GinzaHttpServer((ip, port), GinzaHttpRequestHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server; exit quietly.
        pass
    finally:
        server.server_close()


if __name__ == '__main__':
    main()
呼び出し例
下記のURLにアクセスします。
http://localhost:8888/?text=今日はいい天気です。
呼び出し結果例
[{"paragraphs": {"raw": "\u4eca\u65e5\u306f\u3044\u3044\u5929\u6c17\u3067\u3059\u3002", "sentences": {"tokens": [{"id": 0, "orth": "\u4eca\u65e5", "tag": "\u540d\u8a5e-\u666e\u901a\u540d\u8a5e-\u526f\u8a5e\u53ef\u80fd", "pos": "NOUN", "lemma": "\u4eca\u65e5", "head": 3, "dep": "nsubj"}, {"id": 1, "orth": "\u306f", "tag": "\u52a9\u8a5e-\u4fc2\u52a9\u8a5e", "pos": "ADP", "lemma": "\u306f", "head": 0, "dep": "case"}, {"id": 2, "orth": "\u3044\u3044", "tag": "\u5f62\u5bb9\u8a5e-\u975e\u81ea\u7acb\u53ef\u80fd", "pos": "ADJ", "lemma": "\u3044\u3044", "head": 3, "dep": "acl"}, {"id": 3, "orth": "\u5929\u6c17", "tag": "\u540d\u8a5e-\u666e\u901a\u540d\u8a5e-\u4e00\u822c", "pos": "NOUN", "lemma": "\u5929\u6c17", "head": 3, "dep": "ROOT"}, {"id": 4, "orth": "\u3067\u3059", "tag": "\u52a9\u52d5\u8a5e", "pos": "AUX", "lemma": "\u3067\u3059", "head": 3, "dep": "cop"}, {"id": 5, "orth": "\u3002", "tag": "\u88dc\u52a9\u8a18\u53f7-\u53e5\u70b9", "pos": "PUNCT", "lemma": "\u3002", "head": 3, "dep": "punct"}]}}}]
上記のままでは読みにくいので、整形して Unicode エスケープを元の文字に戻したものを以下に示します。
[
  {
    "paragraphs": {
      "raw": "今日はいい天気です。",
      "sentences": {
        "tokens": [
          {
            "id": 0,
            "orth": "今日",
            "tag": "名詞-普通名詞-副詞可能",
            "pos": "NOUN",
            "lemma": "今日",
            "head": 3,
            "dep": "nsubj"
          },
          {
            "id": 1,
            "orth": "は",
            "tag": "助詞-係助詞",
            "pos": "ADP",
            "lemma": "は",
            "head": 0,
            "dep": "case"
          },
          {
            "id": 2,
            "orth": "いい",
            "tag": "形容詞-非自立可能",
            "pos": "ADJ",
            "lemma": "いい",
            "head": 3,
            "dep": "acl"
          },
          {
            "id": 3,
            "orth": "天気",
            "tag": "名詞-普通名詞-一般",
            "pos": "NOUN",
            "lemma": "天気",
            "head": 3,
            "dep": "ROOT"
          },
          {
            "id": 4,
            "orth": "です",
            "tag": "助動詞",
            "pos": "AUX",
            "lemma": "です",
            "head": 3,
            "dep": "cop"
          },
          {
            "id": 5,
            "orth": "。",
            "tag": "補助記号-句点",
            "pos": "PUNCT",
            "lemma": "。",
            "head": 3,
            "dep": "punct"
          }
        ]
      }
    }
  }
]
PIP化
以下コマンドでインストールできるようにしました。
pip install git+https://github.com/oyahiroki/ginzaserver
以下のコマンドで起動します。
ginzaserver
以下の URL にアクセスすると JSON が返ります。ポート番号は 8888 で固定です。すみません。
http://localhost:8888/?text=今日はいい天気です。
NLP4J
NLP4J プロジェクトのコンポーネントとして利用する予定です。
参考 Reference
GitHub - ginzaserver
GiNZA - Japanese NLP Library | Universal Dependenciesに基づくオープンソース日本語NLPライブラリ
https://megagonlabs.github.io/ginza/