3
2

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 1 year has passed since last update.

GiNZA を HTTP Server 化する (ginzaserver)

Last updated at Posted at 2021-04-19

目的 Objective

OSやランタイムプラットフォームにできるだけ依存しない分散自然言語処理システムを作るためです。複数台での処理やPython 以外のシステムとの連携がよりよくできます。

Mecab, Cabocha の動作環境を用意するのが面倒だなと思ったのもきっかけです。

前提

ginza が動作していること

>ginza
今日はいい天気です。
# text = 今日はいい天気です。
1       今日    今日    NOUN    名詞-普通名詞-副詞可能  _       4       nsubj   _       SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|NP_B|Reading=キョウ
2       は      は      ADP     助詞-係助詞     _       1       case    _       SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Reading=ハ
3       いい    いい    ADJ     形容詞-非自立可能       _       4       acl     _       SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=SEM_HEAD|Inf=形容詞,連体形-一般|Reading=イイ
4       天気    天気    NOUN    名詞-普通名詞-一般      _       0       root    _       SpaceAfter=No|BunsetuBILabel=B|BunsetuPositionType=ROOT|NP_B|Reading=テンキ
5       です    です    AUX     助動詞  _       4       cop     _       SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=SYN_HEAD|Inf=助動詞-デス,終止形-一般|Reading=デス
6       。      。      PUNCT   補助記号-句点   _       4       punct   _       SpaceAfter=No|BunsetuBILabel=I|BunsetuPositionType=CONT|Reading=。

確認済み動作環境

>python --version
Python 3.8.0

コード Code

# coding: utf-8

from http.server import HTTPServer
from http.server import BaseHTTPRequestHandler
from urllib.parse import urlparse
import urllib.parse
from ginza import *
import spacy
import json
from collections import defaultdict
import traceback

from socketserver import ThreadingMixIn
import threading

class GinzaHttpRequestHandler(BaseHTTPRequestHandler):
    """Answer ``GET /?text=...`` requests with a JSON dump of the parse.

    The handler calls ``self.nlp(text)`` to analyse the text; ``nlp`` is not
    defined here and must be attached to the class (or instance) before a
    request is handled. ``GinzaHttpServer.__init__`` does this.
    """

    def __init__(self, request, client_address, server):
        print("init request handler")
        super().__init__(request, client_address, server)

    def do_GET(self):
        """Analyse the ``text`` query parameter and write the result as JSON.

        Responses:
            200 -- JSON body, one list entry per sentence.
            404 -- the ``text`` query parameter is missing (or blank, since
                   ``parse_qs`` drops blank values by default).
            500 -- analysis or serialisation raised; the traceback is printed.
        """
        params = urllib.parse.parse_qs(urlparse(self.path).query)
        if "text" not in params:
            self.send_response(404)
            self.end_headers()
            return

        # parse_qs has already percent-decoded the value. Decoding it a
        # second time would mangle input that legitimately contains "%XX".
        text = params["text"][0]

        # Build the whole payload before any header is sent, so a failure
        # can still be reported with a clean 500 status line.
        try:
            payload = json.dumps(self._analyze(text)).encode()
        except Exception:
            print(traceback.format_exc())
            self.send_response(500)
            # Without end_headers() the buffered status line is never
            # flushed and the client receives an empty reply.
            self.end_headers()
            return

        self.send_response(200)
        self.send_header("Content-type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def _analyze(self, text):
        """Run ``self.nlp`` on *text* and return a JSON-serialisable list.

        Each element has the form
        ``{"paragraphs": {"raw": <sentence text>,
                          "sentences": {"tokens": [<token dict>, ...]}}}``.
        """
        doc = self.nlp(text)
        result = []
        for sent in doc.sents:
            tokens = [
                {
                    "id": token.i,
                    "orth": token.orth_,
                    "tag": token.tag_,
                    "pos": token.pos_,
                    "lemma": token.lemma_,
                    "head": token.head.i,
                    "dep": token.dep_,
                }
                for token in sent
            ]
            result.append(
                {
                    "paragraphs": {
                        "raw": sent.text,
                        "sentences": {"tokens": tokens},
                    }
                }
            )
        return result
class GinzaHttpServer(ThreadingMixIn, HTTPServer):
    """HTTP server (with ThreadingMixIn) that loads the "ja_ginza" model once.

    The loaded pipeline is stored on the handler class as ``nlp`` so that
    handler instances can reach it as ``self.nlp``.
    """

    def __init__(self, address, handlerClass=GinzaHttpRequestHandler):
        print("init GinzaHttpServer")
        # Load the model before the base class sets up the server, and
        # publish it as a class attribute of the handler.
        pipeline = spacy.load("ja_ginza")
        setattr(handlerClass, "nlp", pipeline)
        super(GinzaHttpServer, self).__init__(address, handlerClass)

def main(ip='127.0.0.1', port=8888):
    """Start the GiNZA HTTP server and serve until interrupted.

    Args:
        ip: Address to bind to. Defaults to the loopback address.
        port: TCP port to listen on. Defaults to 8888.

    Calling ``main()`` with no arguments behaves exactly as before.
    """
    server = GinzaHttpServer((ip, port), GinzaHttpRequestHandler)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server; exit quietly.
        pass
    finally:
        # Always close the server, however the loop ended.
        server.server_close()

# Start the server only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

呼び出し例

下記のURLにアクセスします。

http://localhost:8888/?text=今日はいい天気です。

呼び出し結果例

[{"paragraphs": {"raw": "\u4eca\u65e5\u306f\u3044\u3044\u5929\u6c17\u3067\u3059\u3002", "sentences": {"tokens": [{"id": 0, "orth": "\u4eca\u65e5", "tag": "\u540d\u8a5e-\u666e\u901a\u540d\u8a5e-\u526f\u8a5e\u53ef\u80fd", "pos": "NOUN", "lemma": "\u4eca\u65e5", "head": 3, "dep": "nsubj"}, {"id": 1, "orth": "\u306f", "tag": "\u52a9\u8a5e-\u4fc2\u52a9\u8a5e", "pos": "ADP", "lemma": "\u306f", "head": 0, "dep": "case"}, {"id": 2, "orth": "\u3044\u3044", "tag": "\u5f62\u5bb9\u8a5e-\u975e\u81ea\u7acb\u53ef\u80fd", "pos": "ADJ", "lemma": "\u3044\u3044", "head": 3, "dep": "acl"}, {"id": 3, "orth": "\u5929\u6c17", "tag": "\u540d\u8a5e-\u666e\u901a\u540d\u8a5e-\u4e00\u822c", "pos": "NOUN", "lemma": "\u5929\u6c17", "head": 3, "dep": "ROOT"}, {"id": 4, "orth": "\u3067\u3059", "tag": "\u52a9\u52d5\u8a5e", "pos": "AUX", "lemma": "\u3067\u3059", "head": 3, "dep": "cop"}, {"id": 5, "orth": "\u3002", "tag": "\u88dc\u52a9\u8a18\u53f7-\u53e5\u70b9", "pos": "PUNCT", "lemma": "\u3002", "head": 3, "dep": "punct"}]}}}, [{"id": 0, "orth": "\u4eca\u65e5", "tag": "\u540d\u8a5e-\u666e\u901a\u540d\u8a5e-\u526f\u8a5e\u53ef\u80fd", "pos": "NOUN", "lemma": "\u4eca\u65e5", "head": 3, "dep": "nsubj"}, {"id": 1, "orth": "\u306f", "tag": "\u52a9\u8a5e-\u4fc2\u52a9\u8a5e", "pos": "ADP", "lemma": "\u306f", "head": 0, "dep": "case"}, {"id": 2, "orth": "\u3044\u3044", "tag": "\u5f62\u5bb9\u8a5e-\u975e\u81ea\u7acb\u53ef\u80fd", "pos": "ADJ", "lemma": "\u3044\u3044", "head": 3, "dep": "acl"}, {"id": 3, "orth": "\u5929\u6c17", "tag": "\u540d\u8a5e-\u666e\u901a\u540d\u8a5e-\u4e00\u822c", "pos": "NOUN", "lemma": "\u5929\u6c17", "head": 3, "dep": "ROOT"}, {"id": 4, "orth": "\u3067\u3059", "tag": "\u52a9\u52d5\u8a5e", "pos": "AUX", "lemma": "\u3067\u3059", "head": 3, "dep": "cop"}, {"id": 5, "orth": "\u3002", "tag": "\u88dc\u52a9\u8a18\u53f7-\u53e5\u70b9", "pos": "PUNCT", "lemma": "\u3002", "head": 3, "dep": "punct"}]]

↑だとわかりにくいので見やすくしたのが以下のものです。

image.png

[
    {
        "paragraphs": [
            {
                "sentences": [
                    {
                        "tokens": [
                            {
                                "id": 0,
                                "orth": "今日",
                                "tag": "名詞-普通名詞-副詞可能",
                                "pos": "NOUN",
                                "lemma": "今日",
                                "head": 3,
                                "dep": "obl"
                            },
                            {
                                "id": 1,
                                "orth": "は",
                                "tag": "助詞-係助詞",
                                "pos": "ADP",
                                "lemma": "は",
                                "head": 0,
                                "dep": "case"
                            },
                            {
                                "id": 2,
                                "orth": "いい",
                                "tag": "形容詞-非自立可能",
                                "pos": "ADJ",
                                "lemma": "いい",
                                "head": 3,
                                "dep": "acl"
                            },
                            {
                                "id": 3,
                                "orth": "天気",
                                "tag": "名詞-普通名詞-一般",
                                "pos": "NOUN",
                                "lemma": "天気",
                                "head": 3,
                                "dep": "ROOT"
                            },
                            {
                                "id": 4,
                                "orth": "です",
                                "tag": "助動詞",
                                "pos": "AUX",
                                "lemma": "です",
                                "head": 3,
                                "dep": "cop"
                            },
                            {
                                "id": 5,
                                "orth": "。",
                                "tag": "補助記号-句点",
                                "pos": "PUNCT",
                                "lemma": "。",
                                "head": 3,
                                "dep": "punct"
                            }
                        ]
                    }
                ],
                "raw": "今日はいい天気です。"
            }
        ]
    }
]

PIP化

以下コマンドでインストールできるようにしました。

pip install git+https://github.com/oyahiroki/ginzaserver

以下のコマンドで起動します。

ginzaserver

以下URLにアクセスするとJSONが返ります。PORTは固定です。すみません。

http://localhost:8888/?text=今日はいい天気です。

NLP4J

NLP4J プロジェクトのコンポーネントとして利用する予定です。

NLP4J Index

参考 Reference

GitHub - ginzaserver

GiNZA - Japanese NLP Library | Universal Dependenciesに基づくオープンソース日本語NLPライブラリ
https://megagonlabs.github.io/ginza/

3
2
3

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
3
2

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?