More than 3 years have passed since last update.

言語処理100本ノック(2020): 47

Posted at 2020-09-30
## 47. 機能動詞構文のマイニング[Permalink](https://nlp100.github.io/ja/ch05.html#47-機能動詞構文のマイニング)


- 「サ変接続名詞+を(助詞)」で構成される文節が動詞に係る場合のみを対象とする
- 述語は「サ変接続名詞+を+動詞の基本形」とし,文節中に複数の動詞があるときは,最左の動詞を用いる
- 述語に係る助詞(文節)が複数あるときは,すべての助詞をスペース区切りで辞書順に並べる
- 述語に係る文節が複数ある場合は,すべての項をスペース区切りで並べる(助詞の並び順と揃えよ)


    学習を行う	に を	元に 経験を

from collections import defaultdict
from typing import List

def read_file(fpath: str) -> List[List[str]]:
    """Read a CaboCha lattice-format file and split it into sentences.

    The file content is split on the literal text ``"EOS\\n"``. Each piece is
    stripped of surrounding whitespace, pieces that end up empty (for example
    between two consecutive ``EOS`` markers) are dropped, and the remaining
    pieces are split into lines.

    Args:
        fpath (str): File path. The file is read as UTF-8 text.

    Returns:
        List[List[str]]: List of sentences; each sentence is a list of raw
            lines (chunk header lines starting with ``*`` and morpheme lines).
            e.g. result[1]:
                ['* 0 2D 0/0 -0.764522',
                 '* 1 2D 0/1 -0.764522',
                 '* 2 -1D 0/2 0.000000',
                 ...]
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        text = f.read()

    # Strip each block once, then drop the empty ones.
    blocks = (block.strip() for block in text.split("EOS\n"))
    return [block.split("\n") for block in blocks if block != ""]

class Morph:
    """Morphological information for a single token.

    Args:
        data (dict): Mapping that provides the keys ``surface``, ``base``,
            ``pos`` and ``pos1``.

    Attributes:
        surface (str): Surface form.
        base (str): Base (dictionary) form.
        pos (str): Part of speech.
        pos1 (str): Part-of-speech subcategory 1.
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )

class Chunk:
    """A clause/phrase chunk (bunsetsu) of a dependency-parsed sentence.

    Args:
        chunk_id (str): The number of the chunk within its sentence.
        dst (str): The index of the dependency target chunk.

    Attributes:
        id (str): The number of the chunk within its sentence.
        morphs (List[Morph]): Morphemes belonging to the chunk; starts empty.
        dst (str): The index of the dependency target chunk.
        srcs (List[str]): Indices of the dependency source chunks; starts
            empty and is filled in by the code that builds the chunk list.
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )

    def get_surface(self) -> str:
        """Concatenate the surfaces of the chunk's morphemes, skipping symbols.

        Morphemes whose ``pos`` is '記号' (symbol) are left out.

        Returns:
            str: e.g. '吾輩は' for
                Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(吾輩), Morph(は)] )
        """
        return "".join(
            morph.surface for morph in self.morphs if morph.pos != "記号"
        )

    def validate_pos(self, pos: str) -> bool:
        """Return True if any morpheme in the chunk has part of speech ``pos``.

        Args:
            pos (str): Part-of-speech tag to look for, e.g. '名詞' or '動詞'.

        Returns:
            bool: True if at least one morpheme matches, otherwise False.
        """
        return any(morph.pos == pos for morph in self.morphs)

def convert_sent_to_chunks(sent: List[str]) -> List[Chunk]:
    """Convert the raw lines of one parsed sentence into a list of chunks.

    Args:
        sent (List[str]): The lines of one sentence: chunk header lines and
            morpheme lines.
                e.g. sent:
                   ['* 0 1D 0/1 0.000000',
                    '<surface>\\t<pos>,<pos1>,...',
                    '* 1 -1D 0/2 0.000000',
                    ...]

    Parsing format:
        Chunk header, e.g. "* 0 1D 0/1 0.000000"
        | Column | Meaning                                                    |
        | :----: | :--------------------------------------------------------- |
        |   1    | Always `*`; marks a dependency-analysis line.              |
        |   2    | Chunk number (integer starting from 0).                    |
        |   3    | Dependency target number followed by `D`.                  |
        |   4    | Head / function-word positions and optional feature list.  |
        |   5    | Dependency score; larger generally means more likely.      |

        Morpheme line: "<surface>\\t<comma-separated features>", where feature
        0 is read as ``pos``, feature 1 as ``pos1`` and feature 6 as ``base``.

    Returns:
        List[Chunk]: List of chunks in order of appearance, each with its
            ``morphs`` and ``srcs`` filled in.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)  # dependency target id -> ids of source chunks

    for word in sent:
        # Header lines are "* <id> <dst>D ..."; requiring the trailing space
        # keeps a morpheme whose surface is "*" from being read as a header.
        if word.startswith("* "):
            # A new header closes the chunk that was being built.
            if chunk is not None:
                chunks.append(chunk)

            columns = word.split(" ")
            chunk_id = columns[1]
            dst = columns[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)
        else:
            # Split on the tab first so a surface containing "," stays intact.
            surface, feature_text = word.split("\t", 1)
            features = feature_text.split(",")
            chunk.morphs.append(
                Morph(
                    {
                        "surface": surface,
                        "base": features[6],
                        "pos": features[0],
                        "pos1": features[1],
                    }
                )
            )

    # The last chunk has no following header to close it.
    if chunk is not None:
        chunks.append(chunk)

    # Attach the collected dependency sources to each chunk.
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks

def validate_chunk(chunk: "Chunk", chunks: "List[Chunk]") -> bool:
    """Check whether a chunk is a candidate for a functional-verb construction.

    A chunk is valid when all of the following hold:
        - one of its morphemes has ``pos1 == 'サ変接続'`` (sa-hen noun),
        - one of its morphemes is the particle 'を' (``pos == '助詞'``),
        - at least one chunk listed in ``chunk.srcs`` contains a verb
          (``pos == '動詞'``).

    NOTE(review): the verb is looked up in ``chunk.srcs`` (chunks depending
    on this one), while the task statement talks about the noun+'を' chunk
    depending on a verb, which would be ``chunk.dst``. Confirm the intent.

    Args:
        chunk (Chunk): Chunk object.
            e.g. Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(もの), Morph(を)] )
        chunks (List[Chunk]): All chunks of the sentence, indexable by the
            integer value of the ids stored in ``chunk.srcs``.

    Returns:
        bool: True or False
    """
    has_sa_hen = any(morph.pos1 == "サ変接続" for morph in chunk.morphs)
    has_wo = any(
        morph.pos == "助詞" and morph.surface == "を" for morph in chunk.morphs
    )
    # Without both parts there is nothing to pair with a verb.
    if not (has_sa_hen and has_wo):
        return False

    return any(
        morph.pos == "動詞"
        for src in chunk.srcs
        for morph in chunks[int(src)].morphs
    )

def get_verb_frame(chunks: List[Chunk]) -> dict:
    """Build the predicate frame of one sentence.

    Terms:
        - predicate: sa-hen noun + 'を' + base form of a verb
        - case: particles found in the dependency source chunks
        - argument: surface of the chunk each particle was found in

    For every chunk accepted by ``validate_chunk``:
        - the chunk's own morphemes supply the sa-hen noun and 'を',
        - the chunks listed in ``chunk.srcs`` supply the verb (the first verb
          morpheme found is used) and the particles.

    A single frame is shared by the whole sentence: if several chunks are
    valid, ``pred`` holds the predicate of the last one while ``case`` and
    ``arg`` accumulate entries from all of them. Particles are kept in order
    of appearance (they are not sorted).

    Args:
        chunks (List[Chunk]): A sentence contains many chunks.
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(吾輩), Morph(は)] ),
                  Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(ここ), Morph(で)] ),
                  Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(始め), Morph(て)] ),
                  Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(人間), Morph(という)] ),
                  Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(もの), Morph(を)] ),
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(見), Morph(た), Morph(。)] )]

    Returns:
        dict: Predicate frame; ``pred`` stays None when no chunk is valid.
            e.g. {'pred': '話をする', 'case': [], 'arg': []}
              or {'pred': '失笑を見る', 'case': ['て'], 'arg': ['見て']}
              or {'pred': '発揮を持つ', 'case': ['て', 'を'], 'arg': ['持って', '性格を']}
    """
    frame = {"pred": None, "case": [], "arg": []}
    for chunk in chunks:
        # Skip if not valid
        if not validate_chunk(chunk, chunks):
            continue

        # Get sa_hen: the last sa-hen noun in the chunk wins.
        sa_hen_surface = None
        for morph in chunk.morphs:
            if morph.pos1 == "サ変接続":
                sa_hen_surface = morph.surface

        # Get verb: validate_chunk guarantees at least one verb among the
        # source chunks, so indexing [0] below is safe.
        src_chunks = [chunks[int(src)] for src in chunk.srcs]
        src_verbs = [
            morph
            for src_chunk in src_chunks
            for morph in src_chunk.morphs
            if morph.pos == "動詞"
        ]

        # Get predicate
        frame["pred"] = sa_hen_surface + "を" + src_verbs[0].base

        # Get case and argument: one entry per particle, so a chunk holding
        # two particles contributes its surface twice.
        for src_chunk in src_chunks:
            for morph in src_chunk.morphs:
                if morph.pos == "助詞":
                    frame["case"].append(morph.base)
                    frame["arg"].append(src_chunk.get_surface())

    return frame

def write_to_file(sents: List[dict], path):
    """Write predicate frames to a tab-separated text file.

    Each frame becomes one line with three tab-separated fields: the
    predicate, the cases joined by spaces, and the arguments joined by
    spaces. Empty case/argument lists produce empty fields.

    Args:
        sents (List[dict]): Predicate-verb frames.
            e.g.   [{'pred': '話をする', 'case': [], 'arg': []},
                    {'pred': '失笑を見る', 'case': ['て'], 'arg': ['見て']},
                    {'pred': '発揮を持つ', 'case': ['て', 'を'], 'arg': ['持って', '性格を']}]
        path: Output file path. The file is overwritten and written as UTF-8.
    """
    # Convert each frame to its (pred, cases, args) text fields.
    lines = [
        (frame["pred"], " ".join(frame["case"]), " ".join(frame["arg"]))
        for frame in sents
    ]

    # Explicit encoding so the output does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as f:
        for line in lines:
            f.write("\t".join(line) + "\n")

# Load the parsed corpus and turn every sentence into a list of chunks (ans41).
fpath = "neko.txt.cabocha"
sentences = [convert_sent_to_chunks(sent) for sent in read_file(fpath)]

# ans47: build one frame per sentence and keep only those with a predicate.
pattern_sents = [
    frame
    for frame in map(get_verb_frame, sentences)
    if frame["pred"] is not None
]

write_to_file(pattern_sents, "predicate_verb_frame.txt")

# "predicate_verb_frame.txt":
# 話をする
# 失笑を見る	て	見て
# 報道を悲しむ
# 想像を聞く	は て か と	吾輩は 聞いて 記さるるであろうかと 記さるるであろうかと
# 議論を云う
# 挨拶を云う	の	見当違いの
# 朗読をよる	て から	よりまして せんだってから
# 発揮を持つ	て を	持って 性格を


