0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

言語処理100本ノック(2020): 46

Last updated at Posted at 2020-09-30
"""
## 46. 動詞の格フレーム情報の抽出[Permalink](https://nlp100.github.io/ja/ch05.html#46-動詞の格フレーム情報の抽出)

45のプログラムを改変し,述語と格パターンに続けて項(述語に係っている文節そのもの)をタブ区切り形式で出力せよ.45の仕様に加えて,以下の仕様を満たすようにせよ.

- 項は述語に係っている文節の単語列とする(末尾の助詞を取り除く必要はない)
- 述語に係る文節が複数あるときは,助詞と同一の基準・順序でスペース区切りで並べる

「吾輩はここで始めて人間というものを見た」という例文(neko.txt.cabochaの8文目)を考える. この文は「始める」と「見る」の2つの動詞を含み,「始める」に係る文節は「ここで」,「見る」に係る文節は「吾輩は」と「ものを」と解析された場合は,次のような出力になるはずである.


    始める  で      ここで
    見る    は を   吾輩は ものを


"""
from collections import defaultdict
from typing import List


def read_file(fpath: str) -> List[List[str]]:
    """Get clear format of parsed sentences.

    Args:
        fpath (str): File path.

    Returns:
        List[List[str]]: List of sentences, and each sentence contains a word list.
                         e.g. result[1]:
                            ['* 0 2D 0/0 -0.764522',
                             '\u3000\t記号,空白,*,*,*,*,\u3000,\u3000,\u3000',
                             '* 1 2D 0/1 -0.764522',
                             '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                             '\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
                             '* 2 -1D 0/2 0.000000',
                             '\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
                             '\t助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ',
                             'ある\t助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル',
                             '\t記号,句点,*,*,*,*,。,。,。']
    """
    with open(fpath, mode="rt", encoding="utf-8") as f:
        sentences = f.read().split("EOS\n")
    return [sent.strip().split("\n") for sent in sentences if sent.strip() != ""]


class Morph:
    """Morph information for each token.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        surface (str): 表層形(surface)
        base (str): 基本形(base)
        pos (str): 品詞(base)
        pos1 (str): 品詞細分類1(pos1)
    """

    def __init__(self, data):
        self.surface = data["surface"]
        self.base = data["base"]
        self.pos = data["pos"]
        self.pos1 = data["pos1"]

    def __repr__(self):
        return f"Morph({self.surface})"

    def __str__(self):
        return "surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]".format(
            self.surface, self.base, self.pos, self.pos1
        )


class Chunk:
    """Containing information for Clause/phrase.

    Args:
        data (dict): A dictionary contains necessary information.

    Attributes:
        chunk_id (str): The number of clause chunk (文節番号).
        morphs List[Morph]: Morph (形態素) list.
        dst (str): The index of dependency target (係り先文節インデックス番号).
        srcs (List[str]): The index list of dependency source. (係り元文節インデックス番号).
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.morphs = []
        self.dst = dst
        self.srcs = []

    def __repr__(self):
        return "Chunk( id: {}, dst: {}, srcs: {}, morphs: {} )".format(
            self.id, self.dst, self.srcs, self.morphs
        )

    def get_surface(self) -> str:
        """Concatenate morph surfaces in a chink.

        Args:
            chunk (Chunk): e.g. Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(吾輩), Morph(は)]
        Return:
            e.g. '吾輩は'
        """
        morphs = self.morphs
        res = ""
        for morph in morphs:
            if morph.pos != "記号":
                res += morph.surface
        return res

    def validate_pos(self, pos: str) -> bool:
        """Return Ture if '名詞' or '動詞' in chunk's morphs. Otherwise, return False."""
        morphs = self.morphs
        return any([morph.pos == pos for morph in morphs])


def convert_sent_to_chunks(sent: List[str]) -> List[Morph]:
    """Extract word and convert to morph.

    Args:
        sent (List[str]): A sentence contains a word list.
                            e.g. sent:
                               ['* 0 1D 0/1 0.000000',
                                '吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ',
                                '\t助詞,係助詞,*,*,*,*,は,ハ,ワ',
                                '* 1 -1D 0/2 0.000000',
                                '\t名詞,一般,*,*,*,*,猫,ネコ,ネコ',
                                '\t助動詞,*,*,*,特殊・ダ,連用形,だ,デ,デ',
                                'ある\t助動詞,*,*,*,五段・ラ行アル,基本形,ある,アル,アル',
                                '\t記号,句点,*,*,*,*,。,。,。']

    Parsing format:
        e.g. "* 0 1D 0/1 0.000000"
        | カラム | 意味                                                         |
        | :----: | :----------------------------------------------------------- |
        |   1    | 先頭カラムは`*`。係り受け解析結果であることを示す。          |
        |   2    | 文節番号(0から始まる整数)                                  |
        |   3    | 係り先番号+`D`                                              |
        |   4    | 主辞/機能語の位置と任意の個数の素性列                        |
        |   5    | 係り関係のスコア。係りやすさの度合で、一般に大きな値ほど係りやすい。 |

    Returns:
        List[Chunk]: List of chunks.
    """
    chunks = []
    chunk = None
    srcs = defaultdict(list)

    for i, word in enumerate(sent):
        if word[0] == "*":
            # Add chunk to chunks
            if chunk is not None:
                chunks.append(chunk)

            # eNw Chunk beggin
            chunk_id = word.split(" ")[1]
            dst = word.split(" ")[2].rstrip("D")
            chunk = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)  # Add target->source to mapping list

        else:  # Add Morch to chunk.morphs
            features = word.split(",")
            dic = {
                "surface": features[0].split("\t")[0],
                "base": features[6],
                "pos": features[0].split("\t")[1],
                "pos1": features[1],
            }
            chunk.morphs.append(Morph(dic))

            if i == len(sent) - 1:  # Add the last chunk
                chunks.append(chunk)

    # Add srcs to each chunk
    for chunk in chunks:
        chunk.srcs = list(srcs[chunk.id])

    return chunks


def get_predicate_frame(chunks: List[Chunk]) -> List[dict]:
    """Get edges from sentence chunks.

    Terms:
        - 述語 (predicate)
        - 項 (argument)
        - 格 (case)

    Args:
        chunks (List[Chunk]): A sentence contains many chunks.
            e.g. [Chunk( id: 0, dst: 5, srcs: [], morphs: [Morph(吾輩), Morph(は)] ),
                  Chunk( id: 1, dst: 2, srcs: [], morphs: [Morph(ここ), Morph(で)] ),
                  Chunk( id: 2, dst: 3, srcs: ['1'], morphs: [Morph(始め), Morph(て)] ),
                  Chunk( id: 3, dst: 4, srcs: ['2'], morphs: [Morph(人間), Morph(という)] ),
                  Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(もの), Morph(を)] ),
                  Chunk( id: 5, dst: -1, srcs: ['0', '4'], morphs: [Morph(見), Morph(た), Morph(。)] )]

    Returns:
        List[dict]: Predicate frame.
            e.g. [{'pred': '始める', 'case': [''], 'arg': ['ここで']},
                  {'pred': '見る', 'case': ['', ''], 'arg': ['吾輩は', 'ものを']}]
    """
    patterns = []
    for chunk in chunks:
        # Skip if not valid
        if len(chunk.srcs) == 0 or all([morph.pos != "動詞" for morph in chunk.morphs]):
            continue

        # Initialize
        pred_frame = {"pred": None, "case": [], "arg": []}

        # Get predicate
        for morph in chunk.morphs:
            if morph.pos == "動詞":
                pred_frame["pred"] = morph.base
                break

        # Get case
        for src in chunk.srcs:
            src_chunk = chunks[int(src)]
            for morph in src_chunk.morphs:
                if morph.pos == "助詞":
                    pred_frame["case"].append(morph.base)
                    pred_frame["arg"].append(src_chunk.get_surface())

        # Add to patterns
        patterns.append(pred_frame)
    return patterns


def write_to_file(sents, path):
    """Write patterns to file.

    Args:
        pattern_sents ([type]): predicate-case patterns.
            e.g. [[{'pred': '生れる', 'case': [''], 'arg': ['どこで']},
                   {'pred': 'つく', 'case': ['', ''], 'arg': ['生れたか', '見当が']}],
                  [{'pred': '泣く', 'case': [''], 'arg': ['所で']},
                   {'pred': 'する', 'case': ['', 'だけ', ''], 'arg': ['泣いて', 'いた事だけは', 'いた事だけは']}]]
    """
    # convert_frame_to_text
    lines = []
    for sent in sents:
        for frame in sent:  # pattern: {'pred': '泣く', 'case': ['で'], 'arg': ['所で']}
            case_text = " ".join(frame["case"])
            arg_text = " ".join(frame["arg"])
            lines.append((frame["pred"], case_text, arg_text))

    # write_to_file
    with open(path, "w") as f:
        for line in lines:
            f.write(f"{line[0]}\t{line[1]}\t{line[2]}\n")


fpath = "neko.txt.cabocha"
sentences = read_file(fpath)
sentences = [convert_sent_to_chunks(sent) for sent in sentences]  # ans41

# ans46
pattern_sents = [get_predicate_frame(sent) for sent in sentences]
pattern_sents = list(filter(lambda x: len(x) != 0, pattern_sents))
write_to_file(pattern_sents, "predicate_frame.txt")

# "predicate_frame.txt":
# 生れる	で	どこで
# つく	か が	生れたか 見当が
# 泣く	で	所で
# する	て だけ は	泣いて いた事だけは いた事だけは
# 始める	で	ここで
# 見る	は を	吾輩は ものを
# 聞く	で	あとで
# 捕える	を	我々を

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?