"""
## 47. 機能動詞構文のマイニング[Permalink](https://nlp100.github.io/ja/ch05.html#47-機能動詞構文のマイニング)
動詞のヲ格にサ変接続名詞が入っている場合のみに着目したい.46のプログラムを以下の仕様を満たすように改変せよ.
- 「サ変接続名詞+を(助詞)」で構成される文節が動詞に係る場合のみを対象とする
- 述語は「サ変接続名詞+を+動詞の基本形」とし,文節中に複数の動詞があるときは,最左の動詞を用いる
- 述語に係る助詞(文節)が複数あるときは,すべての助詞をスペース区切りで辞書順に並べる
- 述語に係る文節が複数ある場合は,すべての項をスペース区切りで並べる(助詞の並び順と揃えよ)
例えば「また、自らの経験を元に学習を行う強化学習という手法もある。」という文から,以下の出力が得られるはずである.
学習を行う に を 元に 経験を
"""
from collections import defaultdict
from typing import List
def read_file(fpath: str) -> List[List[str]]:
    """Load a CaboCha lattice-format file and group its lines by sentence.

    The file content is cut at every ``EOS`` terminator line. Each
    resulting piece is stripped of surrounding whitespace and split into
    its individual lines; pieces that are empty after stripping are
    dropped.

    Args:
        fpath (str): Path of the UTF-8 encoded parser output.

    Returns:
        List[List[str]]: One entry per sentence; each entry is the list of
        raw lines of that sentence, i.e. chunk header lines such as
        ``* 1 2D 0/1 -0.764522`` and morpheme lines such as
        ``吾輩<TAB>名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ``.
    """
    with open(fpath, mode="rt", encoding="utf-8") as handle:
        raw_text = handle.read()

    grouped = []
    for block in raw_text.split("EOS\n"):
        body = block.strip()
        if body:  # consecutive EOS markers yield empty pieces; skip them
            grouped.append(body.split("\n"))
    return grouped
class Morph:
    """One morpheme (token) taken from the parser output.

    Args:
        data (dict): Mapping that provides the keys ``surface``, ``base``,
            ``pos`` and ``pos1``.

    Attributes:
        surface (str): Surface form (表層形).
        base (str): Base form (基本形).
        pos (str): Part of speech (品詞).
        pos1 (str): Part-of-speech subcategory 1 (品詞細分類1).
    """

    def __init__(self, data):
        self.surface, self.base = data["surface"], data["base"]
        self.pos, self.pos1 = data["pos"], data["pos1"]

    def __repr__(self):
        # Short form used when a morph is printed inside a Chunk.
        return "Morph({})".format(self.surface)

    def __str__(self):
        # Tab-separated dump of all four attributes.
        return (
            f"surface[{self.surface}]\tbase[{self.base}]"
            f"\tpos[{self.pos}]\tpos1[{self.pos1}]"
        )
class Chunk:
    """A clause/phrase (文節) together with its dependency links.

    Args:
        chunk_id: Identifier of this chunk (文節番号).
        dst: Identifier of the chunk this one depends on (係り先文節番号).

    Attributes:
        id: The ``chunk_id`` passed to the constructor.
        morphs (list): Morphemes (形態素) of the chunk; starts empty and is
            filled by the caller.
        dst: The ``dst`` passed to the constructor.
        srcs (list): Identifiers of the chunks that depend on this one
            (係り元文節番号); starts empty and is filled by the caller.
    """

    def __init__(self, chunk_id, dst):
        self.id = chunk_id
        self.dst = dst
        self.morphs = []
        self.srcs = []

    def __repr__(self):
        return (
            f"Chunk( id: {self.id}, dst: {self.dst}, "
            f"srcs: {self.srcs}, morphs: {self.morphs} )"
        )

    def get_surface(self) -> str:
        """Concatenate the surfaces of all non-symbol morphs in the chunk.

        Returns:
            str: e.g. ``'吾輩は'`` for the morphs ``[Morph(吾輩), Morph(は)]``.
            Morphs whose ``pos`` is ``記号`` are left out.
        """
        return "".join(
            token.surface for token in self.morphs if token.pos != "記号"
        )

    def validate_pos(self, pos: str) -> bool:
        """Return True if any morph of the chunk has the part of speech ``pos``."""
        for token in self.morphs:
            if token.pos == pos:
                return True
        return False
def convert_sent_to_chunks(sent: List[str]) -> "List[Chunk]":
    """Build the chunk list of one sentence from its raw parser lines.

    Two kinds of lines are expected:

    * Chunk header, e.g. ``* 0 1D 0/1 0.000000`` (space separated):
      column 1 is the literal ``*``, column 2 the chunk number, column 3
      the number of the chunk this one depends on followed by ``D``; the
      remaining columns are not used here.
    * Morpheme line, e.g. ``吾輩<TAB>名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ``:
      the surface form, a tab, then comma-separated features. Feature 0
      is stored as ``pos``, feature 1 as ``pos1`` and feature 6 as ``base``.

    Args:
        sent (List[str]): Raw lines of a single sentence.

    Returns:
        List[Chunk]: Chunks in file order. ``Chunk.id`` and ``Chunk.dst``
        are kept as strings, and ``Chunk.srcs`` lists the ids of the
        chunks whose ``dst`` equals this chunk's id.
    """
    chunks = []
    current = None
    srcs = defaultdict(list)  # dst id -> ids of the chunks pointing at it

    for line in sent:
        # A header is "* <id> <dst>D ..." and never contains a tab. Testing
        # only the first character would mistake a morpheme whose surface
        # is "*" for a header.
        if line.startswith("* ") and "\t" not in line:
            if current is not None:
                chunks.append(current)
            fields = line.split(" ")
            chunk_id = fields[1]
            dst = fields[2].rstrip("D")
            current = Chunk(chunk_id, dst)
            srcs[dst].append(chunk_id)
        else:
            # Separate the surface at the tab first: the surface itself may
            # be a comma, which would shift every index of a plain
            # comma split.
            surface, _, feature_text = line.partition("\t")
            features = feature_text.split(",")
            current.morphs.append(
                Morph(
                    {
                        "surface": surface,
                        "base": features[6],
                        "pos": features[0],
                        "pos1": features[1],
                    }
                )
            )

    # The last chunk has no following header to flush it.
    if current is not None:
        chunks.append(current)

    for chunk in chunks:
        chunk.srcs = list(srcs.get(chunk.id, []))
    return chunks
def _verbal_noun_of(chunk):
    """Return the サ変接続 noun if the chunk is exactly noun + を, else None.

    Morphs whose ``pos`` is ``記号`` are ignored when checking the shape.
    """
    content = [morph for morph in chunk.morphs if morph.pos != "記号"]
    if len(content) != 2:
        return None
    noun, particle = content
    is_verbal_noun = noun.pos == "名詞" and noun.pos1 == "サ変接続"
    is_wo = particle.pos == "助詞" and particle.surface == "を"
    return noun if (is_verbal_noun and is_wo) else None


def _target_chunk(chunk, chunks):
    """Return the chunk that ``chunk`` depends on, or None if there is none."""
    try:
        index = int(chunk.dst)
    except (TypeError, ValueError):
        return None
    # The root chunk has dst -1; without this guard it would silently
    # index the last chunk of the sentence.
    if 0 <= index < len(chunks):
        return chunks[index]
    return None


def _leftmost_verb(chunk):
    """Return the first morph of the chunk whose pos is 動詞, or None."""
    return next((morph for morph in chunk.morphs if morph.pos == "動詞"), None)


def _match_predicate(chunk, chunks):
    """Return (noun, target chunk, verb) for a noun+を chunk that depends on a verb, else None."""
    noun = _verbal_noun_of(chunk)
    if noun is None:
        return None
    target = _target_chunk(chunk, chunks)
    if target is None:
        return None
    verb = _leftmost_verb(target)
    if verb is None:
        return None
    return noun, target, verb


def validate_chunk(chunk: "Chunk", chunks: "List[Chunk]") -> bool:
    """Tell whether ``chunk`` is a サ変接続 noun + を chunk that depends on a verb.

    Args:
        chunk (Chunk): Candidate chunk,
            e.g. Chunk( id: 4, dst: 5, srcs: ['3'], morphs: [Morph(返事), Morph(を)] )
        chunks (List[Chunk]): All chunks of the sentence; ``chunk.dst`` is
            used as an index into this list.

    Returns:
        bool: True when the chunk (ignoring symbols) consists of a
        サ変接続 noun followed by the particle を, and the chunk it depends
        on contains at least one verb.
    """
    return _match_predicate(chunk, chunks) is not None


def get_verb_frame(chunks: "List[Chunk]") -> dict:
    """Extract the light-verb construction frame of one sentence.

    Terms:
        - 述語 (predicate): サ変接続 noun + を + base form of the leftmost
          verb of the chunk the noun chunk depends on.
        - 格 (case): for every other chunk that depends on that verb chunk
          and contains a particle, the base form of its rightmost particle.
        - 項 (argument): the surface of that chunk.

    Cases are sorted in dictionary order and the arguments follow the same
    order. Every match starts from a fresh frame, so cases of different
    predicates are never mixed. Because the function returns a single dict,
    the last match in the sentence is the one reported.

    Args:
        chunks (List[Chunk]): Chunks of one sentence.

    Returns:
        dict: ``{'pred': ..., 'case': [...], 'arg': [...]}``; ``pred`` is
        None and both lists are empty when the sentence has no match.
        e.g. {'pred': '返事をする', 'case': ['で', 'は'], 'arg': ['家で', '彼は']}
    """
    frame = {"pred": None, "case": [], "arg": []}
    for chunk in chunks:
        match = _match_predicate(chunk, chunks)
        if match is None:
            continue
        noun, target, verb = match

        pairs = []
        for src in target.srcs:
            # The noun+を chunk is part of the predicate, not an argument.
            if str(src) == str(chunk.id):
                continue
            src_chunk = chunks[int(src)]
            particles = [m for m in src_chunk.morphs if m.pos == "助詞"]
            if particles:
                pairs.append((particles[-1].base, src_chunk.get_surface()))

        # list.sort is stable, so equal cases keep their sentence order.
        pairs.sort(key=lambda pair: pair[0])
        frame = {
            "pred": noun.surface + "を" + verb.base,
            "case": [case for case, _ in pairs],
            "arg": [arg for _, arg in pairs],
        }
    return frame
def write_to_file(sents: List[dict], path):
    """Write predicate frames to a tab-separated text file.

    Each frame becomes one line of three tab-separated columns: the
    predicate, the cases joined by single spaces, and the arguments joined
    by single spaces.

    Args:
        sents (List[dict]): Predicate frames,
            e.g. [{'pred': '話をする', 'case': [], 'arg': []},
                  {'pred': '返事をする', 'case': ['で', 'は'], 'arg': ['家で', '彼は']}]
        path: Destination file path; an existing file is overwritten.
    """
    lines = []
    for frame in sents:
        case_text = " ".join(frame["case"])
        arg_text = " ".join(frame["arg"])
        lines.append(f"{frame['pred']}\t{case_text}\t{arg_text}\n")

    # Explicit UTF-8: the frames hold Japanese text, and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)
# Script body: parse the CaboCha output, build one frame per sentence and
# save only the frames that actually contain a predicate.
fpath = "neko.txt.cabocha"
sentences = [convert_sent_to_chunks(lines) for lines in read_file(fpath)]  # ans41

# ans47
pattern_sents = [
    frame
    for frame in (get_verb_frame(sent) for sent in sentences)
    if frame["pred"] is not None
]
write_to_file(pattern_sents, "predicate_verb_frame.txt")

# Sample of "predicate_verb_frame.txt" recorded from an earlier run (the
# exact content depends on the current implementation of get_verb_frame):
# 話をする
# 失笑を見る て 見て
# 報道を悲しむ
# 想像を聞く は て か と 吾輩は 聞いて 記さるるであろうかと 記さるるであろうかと
# 議論を云う
# 挨拶を云う の 見当違いの
# 朗読をよる て から よりまして せんだってから
# 発揮を持つ て を 持って 性格を
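# NOTE: the lines below are web-page footer text captured together with the
# script; they are not part of the program.
# More than 5 years have passed since the last update.
# Register as a new user and use Qiita more conveniently:
# - You get articles that match your needs
# - You can efficiently read back useful information
# - You can use the dark theme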