0
0

More than 3 years have passed since last update.

言語処理100本ノック(2020): 39

Posted at
"""
37. 「猫」と共起頻度の高い上位10語
「猫」とよく共起する(共起頻度が高い)10語とその出現頻度をグラフ(例えば棒グラフなど)で表示せよ.


sentence_list:
[[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
  {'surface': '一', 'base': '一', 'pos': '名詞', 'pos1': '数'},
  {'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
 [{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
  {'surface': '吾輩', 'base': '吾輩', 'pos': '名詞', 'pos1': '代名詞'},
  {'surface': 'は', 'base': 'は', 'pos': '助詞', 'pos1': '係助詞'},
  {'surface': '猫', 'base': '猫', 'pos': '名詞', 'pos1': '一般'},
  {'surface': 'で', 'base': 'だ', 'pos': '助動詞', 'pos1': '*'},
  {'surface': 'ある', 'base': 'ある', 'pos': '助動詞', 'pos1': '*'},
  {'surface': '。', 'base': '。', 'pos': '記号', 'pos1': '句点'},
  {'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],

Memo:
    - 共起頻度: https://www.jtp.co.jp/techport/2018-04-18-001/
"""
from collections import defaultdict
from typing import List

import matplotlib.pyplot as plt

import utils

plt.style.use("ggplot")
plt.rcParams["font.family"] = "Hiragino Mincho ProN"  # 日本語対応


def get_co_occurrence(sentence_list: List[List[dict]]) -> list:
    sents = [
        [word["surface"] for word in sent[1:-1]] for sent in sentence_list
    ]  # [['一'], ['吾輩', 'は', '猫', 'で', 'ある', '。']]
    counter = defaultdict(int)

    for sent in sents:
        if "猫" in sent:
            for word in sent:
                counter[word] += 1

    del counter["猫"]

    sorted_counter = {
        k: v for k, v in sorted(counter.items(), key=lambda item: item[1], reverse=True)
    }
    return list(sorted_counter.items())


def plot_co_occurrence(x: list, y: list) -> None:
    x_pos = [i for i, _ in enumerate(x)]

    plt.bar(x, y)
    plt.xlabel("Term")
    plt.ylabel("Frequency")
    plt.title("Co-occurrence with '猫'")

    plt.xticks(x_pos, x)

    plt.show()


sentence_list = utils.read_json("30_neko_mecab.json")
counter = get_co_occurrence(sentence_list)
# [('の', 391), ('は', 272), ('、', 252), ('に', 250), ('を', 232)]

x = [word[0] for word in counter[:10]]
y = [word[1] for word in counter[:10]]
plot_co_occurrence(x, y)
# ![image-20200527193140109](https://raw.githubusercontent.com/LearnXu/images/master/imgs/image-20200527193140109.png)

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0