More than 5 years have passed since last update.

ニートの自然言語処理100本ノック:39

Python

Posted at 2019-06-20

とりあえず頻度を順序に変換する。
そして、x軸・y軸ともに対数(log)スケールにする設定は、グラフ作成関数側で行うようにした。
またgrid optionを入れた。
出力結果があってるかわからないがとりあえず上げる(日曜日にでも確認する)。
てかまずZipfの法則(ジップの法則)ってなんやねん...

"""
39. Zipfの法則
単語の出現頻度順位を横軸，その出現頻度を縦軸として，両対数グラフをプロットせよ．
"""

import MeCab
from pprint import pprint
import collections
from n30_execise import maping_morphology
from n31_execise import load_mecab_file
from n36_execise import extract_type_all,extract_frequent_words
from n37_execise import create_words_frequency_graph
import matplotlib.pyplot as plt
import pandas as pd


# Converts freq_count (occurrence frequencies) into frequency ranks.
def freq_to_order(freq_count):
    """Map each occurrence frequency to its rank among the distinct values.

    The highest frequency gets rank 1, the next highest distinct value gets
    rank 2, and so on; equal frequencies share the same rank.

    Args:
        freq_count: Iterable of hashable, mutually comparable frequencies
            (e.g. ints). It is materialized once, so a generator is accepted.

    Returns:
        A list of int ranks with the same length and order as ``freq_count``,
        so the result can be paired element-wise with the input. An empty
        input yields an empty list.
    """
    frequencies = list(freq_count)

    # One rank per distinct value, assigned in descending order of frequency.
    distinct_desc = sorted(set(frequencies), reverse=True)
    rank_of = {value: rank for rank, value in enumerate(distinct_desc, start=1)}

    # A dict lookup per element replaces the nested rescan of the input and
    # keeps every rank at the position of the frequency it belongs to, even
    # when the input is not already sorted in descending order.
    return [rank_of[value] for value in frequencies]


if __name__ == "__main__":

    # Path of the MeCab output file to read.
    mecab_path = "neko.txt.mecab"

    # Load neko.txt.mecab, then build the mapping list produced after
    # morphological analysis.
    morpheme_maps = maping_morphology(load_mecab_file(mecab_path))

    # Build a list containing only the surface forms.
    surfaces = extract_type_all(morpheme_maps)

    # Frequently occurring values and their occurrence counts.
    freq_value, freq_count = extract_frequent_words(surfaces)
    print(freq_count)

    # Ranks derived from the counts; used as the x values of the plot.
    freq_order_list = freq_to_order(freq_count)
    print(freq_order_list)

    # Draw the graph; log_log and _grid are passed through to the helper.
    create_words_frequency_graph(
        freq_order_list,
        freq_count,
        graph='line',
        title="出現頻度順序と出現頻度の両対数グラフ(Zipfの法則)",
        xlabel="出現頻度の順序",
        ylabel="出現頻度",
        log_log=True,
        _grid=True,
    )
    # HACK: ideally each value would be displayed above its data point.

出力結果

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up