第4章の後半の問題を解いた記録。
対象とするファイルはwebページにもある通り、neko.txtとする。
夏目漱石の小説『吾輩は猫である』の文章(neko.txt)をMeCabを使って形態素解析し,その結果をneko.txt.mecabというファイルに保存せよ.このファイルを用いて,以下の問に対応するプログラムを実装せよ.
なお,問題37, 38, 39はmatplotlibもしくはGnuplotを用いるとよい.
35. 名詞の連接
名詞の連接(連続して出現する名詞)を最長一致で抽出せよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem30
def extract_seqs(sentences):
    """Extract maximal runs of consecutive nouns (longest match).

    Args:
        sentences: iterable of sentences, each a list of morpheme dicts
            with at least 'pos' and 'surface' keys (as produced by
            problem30.mecab_reader).

    Returns:
        A list of runs; each run is a list of the surface strings of
        two or more consecutive nouns. Single nouns are ignored.
    """
    seqs = []
    seq = []
    for sentence in sentences:
        for morpheme in sentence:
            if morpheme['pos'] == "名詞":
                seq.append(morpheme['surface'])
            else:
                if len(seq) > 1:
                    seqs.append(seq)
                seq = []
        # Flush at the sentence boundary. The original kept a pending run
        # across sentences, wrongly joining nouns from adjacent sentences
        # and dropping a run that ended the final sentence.
        if len(seq) > 1:
            seqs.append(seq)
        seq = []
    return seqs
if __name__ == "__main__":
    # Read the MeCab-parsed novel and write each noun run as one line.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_sequences.txt'
    with open(inputfile, "r") as f, open(outputfile, "w") as g:
        for sequence in extract_seqs(problem30.mecab_reader(f)):
            g.write("".join(sequence) + '\n')
36. 単語の出現頻度
文章中に出現する単語とその出現頻度を求め,出現頻度の高い順に並べよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem30
from collections import Counter
def count_words(sentences):
    """Count occurrences of each surface form over all sentences.

    Args:
        sentences: iterable of sentences, each a list of morpheme dicts
            with a 'surface' key.

    Returns:
        collections.Counter mapping surface string -> frequency.
    """
    # Feed Counter a generator instead of the original append loop,
    # which materialized every token in an intermediate list.
    return Counter(morpheme['surface']
                   for sentence in sentences
                   for morpheme in sentence)
if __name__ == "__main__":
    # Dump "<word> <count>" lines, most frequent first.
    inputfile = "neko.txt.mecab"
    outputfile = "neko.mecab_words.txt"
    with open(inputfile, 'r') as f, open(outputfile, 'w') as g:
        counter = count_words(problem30.mecab_reader(f))
        for word, count in counter.most_common():
            g.write("%s %s\n" % (word, count))
37. 頻度上位10語
出現頻度が高い10語とその出現頻度をグラフ(例えば棒グラフなど)で表示せよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem30
import problem36
import matplotlib.pyplot as plt
def plot_words(words, counts, file):
    """Render a bar chart of word frequencies and save it to *file*.

    Args:
        words: list of word labels for the x axis.
        counts: list of frequencies, parallel to *words*.
        file: output image path handed to matplotlib's savefig.
    """
    from matplotlib.font_manager import FontProperties
    # Japanese-capable font; the path is machine-specific (Homebrew
    # Ricty install) — adjust for the local environment.
    fp = FontProperties(fname='/usr/local/Cellar/ricty/3.2.4/share/fonts/Ricty-Regular.ttf')
    # Generalized from a hard-coded range(10): the bar positions now
    # follow the actual number of words passed in, so the function also
    # works for top-N lists other than 10.
    positions = range(len(words))
    plt.bar(positions, counts, align='center')
    plt.xticks(positions, words, fontproperties=fp)
    plt.savefig(file)
if __name__ == '__main__':
    # Bar-chart the ten most frequent words.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words.png'
    with open(inputfile, 'r') as f:
        top10 = problem36.count_words(problem30.mecab_reader(f)).most_common(10)
    # Decode to unicode so matplotlib can render the Japanese labels
    # (the counter holds Python 2 byte strings).
    words = [word.decode('utf8') for word, _ in top10]
    counts = [count for _, count in top10]
    plot_words(words, counts, outputfile)
38. ヒストグラム
単語の出現頻度のヒストグラム(横軸に出現頻度,縦軸に出現頻度をとる単語の種類数を棒グラフで表したもの)を描け.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem30
import problem36
import pandas as pd
def plot_words_hist(freq, file):
    """Draw a histogram of the frequency Series and save it to *file*.

    Args:
        freq: pandas Series of word frequencies (index: words).
        file: output image path.
    """
    axes = freq.hist()
    axes.get_figure().savefig(file)
if __name__ == '__main__':
    # Histogram of word frequencies (x: frequency, y: number of word
    # types with that frequency).
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words_hist.png'
    # Bug fix: the original opened the input file and never closed it;
    # a context manager guarantees the close. The unused words/counts
    # lists from the original were dropped.
    with open(inputfile, 'r') as f:
        counter = problem36.count_words(problem30.mecab_reader(f))
    freq = pd.Series(list(counter.values()), index=list(counter.keys()))
    plot_words_hist(freq, outputfile)
39. Zipfの法則
単語の出現頻度順位を横軸,その出現頻度を縦軸として,両対数グラフをプロットせよ.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'
import problem30
import problem36
import matplotlib.pyplot as plt
def plot_words_hist_log(counter, file):
    """Plot Zipf's law on log-log axes and save the figure to *file*.

    Args:
        counter: Counter (or dict) mapping word -> frequency.
        file: output image path.
    """
    plt.figure()
    plt.xscale('log')
    plt.yscale('log')
    # Bug fix: the original passed (frequency, rank) to plt.plot, which
    # put frequency on the x axis. The task asks for rank on x and
    # frequency on y. (An unused FontProperties local was also removed.)
    freqs = sorted(counter.values(), reverse=True)
    plt.plot(range(1, len(freqs) + 1), freqs)
    plt.savefig(file)
if __name__ == '__main__':
    # Count word frequencies and draw the Zipf (log-log) plot.
    inputfile = 'neko.txt.mecab'
    outputfile = 'neko.mecab_words_hist_log.png'
    with open(inputfile, 'r') as f:
        counter = problem36.count_words(problem30.mecab_reader(f))
    plot_words_hist_log(counter, outputfile)