はじめに
- matplotlibを使用したところ、こんなに簡単にグラフを描画できるのかと感動しました
- No.30の問題を読み間違えたまま進み、No.37でつまずきました
- その結果、No.36までの問題で解き直しが発生しましたが、幸い各2行程度の修正で済みました
- 問題文はしっかり読むようにします(当たり前ですが)
- 引き続き頑張ります
前準備
# MeCab-related packages
! apt install mecab libmecab-dev mecab-ipadic-utf8
! pip install mecab-python3
! pip install unidic-lite
# Create neko.txt.mecab from neko.txt
! mecab -o ./neko.txt.mecab ./neko.txt
# Lets matplotlib display Japanese text
! pip install japanize-matplotlib
30. 形態素解析結果の読み込み
# Problem 30: load the MeCab analysis of neko.txt into `sentences`, a list of
# sentences where every sentence is a list of morpheme dicts with the keys
# "surface", "base", "pos" and "pos1".
#
# The input is decoded explicitly as UTF-8 rather than with the locale
# default. NOTE(review): this assumes MeCab wrote UTF-8, which is what the
# mecab-ipadic-utf8 dictionary installed in the setup step should produce.
with open(f"{dirpath}/neko.txt.mecab", "r", encoding="utf-8") as f:
    # Split the whole file on the "EOS" terminator lines.
    lines = f.read().split("EOS\n")

# Drop empty chunks and break every remaining chunk into its morpheme lines.
blocks = [chunk.split("\n") for chunk in lines if chunk]

sentences = []
for block in blocks:
    sentence = []
    for data in block:
        if not data:
            continue
        # Each line is "<surface>\t<comma-separated features>".
        fields = data.split("\t")
        surface = fields[0]
        others = fields[1].split(",")
        morpheme = {
            "surface": surface,
            "base": others[6],
            "pos": others[0],
            "pos1": others[1],
        }
        if surface != "":
            sentence.append(morpheme)
    # Sentences that ended up without any morpheme are not kept.
    if sentence:
        sentences.append(sentence)

# Dump one sentence (as the repr of its list of dicts) per line.
with open(f"{dirpath}/30_output.txt", "w") as f:
    for sentence in sentences:
        f.write(f"{sentence}\n")
31. 動詞
# Problem 31: write the surface form of every verb, one per line.
with open(f"{dirpath}/31_output.txt", "w") as f:
    f.writelines(
        f"{token['surface']}\n"
        for sentence in sentences
        for token in sentence
        if token["pos"] == "動詞"
    )
32. 動詞の基本形
# Problem 32: write the base (dictionary) form of every verb, one per line.
with open(f"{dirpath}/32_output.txt", "w") as f:
    for sentence in sentences:
        verbs = [token for token in sentence if token["pos"] == "動詞"]
        for verb in verbs:
            f.write(f"{verb['base']}\n")
33. 「AのB」
# Problem 33: collect every "A の B" phrase, i.e. the particle "の" with a
# noun immediately before it and a noun immediately after it.
result = []
for sentence in sentences:
    # Slide a three-token window over the sentence. zip() stops at the
    # shortest input, so every window has a real left and right neighbour.
    # Indexing sentence[i - 1] at i == 0 would instead wrap around to the
    # last token of the sentence and could yield a bogus match.
    for left, middle, right in zip(sentence, sentence[1:], sentence[2:]):
        if middle["surface"] != "の" or middle["pos"] != "助詞":
            continue
        if left["pos"] == "名詞" and right["pos"] == "名詞":
            result.append(
                f"{left['surface']}{middle['surface']}{right['surface']}"
            )

# One phrase per line.
with open(f"{dirpath}/33_output.txt", "w") as f:
    for line in result:
        f.write(f"{line}\n")
34. 名詞の連接
# Problem 34: collect every run of two or more consecutive nouns, joined into
# a single string.
result = []
for sentence in sentences:
    run = []  # surfaces of the noun run currently being built
    for line in sentence:
        if line["pos"] == "名詞":
            run.append(line["surface"])
            continue
        # A non-noun ends the current run; keep it only if it has 2+ nouns.
        if len(run) >= 2:
            result.append("".join(run))
        run = []
    # Flush at the sentence boundary: a run that reaches the end of the
    # sentence must be emitted here and must not leak into the next sentence.
    if len(run) >= 2:
        result.append("".join(run))

# One noun run per line.
with open(f"{dirpath}/34_output.txt", "w") as f:
    for line in result:
        f.write(f"{line}\n")
35. 単語の出現頻度
from collections import Counter
# Problem 35: count how often each noun surface form occurs and print the
# resulting Counter.
words = [
    token["surface"]
    for sentence in sentences
    for token in sentence
    if token["pos"] == "名詞"
]
counter = Counter()
counter.update(words)
print(counter)
36. 頻度上位10語
import matplotlib.pyplot as plt
import japanize_matplotlib
# Problem 36: bar chart of the ten most frequent noun surface forms.
words = [
    token["surface"]
    for sentence in sentences
    for token in sentence
    if token["pos"] == "名詞"
]
counter = Counter(words)
result = counter.most_common()

# Split the ten most frequent (word, count) pairs into labels and heights.
top_ten = result[:10]
x = [word for word, _ in top_ten]
y = [frequency for _, frequency in top_ten]

plt.bar(x, y)
plt.show()
37. 「猫」と共起頻度の高い上位10語
from collections import defaultdict
# Problem 37: bar chart of the ten nouns that most often appear in sentences
# containing the token "猫" (the token "猫" itself is not counted).
dd = defaultdict(int)
for sentence in sentences:
    # Only sentences that contain "猫" contribute to the counts.
    if not any(token["surface"] == "猫" for token in sentence):
        continue
    for token in sentence:
        if token["pos"] != "名詞" or token["surface"] == "猫":
            continue
        dd[token["surface"]] += 1

counter = Counter(dd)
result = counter.most_common()

# Split the ten most frequent (word, count) pairs into labels and heights.
top_ten = result[:10]
x = [word for word, _ in top_ten]
y = [frequency for _, frequency in top_ten]

plt.bar(x, y)
plt.show()
38. ヒストグラム
# Problem 38: histogram of noun frequencies (x: occurrence count, y: number
# of distinct nouns with that count). Only counts in the range 1-20 are drawn.
_result = [
    line["surface"]
    for sentence in sentences
    for line in sentence
    if line["pos"] == "名詞"
]
counter = Counter(_result)
result = [count for _, count in counter.most_common()]

plt.hist(result, bins=20, range=(1, 20))
# Render the figure explicitly, as the other plotting sections do, so it is
# also shown outside a notebook and later plots start from a fresh figure.
plt.show()
39. Zipfの法則
import numpy as np
# Problem 39: Zipf's law. Scatter plot of log(frequency rank) against
# log(frequency) for noun surface forms.
_result = [
    line["surface"]
    for sentence in sentences
    for line in sentence
    if line["pos"] == "名詞"
]
counter = Counter(_result)
result = counter.most_common()

# most_common() is sorted by descending frequency, so position + 1 is the rank.
x = [np.log(rank) for rank in range(1, len(result) + 1)]
y = [np.log(frequency) for _, frequency in result]

plt.scatter(x, y)
# Render the figure explicitly, as the other plotting sections do, so it is
# also shown when the code runs outside a notebook.
plt.show()