0
0

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

言語処理100本ノック 第4章 解いてみた

Posted at

はじめに

  • matplotlibを使用しましたが、こんな簡単にグラフ描画できるんだと感動しました
  • No.30の問題を読み間違えたまま進み、No.37でつまずきました
  • No.30〜No.36の解き直しが発生しましたが、幸い各2行程度の修正で済みました
  • 問題文はしっかり読むようにします(当たり前ですが)
  • 引き続き頑張ります

前準備

# MeCab-related packages
! apt install mecab libmecab-dev mecab-ipadic-utf8 
! pip install mecab-python3
! pip install unidic-lite

# Create neko.txt.mecab (MeCab analysis of neko.txt)
! mecab -o ./neko.txt.mecab ./neko.txt

# Lets matplotlib display Japanese text
! pip install japanize-matplotlib

30. 形態素解析結果の読み込み

# Problem 30: load the MeCab analysis result into `sentences`, a list of
# sentences where each sentence is a list of morpheme dicts with the keys
# "surface", "base", "pos" and "pos1".
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (the setup cell installs
# mecab-ipadic-utf8) - confirm if a different dictionary is used.
with open(f"{dirpath}/neko.txt.mecab", "r", encoding="utf-8") as f:
  # Each sentence in the file ends with a line that contains only "EOS".
  lines = f.read().split("EOS\n")

# One block per non-empty chunk, split into its raw lines.
blocks = [line.split("\n") for line in lines if line]

sentences = []
for block in blocks:
  sentence = []
  for data in block:
    # A morpheme line has the form "<surface>\t<comma-separated features>".
    # Lines without a tab (blank lines, or a final "EOS" that is not
    # followed by a newline) carry no morpheme and are skipped instead of
    # raising IndexError.
    surface, tab, feature = data.partition("\t")
    if not tab or surface == "":
      continue
    others = feature.split(",")
    sentence.append({
        "surface": surface,
        "base": others[6],
        "pos": others[0],
        "pos1": others[1],
    })
  # Sentences that ended up without any morpheme are dropped.
  if sentence:
    sentences.append(sentence)

# Dump one sentence (the repr of its list of dicts) per line.
with open(f"{dirpath}/30_output.txt", "w", encoding="utf-8") as f:
  for sentence in sentences:
    f.write(f"{sentence}\n")

31. 動詞

# Problem 31: write the surface form of every verb (pos == "動詞"), one per
# line. The encoding is explicit so the Japanese output does not depend on
# the platform's default locale encoding.
with open(f"{dirpath}/31_output.txt", "w", encoding="utf-8") as f:
  for sentence in sentences:
    f.writelines(
        f"{line['surface']}\n" for line in sentence if line["pos"] == "動詞"
    )

32. 動詞の基本形

# Problem 32: write the base form of every verb (pos == "動詞"), one per
# line. The encoding is explicit so the Japanese output does not depend on
# the platform's default locale encoding.
with open(f"{dirpath}/32_output.txt", "w", encoding="utf-8") as f:
  for sentence in sentences:
    f.writelines(
        f"{line['base']}\n" for line in sentence if line["pos"] == "動詞"
    )

33. 「AのB」

# Problem 33: collect every noun phrase of the form "A の B", i.e. two nouns
# joined by the particle "の".
result = []
for sentence in sentences:
  # Only interior positions have both a left and a right neighbour.
  # Starting at index 1 also avoids sentence[-1] silently wrapping around
  # to the last token when the particle is the first token of a sentence.
  for i in range(1, len(sentence) - 1):
    line = sentence[i]
    if line["surface"] != "の" or line["pos"] != "助詞":
      continue
    prev_token = sentence[i - 1]
    next_token = sentence[i + 1]
    if prev_token["pos"] == "名詞" and next_token["pos"] == "名詞":
      result.append(
          f"{prev_token['surface']}{line['surface']}{next_token['surface']}"
      )

# One phrase per line; explicit encoding keeps the output locale-independent.
with open(f"{dirpath}/33_output.txt", "w", encoding="utf-8") as f:
  for line in result:
    f.write(f"{line}\n")

34. 名詞の連接

# Problem 34: extract every maximal run of two or more consecutive nouns
# and store the concatenated surface forms.
result = []
for sentence in sentences:
  # Surfaces of the noun run currently being collected. It is reset for
  # every sentence so a run never spans a sentence boundary.
  run = []
  for line in sentence:
    if line["pos"] == "名詞":
      run.append(line["surface"])
      continue
    # A non-noun ends the current run; keep it only if it has 2+ nouns.
    if len(run) >= 2:
      result.append("".join(run))
    run = []
  # Flush a run that reaches the end of the sentence; without this, a
  # sentence-final noun run would be lost or merged into the next sentence.
  if len(run) >= 2:
    result.append("".join(run))

# One noun run per line; explicit encoding keeps the output
# locale-independent.
with open(f"{dirpath}/34_output.txt", "w", encoding="utf-8") as f:
  for line in result:
    f.write(f"{line}\n")

35. 単語の出現頻度

from collections import Counter

# Problem 35: count how often each noun surface form occurs and print the
# resulting Counter.
words = [
    data["surface"]
    for sentence in sentences
    for data in sentence
    if data["pos"] == "名詞"
]
counter = Counter(words)

print(counter)

36. 頻度上位10語

import matplotlib.pyplot as plt
import japanize_matplotlib

# Problem 36: bar chart of the ten most frequent noun surface forms.
words = [
    token["surface"]
    for tokens in sentences
    for token in tokens
    if token["pos"] == "名詞"
]

counter = Counter(words)
result = counter.most_common()

# Split the ten most frequent (word, count) pairs into bar labels and
# bar heights.
top10 = result[:10]
x = [word for word, _ in top10]
y = [count for _, count in top10]

plt.bar(x, y)
plt.show()

37. 「猫」と共起頻度の高い上位10語

from collections import defaultdict
# Problem 37: bar chart of the ten nouns that most often appear in the same
# sentence as "猫" (cat).
dd = defaultdict(int)
for sentence in sentences:
  # Only sentences that actually contain "猫" contribute co-occurrences.
  if not any(line["surface"] == "猫" for line in sentence):
    continue
  for data in sentence:
    # Count the other nouns of the sentence; "猫" itself is excluded.
    if data["pos"] == "名詞" and data["surface"] != "猫":
      dd[data["surface"]] += 1

counter = Counter(dd)
result = counter.most_common()

# Split the ten most frequent (word, count) pairs into bar labels and
# bar heights.
top10 = result[:10]
x = [word for word, _ in top10]
y = [count for _, count in top10]

plt.bar(x, y)
plt.show()

38. ヒストグラム

# Problem 38: histogram of noun frequencies (how many distinct nouns occur
# a given number of times).
_result = [
    token["surface"]
    for tokens in sentences
    for token in tokens
    if token["pos"] == "名詞"
]

counter = Counter(_result)
# Occurrence counts in descending order, the same sequence of counts that
# most_common() yields.
result = sorted(counter.values(), reverse=True)
plt.hist(result, bins=20, range=(1,20))

39. Zipfの法則

import numpy as np

# Problem 39: Zipf's law - scatter plot of log(frequency rank) against
# log(frequency) for noun surface forms.
_result = []
for tokens in sentences:
  _result.extend(token["surface"] for token in tokens if token["pos"] == "名詞")

counter = Counter(_result)
result = counter.most_common()

# Ranks start at 1 for the most frequent word.
x = [np.log(rank) for rank, _ in enumerate(result, start=1)]
y = [np.log(freq) for _, freq in result]

plt.scatter(x, y)
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?