More than 3 years have passed since last update.

手元のテキストデータでワードクラウド描こう（パート2）～TF-IDF編～

Last updated at 2021-11-22 / Posted at 2021-11-07

はじめに

前回、WordCloudの記事を書いたあと、「TF-IDF」という文書中に含まれる単語の重要度を評価する手法の存在を知りました。
ワードの出現頻度TFとワードのレア度IDFを掛け合わせたもので、出現頻度が高くかつレアなワードほど大きい値を示すもののようです。

これを知ると、出現頻度だけじゃなく、TF-IDFでもワードクラウドが描きたくなり、やってみたという記事です。

TF-IDFは、scikit-learnのTfidfVectorizerというクラスで計算できるのですが、対象品詞の設定や、計算したTF-IDFをワードクラウドに持ち込むことがうまくできなかったため、使用は断念しました。
　
以下のサイトに公開されているTF-IDFマニュアル計算コードに手を加えさせていただき、

ワードクラウド描画可能な形式にTF-IDF計算結果を加工、TF-IDFのワードクラウドの描画を可能に
対象品詞に基本形の動詞と形容詞を追加
nlplotというライブラリで、頻出ワードのグラフを追加

を実行できるようにしてみました。

実行条件など

・Google colabで実行
・青空文庫の「こころ」で実行
※手元データを読込んで実行する場合も記載していますので、簡単にできるはずです。

ライブラリのインストール

MeCabのインストール

# Install the MeCab Python bindings (version pinned to 0.996.5)
!pip install mecab-python3==0.996.5

日本語フォント

# Install a Japanese font package (fonts-ipafont-gothic)
!apt-get -y install fonts-ipafont-gothic

nlplot

# Install nlplot (run through the shell with "!", like the other install cells)
!pip install nlplot

モジュールの準備

ライブラリインポート

import MeCab as mc
import re           # 正規表現
import numpy as np

モジュールの準備

def strip_CRLF_from_Text(text):
    """
    Collapse line breaks and tabs so that the text becomes a single line.

    A newline whose neighbours on both sides are Japanese characters
    (hiragana, katakana, the long-vowel mark or CJK ideographs) is removed
    outright.  Every other newline, and every tab, is replaced by one
    ASCII space.

    Args:
        text: Raw text, typically the contents of a text file.

    Returns:
        The text with no newline or tab characters left in it.
    """
    japanese = r'[ぁ-んァ-ンー\u4e00-\u9FFF]'
    # Look-arounds keep the neighbouring characters out of the match.  A
    # pattern that consumes them swallows the whole following line, which
    # leaves the next newline without a left-hand neighbour to match, so
    # only every other line break would be removed.
    plaintext = re.sub('(?<=' + japanese + r')\n(?=' + japanese + ')', '', text)
    # Whatever newlines and tabs remain become spaces.
    plaintext = plaintext.replace('\n', ' ').replace('\t', ' ')
    return plaintext
 
 
def get_Text_from_Filenames(filenames):
    """
    Read every file in ``filenames`` and return the combined, cleaned text.

    The files are read as UTF-8 in the given order and concatenated with no
    separator; the result is then passed through ``strip_CRLF_from_Text``.

    Args:
        filenames: Iterable of paths to UTF-8 text files.

    Returns:
        A single string: the contents of all files after
        ``strip_CRLF_from_Text`` has been applied.
    """
    contents = []
    for filename in filenames:
        # The context manager closes the file even when reading fails.
        with open(filename, encoding='utf-8') as f:
            contents.append(f.read())

    # Join once instead of growing a string with ``+=``.
    return strip_CRLF_from_Text(''.join(contents))
 
def mecab_analysis(text):
    """
    Tokenise ``text`` with MeCab and return the words worth visualising.

    Two kinds of tokens are kept:

    * nouns (名詞) whose sub-category is 一般 -- the surface form is used;
    * verbs (動詞) and adjectives (形容詞) -- the entry at index 6 of the
      comma-separated feature string is used (the base form, according to
      the original author's note), unless it is one of the stop words.

    Args:
        text: The text to analyse.

    Returns:
        A list of words in order of appearance; duplicates are kept.
    """
    # Stop words were chosen by hand from earlier output.  They are only
    # checked against verbs and adjectives.  Built once, not once per token.
    stop_words = frozenset(["し","い","ある", "おる", "せる", "ない", "いる", "する", "の", "よう", "なる", "それ", "そこ", "これ", "こう", "ため", "そう", "れる", "られる"])

    tagger = mc.Tagger('-Ochasen')
    node = tagger.parseToNode(text)

    words = []
    while node:
        # Nodes with an empty surface (header and footer) are skipped.
        if node.surface != "":
            features = node.feature.split(",")
            word_type = features[0]

            if word_type == "名詞":
                if features[1] == '一般':
                    words.append(node.surface)
            # Remove this branch to extract nouns only.
            elif word_type in ('動詞', '形容詞') and features[6] not in stop_words:
                words.append(features[6])

        node = node.next
    return words
 
def get_DF_from_Filenames(filenames, vocab):
    """
    Compute the document frequency (DF) of every vocabulary word.

    DF is the number of documents in which a word occurs at least once.

    Args:
        filenames: Paths of the UTF-8 text files to scan.
        vocab: Sequence of words; row ``i`` of the result belongs to
            ``vocab[i]``.

    Returns:
        A float array of shape ``(len(vocab), 1)`` holding the DF values.

    Raises:
        ValueError: If a word extracted from a file is missing from
            ``vocab``.
    """
    # Build the word -> row table once; ``vocab.index`` rescans the whole
    # list on every call.  ``setdefault`` keeps the first position of a
    # repeated entry, which is what ``list.index`` returns.
    row_of = {}
    for row, word in enumerate(vocab):
        row_of.setdefault(word, row)

    # One row per vocabulary word.
    df = np.zeros((len(vocab), 1))

    for filename in filenames:
        # The context manager makes sure the file is closed.
        with open(filename, encoding='utf-8') as f:
            raw = f.read()
        words = mecab_analysis(strip_CRLF_from_Text(raw))
        # ``set`` removes repeats so each document counts a word once.
        for word in set(words):
            if word not in row_of:
                raise ValueError("{!r} is not in vocab".format(word))
            df[row_of[word], 0] += 1
    return df
 
def get_TF_from_Filenames(filenames, vocab):
    """
    Compute the term frequency (TF) of every vocabulary word in every file.

    TF is the number of times a word occurs in a document.

    Args:
        filenames: Sequence of paths of the UTF-8 text files to scan.
        vocab: Sequence of words; row ``i`` of the result belongs to
            ``vocab[i]``.

    Returns:
        A float array of shape ``(len(vocab), len(filenames))``; column
        ``j`` holds the counts for ``filenames[j]``.

    Raises:
        ValueError: If a word extracted from a file is missing from
            ``vocab``.
    """
    # Build the word -> row table once; ``vocab.index`` rescans the whole
    # list for every token.  ``setdefault`` keeps the first position of a
    # repeated entry, which is what ``list.index`` returns.
    row_of = {}
    for row, word in enumerate(vocab):
        row_of.setdefault(word, row)

    # rows = vocabulary size, columns = number of documents
    tf = np.zeros((len(vocab), len(filenames)))

    # ``enumerate`` gives each file its own column, even when the same path
    # is listed twice (``filenames.index`` always returns the first match).
    for col, filename in enumerate(filenames):
        # The context manager makes sure the file is closed.
        with open(filename, encoding='utf-8') as f:
            raw = f.read()
        for word in mecab_analysis(strip_CRLF_from_Text(raw)):
            if word not in row_of:
                raise ValueError("{!r} is not in vocab".format(word))
            tf[row_of[word], col] += 1
    return tf
 
def get_TFIDF_from_TF_DF(tf, df):
    """
    Weight term frequencies by document frequency.

    The weight is the plain reciprocal ``1 / df`` (no logarithm), so the
    result is ``tf / df`` with ``df`` broadcast against ``tf``.

    Args:
        tf: Array-like of term frequencies.
        df: Array-like of document frequencies, broadcastable against
            ``tf``.

    Returns:
        A float array with the broadcast shape of ``tf`` and ``df``.
        Entries where ``df`` is 0 are 0.
    """
    tf = np.asarray(tf, dtype=float)
    df = np.asarray(df, dtype=float)
    # A word that occurs in no document has df == 0.  Leave its score at 0
    # instead of letting the division produce inf or nan.
    tfidf = np.zeros(np.broadcast(tf, df).shape)
    np.divide(tf, df, out=tfidf, where=df != 0)
    return tfidf
 
def get_distance_matrix(tfidf):
    """
    Return the Euclidean distance between every pair of document columns.

    Args:
        tfidf: 2-D array-like with one row per word and one column per
            document.

    Returns:
        A float array of shape ``(n_docs, n_docs)``; entry ``[i, j]`` is
        the square root of the summed squared differences between columns
        ``i`` and ``j``.
    """
    tfidf = np.asarray(tfidf, dtype=float)
    n_docs = tfidf.shape[1]
    # documents x documents
    distance_matrix = np.zeros([n_docs, n_docs])

    for origin in range(n_docs):   # origin: the document compared against
        # Subtract the origin column from every column in one broadcast
        # step instead of looping over individual elements.
        squared_error = (tfidf - tfidf[:, [origin]]) ** 2
        # The column sums are computed once per origin document.
        distance_matrix[origin, :] = np.sqrt(squared_error.sum(axis=0))
    return distance_matrix

データ読込み

事前に解析を行う対象のテキストデータを用意する必要があります。ここでは、任意のテキストデータ（txtファイル）を読み込ませます。

※青空文庫のデータを読込む場合、以下を参照してください。「走れメロス」ならば、URL にある「テキストファイル(ルビあり)」（zipファイル）をダウンロード後、解凍し、hashire_merosu.txt を sample.txt の代わりに設定します。他の書籍でも要領は同じですが、textファイル名は個々に異なります。「こころ」の場合、URLからzipファイルをダウンロードし、解凍したデータは kokoro.txt です。
※textファイルは、文字コードを「UTF-8」としてください。
　

任意のテキストデータ（sample.txt）を読み込む場合

任意（sample.txt）を読込む場合

# Text files to analyse.  Uncomment or append entries to process several files.
filenames = [
    'sample.txt',
    # 'sample_2.txt',
    # 'sample_3.txt',
    # 'sample_4.txt',
    # 'sample_5.txt',
]

# Read the listed files into one string and show it.
text = get_Text_from_Filenames(filenames)
print(text)

Mecabを用いた形態素解析の実行

ここからは最初にインストールしたMeCabを利用して形態素解析を行っていきます。出力フォーマットにはいくつか種類がありますが、ここでは「chasen」としています。（ ※MeCabでは '-Ochasen' と記載して指定します。）

抽出対象とした品詞のカウント、TF-IDF計算を実行します。

MeCab実行

### Morphological analysis with MeCab: extract the target words from the text
words = mecab_analysis(text)
print(words)

ワード出現頻度カウント

from collections import Counter

# Tally every extracted word, then order the tallies from most to least frequent.
words_count = Counter(words)
result = words_count.most_common()

# Print one "word count" pair per line.
for entry in result:
    print(*entry)

ソート

# Remove duplicate words and put the vocabulary in sorted order.
vocab = list(set(words))
vocab.sort()
print(vocab)

DF,TF,TFIDF

# Document frequency: one row per vocabulary word.
df = get_DF_from_Filenames(filenames, vocab)
# Term frequency: one row per vocabulary word, one column per file.
tf = get_TF_from_Filenames(filenames, vocab)
# Combine the two into the TF-IDF matrix.
tfidf = get_TFIDF_from_TF_DF(tf, df)

TF-IDFを一次元リストに

# Nested-list form of the TF-IDF matrix (one inner list per matrix row).
tfidf_list = tfidf.tolist()
# Flatten row by row into a 1-D list.  ``ravel`` does this in linear time,
# whereas ``sum(list_of_lists, [])`` copies the growing result at every step.
# NOTE: with a single input file there is exactly one value per word.  With
# several files each word contributes one value per file, so the flat list no
# longer lines up one-to-one with the vocabulary.
tfidf_list2 = tfidf.ravel().tolist()

TF-IDFを辞書形式に

# Pair each vocabulary word with the value at the same position in the flat list.
tfidf_dict = dict(zip(vocab, tfidf_list2))
tfidf_dict

抽出語のカウント数グラフ化（nlplot）

抽出ワードをデータフレームに格納

# Put the extracted words into a one-column DataFrame named ``df``.
# NOTE: this rebinds ``df``, which until now held the document-frequency array.
import pandas as pd
df = pd.DataFrame(words, columns=['Word'])
df

nlplotグラフ①

# Bar chart of the 50 most frequent single words (uni-grams), drawn with nlplot.
import nlplot

npt = nlplot.NLPlot(df, target_col='Word')
# Optional: build a stop-word list (top_n for the most frequent words,
# min_freq for the least frequent ones) and pass it as ``stopwords=`` below.
# stopwords = npt.get_stopword(top_n=0, min_freq=0)

bar_options = dict(
    title='uni-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=1,
    top_n=50,
    # stopwords=stopwords,
)
npt.bar_ngram(**bar_options)

nlplotグラフ②

# Treemap of the 30 most common single words.
treemap_options = dict(
    title='Tree of Most Common Words',
    ngram=1,
    top_n=30,
    # stopwords=stopwords,
)
npt.treemap(**treemap_options)

ワードクラウド生成

ワードクラウド

# Convert the (word, count) pairs to a dict, the form passed to WordCloud.
dic_result = dict(result)

# Build the word-cloud image from the word counts.
from wordcloud import WordCloud

wordcloud = WordCloud(
    background_color='white',
    max_words=125,
    font_path='/usr/share/fonts/truetype/fonts-japanese-gothic.ttf',
    width=1000,
    height=600,
)
wordcloud = wordcloud.fit_words(dic_result)

# Show the generated image without axes.
import matplotlib.pyplot as plt
from matplotlib import rcParams

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

ワードクラウドを任意の画像の形に描画する場合

donutsデータダウンロード

# Download donuts.png, the image used as the word-cloud mask
import requests

url = "https://github.com/hima2b4/Word-Cloud/raw/main/donuts.png"

file_name = "donuts.png"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error; otherwise the error page would be saved as donuts.png.
response.raise_for_status()
image = response.content

with open(file_name, "wb") as f:
    f.write(image)

ワードクラウド（Donuts型）

# Imports for loading the mask image
from PIL import Image
import numpy as np

# Load the image as an array; the context manager closes the file afterwards.
with Image.open('donuts.png') as mask_image:
    custom_mask = np.array(mask_image)

# width/height are not passed: per the WordCloud documentation they are
# ignored when a mask is given and the mask's shape is used instead.
wordcloud = WordCloud(background_color='white',
                      max_words=125,
                      mask=custom_mask,
                      font_path='/usr/share/fonts/truetype/fonts-japanese-gothic.ttf',
                      ).fit_words(dic_result)

# Show the generated image without axes.
plt.figure(figsize=(15,10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

TF-IDFでワードクラウド描画

TF-IDFでワードクラウド

# Build the word-cloud image, sizing each word by its TF-IDF value.
wordcloud = WordCloud(
    background_color='white',
    max_words=125,
    font_path='/usr/share/fonts/truetype/fonts-japanese-gothic.ttf',
    width=1000,
    height=600,
)
wordcloud = wordcloud.generate_from_frequencies(tfidf_dict)

# Show the generated image without axes.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

最後に

ちょっと無駄なコードもあると思いますが、なんとかワードクラウドが描けるようになった。
しばらくはまりそう。

参考サイト

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up