形態素解析によるアンケートの結果分類

Posted at 2024-11-11

アンケートをざっくりと分類する

PythonでMeCabライブラリを用いて形態素解析を実施する際の注意事項

１．MeCabライブラリのインストール

２．フォルダ C:\Users\ユーザー名\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\lib\site-packages にある『libmecab.dll』を、
C:\Users\ユーザー名\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\MeCab フォルダにコピーする。

３．辞書をインストールする

pip install unidic-lite

４．プロットの日本語表示用ライブラリをインストールする

pip install japanize_matplotlib

サンプル：安全に関するアンケート

safety_comment.txt

安全教育の充実が必要だと感じています。定期的な訓練が重要です。
作業現場での安全確認が不十分だと思います。チェックリストの導入を提案します。
安全意識の向上が課題です。全社的な取り組みが必要だと考えます。
設備の定期点検が重要です。事故防止のために徹底すべきです。
コミュニケーションの改善が安全につながると思います。情報共有を促進すべきです。

MeCabの使用例

mecab_samp.py

import MeCab
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def read_survey_file(file_path):
    """Read a survey file and return its non-empty responses.

    Each line of the file is treated as one free-text response.

    Args:
        file_path: Path to a UTF-8 encoded text file. A leading byte-order
            mark (BOM) is accepted and discarded.

    Returns:
        A list of response strings with surrounding whitespace removed.
        Lines that are empty after stripping are skipped.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 too, but also drops the BOM that Windows
    # editors often prepend; with plain 'utf-8' the BOM would stay glued to
    # the first response because str.strip() does not treat it as whitespace.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines carry no answer; keeping them would add empty
        # documents to whatever consumes this list.
        return [line for line in stripped_lines if line]

# Path of the survey file to analyse (change this to your actual file path).
file_path = 'safety_comment.txt'

try:
    # Load the responses; every later step of the script needs this list.
    safety_responses = read_survey_file(file_path)
except FileNotFoundError:
    print(f"エラー: ファイル '{file_path}' が見つかりません。")
    # Stop here: without the data the script would only fail later with a
    # NameError on safety_responses, hiding the real cause.
    raise SystemExit(1)
except (OSError, UnicodeError) as e:
    # Other read problems (permissions, wrong encoding, ...).
    print(f"エラーが発生しました: {e}")
    raise SystemExit(1)
else:
    # Show a short summary of what was loaded.
    print(f"読み込んだ回答の数: {len(safety_responses)}")
    print("最初の5つの回答:")
    for response in safety_responses[:5]:
        print(response)


# Lazily created MeCab tagger shared by every tokenize() call.
_tagger = None

# Parts of speech (first field of the MeCab feature string) kept as tokens:
# nouns, verbs and adjectives.
_CONTENT_POS = frozenset(('名詞', '動詞', '形容詞'))


def _get_tagger():
    """Return the shared MeCab tagger, creating it on first use."""
    global _tagger
    if _tagger is None:
        _tagger = MeCab.Tagger()
        # Kept from the original code: one empty parse before parseToNode().
        # NOTE(review): presumably the usual workaround for empty
        # node.surface values in some MeCab bindings - confirm it is still
        # needed with the installed version.
        _tagger.parse('')
    return _tagger


def tokenize(text):
    """Split Japanese text into content words using MeCab.

    Args:
        text: The text to analyse.

    Returns:
        A list with the surface form of every node whose first feature
        field is a noun, verb or adjective, in order of appearance.
    """
    words = []
    # The tagger is built once and reused; constructing MeCab.Tagger() on
    # every call repeats its setup for each document being vectorized.
    node = _get_tagger().parseToNode(text)
    while node:
        pos = node.feature.split(',')[0]
        if pos in _CONTENT_POS:
            words.append(node.surface)
        node = node.next
    return words



# Build the TF-IDF matrix of the responses using the MeCab tokenizer.
# token_pattern=None: the default regular expression is not used once a
# custom tokenizer is supplied, and leaving it set makes scikit-learn emit
# a UserWarning about the unused parameter.
vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
X = vectorizer.fit_transform(safety_responses)

# Group the responses into at most 3 clusters. KMeans raises when asked for
# more clusters than there are samples, so cap the count for tiny inputs.
num_clusters = min(3, X.shape[0])
model = KMeans(n_clusters=num_clusters, random_state=42)
model.fit(X)

labels = model.labels_

# Report the cluster assigned to each response (responses numbered from 1).
for i, label in enumerate(labels):
    print(f"回答 {i+1}: クラスタ {label}")

# For every cluster, list the 5 terms with the largest centroid weight,
# strongest first.
feature_names = vectorizer.get_feature_names_out()
for i in range(num_clusters):
    top_indices = model.cluster_centers_[i].argsort()[-5:][::-1]
    top_words = [feature_names[j] for j in top_indices]
    print(f"クラスタ {i} の特徴語: {', '.join(top_words)}")



import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import japanize_matplotlib  # 日本語表示用

# Word vectors: one row per vocabulary word, one column per response.
# X already holds the fitted TF-IDF matrix, so transpose it instead of
# fitting the vectorizer on the same data a second time.
word_vectors = X.T.toarray()

# Summed TF-IDF weight of each word over all responses; used below both to
# pick the words to annotate and to rank the words inside each cluster.
word_scores = word_vectors.sum(axis=1)

# Project the word vectors onto 2 dimensions with PCA for plotting.
pca = PCA(n_components=2)
word_coords = pca.fit_transform(word_vectors)

# Cluster the words. The count is defined once (it is needed again for the
# report at the end) and capped at the vocabulary size, because KMeans
# raises when asked for more clusters than samples.
n_word_clusters = min(5, word_vectors.shape[0])
word_kmeans = KMeans(n_clusters=n_word_clusters, random_state=42)
word_clusters = word_kmeans.fit_predict(word_vectors)

# Scatter plot of the words, coloured by cluster.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(word_coords[:, 0], word_coords[:, 1],
                      c=word_clusters, cmap='Set3', alpha=0.6)

# Label only the 30 highest-scoring words to keep the plot readable.
important_indices = word_scores.argsort()[-30:]
for idx in important_indices:
    plt.annotate(feature_names[idx],
                 (word_coords[idx, 0], word_coords[idx, 1]),
                 xytext=(5, 5), textcoords='offset points',
                 fontsize=10)

plt.title('単語のクラスタリング可視化')
plt.colorbar(scatter, label='クラスター')
plt.xlabel('第1主成分')
plt.ylabel('第2主成分')
plt.tight_layout()
plt.show()

# Print the 5 highest-scoring words of each word cluster.
for cluster_id in range(n_word_clusters):
    member_indices = [i for i, assigned in enumerate(word_clusters)
                      if assigned == cluster_id]
    # Rank by index with the precomputed scores; looking each word up again
    # by name inside the sort key would rescan the vocabulary per word.
    ranked_indices = sorted(member_indices,
                            key=lambda i: word_scores[i],
                            reverse=True)[:5]
    top_words = [feature_names[i] for i in ranked_indices]
    print(f"\nクラスター{cluster_id}の特徴語:")
    print(", ".join(top_words))




# This figure is titled and labelled per respondent, so cluster the
# responses: word_vectors has one row per word, hence its transpose has one
# row per response. (Clustering word_vectors directly would draw one leaf
# per word under a "respondent ID" axis.)
response_vectors = word_vectors.T

# Standardize each word column across the responses.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(response_vectors)

# Hierarchical clustering with Ward linkage.
linkage_matrix = linkage(data_scaled, method='ward')

# Draw the dendrogram; leaves are numbered from 1, matching the
# "回答 N" numbering printed for the k-means result.
plt.figure(figsize=(10, 7))
dendrogram(linkage_matrix,
           labels=list(range(1, data_scaled.shape[0] + 1)))
plt.title('安全に関するアンケートのクラスター分析結果')
plt.xlabel('回答者ID')
plt.ylabel('距離')
plt.show()


# Keep the 50 words with the largest summed TF-IDF weight (or every word if
# the vocabulary is smaller), ordered from lowest to highest weight.
word_importance = word_vectors.sum(axis=1)
top_word_indices = np.argsort(word_importance)[-50:]
top_word_vectors = word_vectors[top_word_indices]
top_words = list(feature_names[top_word_indices])

# Standardize the selected word vectors column by column.
scaler = StandardScaler()
scaler.fit(top_word_vectors)
data_scaled = scaler.transform(top_word_vectors)

# Hierarchical clustering of the words with Ward linkage.
linkage_matrix = linkage(data_scaled, 'ward')

# Dendrogram with the words as leaf labels, rotated 90 degrees and drawn in
# a small font.
dendrogram_options = {
    'labels': top_words,
    'leaf_rotation': 90,
    'leaf_font_size': 8,
}
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix, **dendrogram_options)
plt.title('単語の階層的クラスタリング結果')
plt.xlabel('単語')
plt.ylabel('距離')
plt.tight_layout()  # let matplotlib adjust the layout automatically
plt.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up